diff --git a/404.html b/404.html
index 617d283d5a7..e9ed7434a81 100644
--- a/404.html
+++ b/404.html
@@ -5,13 +5,13 @@ Page Not Found | Cumulus Documentation
-
+
Skip to main content

Page Not Found

We could not find what you were looking for.

Please contact the owner of the site that linked you to the original URL and let them know their link is broken.

- + \ No newline at end of file diff --git a/assets/js/1a4e3797.b5f1ebc2.js b/assets/js/1a4e3797.b5f1ebc2.js deleted file mode 100644 index acff7817709..00000000000 --- a/assets/js/1a4e3797.b5f1ebc2.js +++ /dev/null @@ -1,2 +0,0 @@ -/*! For license information please see 1a4e3797.b5f1ebc2.js.LICENSE.txt */ -(self.webpackChunk_cumulus_website=self.webpackChunk_cumulus_website||[]).push([[97920],{17331:e=>{function t(){this._events=this._events||{},this._maxListeners=this._maxListeners||void 0}function r(e){return"function"==typeof e}function n(e){return"object"==typeof e&&null!==e}function i(e){return void 0===e}e.exports=t,t.prototype._events=void 0,t.prototype._maxListeners=void 0,t.defaultMaxListeners=10,t.prototype.setMaxListeners=function(e){if("number"!=typeof e||e<0||isNaN(e))throw TypeError("n must be a positive number");return this._maxListeners=e,this},t.prototype.emit=function(e){var t,a,s,c,u,o;if(this._events||(this._events={}),"error"===e&&(!this._events.error||n(this._events.error)&&!this._events.error.length)){if((t=arguments[1])instanceof Error)throw t;var h=new Error('Uncaught, unspecified "error" event. ('+t+")");throw h.context=t,h}if(i(a=this._events[e]))return!1;if(r(a))switch(arguments.length){case 1:a.call(this);break;case 2:a.call(this,arguments[1]);break;case 3:a.call(this,arguments[1],arguments[2]);break;default:c=Array.prototype.slice.call(arguments,1),a.apply(this,c)}else if(n(a))for(c=Array.prototype.slice.call(arguments,1),s=(o=a.slice()).length,u=0;u0&&this._events[e].length>s&&(this._events[e].warned=!0,console.error("(node) warning: possible EventEmitter memory leak detected. %d listeners added. Use emitter.setMaxListeners() to increase limit.",this._events[e].length),"function"==typeof console.trace&&console.trace()),this},t.prototype.on=t.prototype.addListener,t.prototype.once=function(e,t){if(!r(t))throw TypeError("listener must be a function");var n=!1;function i(){this.removeListener(e,i),n||(n=!0,t.apply(this,arguments))}return i.listener=t,this.on(e,i),this},t.prototype.removeListener=function(e,t){var i,a,s,c;if(!r(t))throw TypeError("listener must be a function");if(!this._events||!this._events[e])return this;if(s=(i=this._events[e]).length,a=-1,i===t||r(i.listener)&&i.listener===t)delete this._events[e],this._events.removeListener&&this.emit("removeListener",e,t);else if(n(i)){for(c=s;c-- >0;)if(i[c]===t||i[c].listener&&i[c].listener===t){a=c;break}if(a<0)return this;1===i.length?(i.length=0,delete this._events[e]):i.splice(a,1),this._events.removeListener&&this.emit("removeListener",e,t)}return this},t.prototype.removeAllListeners=function(e){var t,n;if(!this._events)return this;if(!this._events.removeListener)return 0===arguments.length?this._events={}:this._events[e]&&delete this._events[e],this;if(0===arguments.length){for(t in this._events)"removeListener"!==t&&this.removeAllListeners(t);return this.removeAllListeners("removeListener"),this._events={},this}if(r(n=this._events[e]))this.removeListener(e,n);else if(n)for(;n.length;)this.removeListener(e,n[n.length-1]);return delete this._events[e],this},t.prototype.listeners=function(e){return this._events&&this._events[e]?r(this._events[e])?[this._events[e]]:this._events[e].slice():[]},t.prototype.listenerCount=function(e){if(this._events){var t=this._events[e];if(r(t))return 1;if(t)return t.length}return 0},t.listenerCount=function(e,t){return e.listenerCount(t)}},8131:(e,t,r)=>{"use strict";var n=r(49374),i=r(17775),a=r(23076);function s(e,t,r){return new 
n(e,t,r)}s.version=r(24336),s.AlgoliaSearchHelper=n,s.SearchParameters=i,s.SearchResults=a,e.exports=s},68078:(e,t,r)=>{"use strict";var n=r(17331);function i(e,t){this.main=e,this.fn=t,this.lastResults=null}r(14853)(i,n),i.prototype.detach=function(){this.removeAllListeners(),this.main.detachDerivedHelper(this)},i.prototype.getModifiedState=function(e){return this.fn(e)},e.exports=i},82437:(e,t,r)=>{"use strict";var n=r(52344),i=r(49803),a=r(90116),s={addRefinement:function(e,t,r){if(s.isRefined(e,t,r))return e;var i=""+r,a=e[t]?e[t].concat(i):[i],c={};return c[t]=a,n({},c,e)},removeRefinement:function(e,t,r){if(void 0===r)return s.clearRefinement(e,(function(e,r){return t===r}));var n=""+r;return s.clearRefinement(e,(function(e,r){return t===r&&n===e}))},toggleRefinement:function(e,t,r){if(void 0===r)throw new Error("toggleRefinement should be used with a value");return s.isRefined(e,t,r)?s.removeRefinement(e,t,r):s.addRefinement(e,t,r)},clearRefinement:function(e,t,r){if(void 0===t)return a(e)?{}:e;if("string"==typeof t)return i(e,[t]);if("function"==typeof t){var n=!1,s=Object.keys(e).reduce((function(i,a){var s=e[a]||[],c=s.filter((function(e){return!t(e,a,r)}));return c.length!==s.length&&(n=!0),i[a]=c,i}),{});return n?s:e}},isRefined:function(e,t,r){var n=!!e[t]&&e[t].length>0;if(void 0===r||!n)return n;var i=""+r;return-1!==e[t].indexOf(i)}};e.exports=s},17775:(e,t,r)=>{"use strict";var n=r(60185),i=r(52344),a=r(22686),s=r(7888),c=r(28023),u=r(49803),o=r(90116),h=r(46801),f=r(82437);function l(e,t){return Array.isArray(e)&&Array.isArray(t)?e.length===t.length&&e.every((function(e,r){return l(t[r],e)})):e===t}function m(e){var t=e?m._parseNumbers(e):{};void 0===t.userToken||h(t.userToken)||console.warn("[algoliasearch-helper] The `userToken` parameter is invalid. 
This can lead to wrong analytics.\n - Format: [a-zA-Z0-9_-]{1,64}"),this.facets=t.facets||[],this.disjunctiveFacets=t.disjunctiveFacets||[],this.hierarchicalFacets=t.hierarchicalFacets||[],this.facetsRefinements=t.facetsRefinements||{},this.facetsExcludes=t.facetsExcludes||{},this.disjunctiveFacetsRefinements=t.disjunctiveFacetsRefinements||{},this.numericRefinements=t.numericRefinements||{},this.tagRefinements=t.tagRefinements||[],this.hierarchicalFacetsRefinements=t.hierarchicalFacetsRefinements||{};var r=this;Object.keys(t).forEach((function(e){var n=-1!==m.PARAMETERS.indexOf(e),i=void 0!==t[e];!n&&i&&(r[e]=t[e])}))}m.PARAMETERS=Object.keys(new m),m._parseNumbers=function(e){if(e instanceof m)return e;var t={};if(["aroundPrecision","aroundRadius","getRankingInfo","minWordSizefor2Typos","minWordSizefor1Typo","page","maxValuesPerFacet","distinct","minimumAroundRadius","hitsPerPage","minProximity"].forEach((function(r){var n=e[r];if("string"==typeof n){var i=parseFloat(n);t[r]=isNaN(i)?n:i}})),Array.isArray(e.insideBoundingBox)&&(t.insideBoundingBox=e.insideBoundingBox.map((function(e){return Array.isArray(e)?e.map((function(e){return parseFloat(e)})):e}))),e.numericRefinements){var r={};Object.keys(e.numericRefinements).forEach((function(t){var n=e.numericRefinements[t]||{};r[t]={},Object.keys(n).forEach((function(e){var i=n[e].map((function(e){return Array.isArray(e)?e.map((function(e){return"string"==typeof e?parseFloat(e):e})):"string"==typeof e?parseFloat(e):e}));r[t][e]=i}))})),t.numericRefinements=r}return n({},e,t)},m.make=function(e){var t=new m(e);return(e.hierarchicalFacets||[]).forEach((function(e){if(e.rootPath){var r=t.getHierarchicalRefinement(e.name);r.length>0&&0!==r[0].indexOf(e.rootPath)&&(t=t.clearRefinements(e.name)),0===(r=t.getHierarchicalRefinement(e.name)).length&&(t=t.toggleHierarchicalFacetRefinement(e.name,e.rootPath))}})),t},m.validate=function(e,t){var r=t||{};return e.tagFilters&&r.tagRefinements&&r.tagRefinements.length>0?new Error("[Tags] Cannot switch from the managed tag API to the advanced API. It is probably an error, if it is really what you want, you should first clear the tags with clearTags method."):e.tagRefinements.length>0&&r.tagFilters?new Error("[Tags] Cannot switch from the advanced tag API to the managed API. It is probably an error, if it is not, you should first clear the tags with clearTags method."):e.numericFilters&&r.numericRefinements&&o(r.numericRefinements)?new Error("[Numeric filters] Can't switch from the advanced to the managed API. It is probably an error, if this is really what you want, you have to first clear the numeric filters."):o(e.numericRefinements)&&r.numericFilters?new Error("[Numeric filters] Can't switch from the managed API to the advanced. 
It is probably an error, if this is really what you want, you have to first clear the numeric filters."):null},m.prototype={constructor:m,clearRefinements:function(e){var t={numericRefinements:this._clearNumericRefinements(e),facetsRefinements:f.clearRefinement(this.facetsRefinements,e,"conjunctiveFacet"),facetsExcludes:f.clearRefinement(this.facetsExcludes,e,"exclude"),disjunctiveFacetsRefinements:f.clearRefinement(this.disjunctiveFacetsRefinements,e,"disjunctiveFacet"),hierarchicalFacetsRefinements:f.clearRefinement(this.hierarchicalFacetsRefinements,e,"hierarchicalFacet")};return t.numericRefinements===this.numericRefinements&&t.facetsRefinements===this.facetsRefinements&&t.facetsExcludes===this.facetsExcludes&&t.disjunctiveFacetsRefinements===this.disjunctiveFacetsRefinements&&t.hierarchicalFacetsRefinements===this.hierarchicalFacetsRefinements?this:this.setQueryParameters(t)},clearTags:function(){return void 0===this.tagFilters&&0===this.tagRefinements.length?this:this.setQueryParameters({tagFilters:void 0,tagRefinements:[]})},setIndex:function(e){return e===this.index?this:this.setQueryParameters({index:e})},setQuery:function(e){return e===this.query?this:this.setQueryParameters({query:e})},setPage:function(e){return e===this.page?this:this.setQueryParameters({page:e})},setFacets:function(e){return this.setQueryParameters({facets:e})},setDisjunctiveFacets:function(e){return this.setQueryParameters({disjunctiveFacets:e})},setHitsPerPage:function(e){return this.hitsPerPage===e?this:this.setQueryParameters({hitsPerPage:e})},setTypoTolerance:function(e){return this.typoTolerance===e?this:this.setQueryParameters({typoTolerance:e})},addNumericRefinement:function(e,t,r){var i=c(r);if(this.isNumericRefined(e,t,i))return this;var a=n({},this.numericRefinements);return a[e]=n({},a[e]),a[e][t]?(a[e][t]=a[e][t].slice(),a[e][t].push(i)):a[e][t]=[i],this.setQueryParameters({numericRefinements:a})},getConjunctiveRefinements:function(e){return this.isConjunctiveFacet(e)&&this.facetsRefinements[e]||[]},getDisjunctiveRefinements:function(e){return this.isDisjunctiveFacet(e)&&this.disjunctiveFacetsRefinements[e]||[]},getHierarchicalRefinement:function(e){return this.hierarchicalFacetsRefinements[e]||[]},getExcludeRefinements:function(e){return this.isConjunctiveFacet(e)&&this.facetsExcludes[e]||[]},removeNumericRefinement:function(e,t,r){return void 0!==r?this.isNumericRefined(e,t,r)?this.setQueryParameters({numericRefinements:this._clearNumericRefinements((function(n,i){return i===e&&n.op===t&&l(n.val,c(r))}))}):this:void 0!==t?this.isNumericRefined(e,t)?this.setQueryParameters({numericRefinements:this._clearNumericRefinements((function(r,n){return n===e&&r.op===t}))}):this:this.isNumericRefined(e)?this.setQueryParameters({numericRefinements:this._clearNumericRefinements((function(t,r){return r===e}))}):this},getNumericRefinements:function(e){return this.numericRefinements[e]||{}},getNumericRefinement:function(e,t){return this.numericRefinements[e]&&this.numericRefinements[e][t]},_clearNumericRefinements:function(e){if(void 0===e)return o(this.numericRefinements)?{}:this.numericRefinements;if("string"==typeof e)return u(this.numericRefinements,[e]);if("function"==typeof e){var t=!1,r=this.numericRefinements,n=Object.keys(r).reduce((function(n,i){var a=r[i],s={};return a=a||{},Object.keys(a).forEach((function(r){var n=a[r]||[],c=[];n.forEach((function(t){e({val:t,op:r},i,"numeric")||c.push(t)})),c.length!==n.length&&(t=!0),s[r]=c})),n[i]=s,n}),{});return 
t?n:this.numericRefinements}},addFacet:function(e){return this.isConjunctiveFacet(e)?this:this.setQueryParameters({facets:this.facets.concat([e])})},addDisjunctiveFacet:function(e){return this.isDisjunctiveFacet(e)?this:this.setQueryParameters({disjunctiveFacets:this.disjunctiveFacets.concat([e])})},addHierarchicalFacet:function(e){if(this.isHierarchicalFacet(e.name))throw new Error("Cannot declare two hierarchical facets with the same name: `"+e.name+"`");return this.setQueryParameters({hierarchicalFacets:this.hierarchicalFacets.concat([e])})},addFacetRefinement:function(e,t){if(!this.isConjunctiveFacet(e))throw new Error(e+" is not defined in the facets attribute of the helper configuration");return f.isRefined(this.facetsRefinements,e,t)?this:this.setQueryParameters({facetsRefinements:f.addRefinement(this.facetsRefinements,e,t)})},addExcludeRefinement:function(e,t){if(!this.isConjunctiveFacet(e))throw new Error(e+" is not defined in the facets attribute of the helper configuration");return f.isRefined(this.facetsExcludes,e,t)?this:this.setQueryParameters({facetsExcludes:f.addRefinement(this.facetsExcludes,e,t)})},addDisjunctiveFacetRefinement:function(e,t){if(!this.isDisjunctiveFacet(e))throw new Error(e+" is not defined in the disjunctiveFacets attribute of the helper configuration");return f.isRefined(this.disjunctiveFacetsRefinements,e,t)?this:this.setQueryParameters({disjunctiveFacetsRefinements:f.addRefinement(this.disjunctiveFacetsRefinements,e,t)})},addTagRefinement:function(e){if(this.isTagRefined(e))return this;var t={tagRefinements:this.tagRefinements.concat(e)};return this.setQueryParameters(t)},removeFacet:function(e){return this.isConjunctiveFacet(e)?this.clearRefinements(e).setQueryParameters({facets:this.facets.filter((function(t){return t!==e}))}):this},removeDisjunctiveFacet:function(e){return this.isDisjunctiveFacet(e)?this.clearRefinements(e).setQueryParameters({disjunctiveFacets:this.disjunctiveFacets.filter((function(t){return t!==e}))}):this},removeHierarchicalFacet:function(e){return this.isHierarchicalFacet(e)?this.clearRefinements(e).setQueryParameters({hierarchicalFacets:this.hierarchicalFacets.filter((function(t){return t.name!==e}))}):this},removeFacetRefinement:function(e,t){if(!this.isConjunctiveFacet(e))throw new Error(e+" is not defined in the facets attribute of the helper configuration");return f.isRefined(this.facetsRefinements,e,t)?this.setQueryParameters({facetsRefinements:f.removeRefinement(this.facetsRefinements,e,t)}):this},removeExcludeRefinement:function(e,t){if(!this.isConjunctiveFacet(e))throw new Error(e+" is not defined in the facets attribute of the helper configuration");return f.isRefined(this.facetsExcludes,e,t)?this.setQueryParameters({facetsExcludes:f.removeRefinement(this.facetsExcludes,e,t)}):this},removeDisjunctiveFacetRefinement:function(e,t){if(!this.isDisjunctiveFacet(e))throw new Error(e+" is not defined in the disjunctiveFacets attribute of the helper configuration");return f.isRefined(this.disjunctiveFacetsRefinements,e,t)?this.setQueryParameters({disjunctiveFacetsRefinements:f.removeRefinement(this.disjunctiveFacetsRefinements,e,t)}):this},removeTagRefinement:function(e){if(!this.isTagRefined(e))return this;var t={tagRefinements:this.tagRefinements.filter((function(t){return t!==e}))};return this.setQueryParameters(t)},toggleRefinement:function(e,t){return this.toggleFacetRefinement(e,t)},toggleFacetRefinement:function(e,t){if(this.isHierarchicalFacet(e))return 
this.toggleHierarchicalFacetRefinement(e,t);if(this.isConjunctiveFacet(e))return this.toggleConjunctiveFacetRefinement(e,t);if(this.isDisjunctiveFacet(e))return this.toggleDisjunctiveFacetRefinement(e,t);throw new Error("Cannot refine the undeclared facet "+e+"; it should be added to the helper options facets, disjunctiveFacets or hierarchicalFacets")},toggleConjunctiveFacetRefinement:function(e,t){if(!this.isConjunctiveFacet(e))throw new Error(e+" is not defined in the facets attribute of the helper configuration");return this.setQueryParameters({facetsRefinements:f.toggleRefinement(this.facetsRefinements,e,t)})},toggleExcludeFacetRefinement:function(e,t){if(!this.isConjunctiveFacet(e))throw new Error(e+" is not defined in the facets attribute of the helper configuration");return this.setQueryParameters({facetsExcludes:f.toggleRefinement(this.facetsExcludes,e,t)})},toggleDisjunctiveFacetRefinement:function(e,t){if(!this.isDisjunctiveFacet(e))throw new Error(e+" is not defined in the disjunctiveFacets attribute of the helper configuration");return this.setQueryParameters({disjunctiveFacetsRefinements:f.toggleRefinement(this.disjunctiveFacetsRefinements,e,t)})},toggleHierarchicalFacetRefinement:function(e,t){if(!this.isHierarchicalFacet(e))throw new Error(e+" is not defined in the hierarchicalFacets attribute of the helper configuration");var r=this._getHierarchicalFacetSeparator(this.getHierarchicalFacetByName(e)),n={};return void 0!==this.hierarchicalFacetsRefinements[e]&&this.hierarchicalFacetsRefinements[e].length>0&&(this.hierarchicalFacetsRefinements[e][0]===t||0===this.hierarchicalFacetsRefinements[e][0].indexOf(t+r))?-1===t.indexOf(r)?n[e]=[]:n[e]=[t.slice(0,t.lastIndexOf(r))]:n[e]=[t],this.setQueryParameters({hierarchicalFacetsRefinements:i({},n,this.hierarchicalFacetsRefinements)})},addHierarchicalFacetRefinement:function(e,t){if(this.isHierarchicalFacetRefined(e))throw new Error(e+" is already refined.");if(!this.isHierarchicalFacet(e))throw new Error(e+" is not defined in the hierarchicalFacets attribute of the helper configuration.");var r={};return r[e]=[t],this.setQueryParameters({hierarchicalFacetsRefinements:i({},r,this.hierarchicalFacetsRefinements)})},removeHierarchicalFacetRefinement:function(e){if(!this.isHierarchicalFacetRefined(e))return this;var t={};return t[e]=[],this.setQueryParameters({hierarchicalFacetsRefinements:i({},t,this.hierarchicalFacetsRefinements)})},toggleTagRefinement:function(e){return this.isTagRefined(e)?this.removeTagRefinement(e):this.addTagRefinement(e)},isDisjunctiveFacet:function(e){return this.disjunctiveFacets.indexOf(e)>-1},isHierarchicalFacet:function(e){return void 0!==this.getHierarchicalFacetByName(e)},isConjunctiveFacet:function(e){return this.facets.indexOf(e)>-1},isFacetRefined:function(e,t){return!!this.isConjunctiveFacet(e)&&f.isRefined(this.facetsRefinements,e,t)},isExcludeRefined:function(e,t){return!!this.isConjunctiveFacet(e)&&f.isRefined(this.facetsExcludes,e,t)},isDisjunctiveFacetRefined:function(e,t){return!!this.isDisjunctiveFacet(e)&&f.isRefined(this.disjunctiveFacetsRefinements,e,t)},isHierarchicalFacetRefined:function(e,t){if(!this.isHierarchicalFacet(e))return!1;var r=this.getHierarchicalRefinement(e);return t?-1!==r.indexOf(t):r.length>0},isNumericRefined:function(e,t,r){if(void 0===r&&void 0===t)return!!this.numericRefinements[e];var n=this.numericRefinements[e]&&void 0!==this.numericRefinements[e][t];if(void 0===r||!n)return n;var i,a,u=c(r),o=void 0!==(i=this.numericRefinements[e][t],a=u,s(i,(function(e){return 
l(e,a)})));return n&&o},isTagRefined:function(e){return-1!==this.tagRefinements.indexOf(e)},getRefinedDisjunctiveFacets:function(){var e=this,t=a(Object.keys(this.numericRefinements).filter((function(t){return Object.keys(e.numericRefinements[t]).length>0})),this.disjunctiveFacets);return Object.keys(this.disjunctiveFacetsRefinements).filter((function(t){return e.disjunctiveFacetsRefinements[t].length>0})).concat(t).concat(this.getRefinedHierarchicalFacets())},getRefinedHierarchicalFacets:function(){var e=this;return a(this.hierarchicalFacets.map((function(e){return e.name})),Object.keys(this.hierarchicalFacetsRefinements).filter((function(t){return e.hierarchicalFacetsRefinements[t].length>0})))},getUnrefinedDisjunctiveFacets:function(){var e=this.getRefinedDisjunctiveFacets();return this.disjunctiveFacets.filter((function(t){return-1===e.indexOf(t)}))},managedParameters:["index","facets","disjunctiveFacets","facetsRefinements","hierarchicalFacets","facetsExcludes","disjunctiveFacetsRefinements","numericRefinements","tagRefinements","hierarchicalFacetsRefinements"],getQueryParams:function(){var e=this.managedParameters,t={},r=this;return Object.keys(this).forEach((function(n){var i=r[n];-1===e.indexOf(n)&&void 0!==i&&(t[n]=i)})),t},setQueryParameter:function(e,t){if(this[e]===t)return this;var r={};return r[e]=t,this.setQueryParameters(r)},setQueryParameters:function(e){if(!e)return this;var t=m.validate(this,e);if(t)throw t;var r=this,n=m._parseNumbers(e),i=Object.keys(this).reduce((function(e,t){return e[t]=r[t],e}),{}),a=Object.keys(n).reduce((function(e,t){var r=void 0!==e[t],i=void 0!==n[t];return r&&!i?u(e,[t]):(i&&(e[t]=n[t]),e)}),i);return new this.constructor(a)},resetPage:function(){return void 0===this.page?this:this.setPage(0)},_getHierarchicalFacetSortBy:function(e){return e.sortBy||["isRefined:desc","name:asc"]},_getHierarchicalFacetSeparator:function(e){return e.separator||" > "},_getHierarchicalRootPath:function(e){return e.rootPath||null},_getHierarchicalShowParentLevel:function(e){return"boolean"!=typeof e.showParentLevel||e.showParentLevel},getHierarchicalFacetByName:function(e){return s(this.hierarchicalFacets,(function(t){return t.name===e}))},getHierarchicalFacetBreadcrumb:function(e){if(!this.isHierarchicalFacet(e))return[];var t=this.getHierarchicalRefinement(e)[0];if(!t)return[];var r=this._getHierarchicalFacetSeparator(this.getHierarchicalFacetByName(e));return t.split(r).map((function(e){return e.trim()}))},toString:function(){return JSON.stringify(this,null,2)}},e.exports=m},10210:(e,t,r)=>{"use strict";e.exports=function(e){return function(t,r){var s=e.hierarchicalFacets[r],o=e.hierarchicalFacetsRefinements[s.name]&&e.hierarchicalFacetsRefinements[s.name][0]||"",h=e._getHierarchicalFacetSeparator(s),f=e._getHierarchicalRootPath(s),l=e._getHierarchicalShowParentLevel(s),m=a(e._getHierarchicalFacetSortBy(s)),d=t.every((function(e){return e.exhaustive})),p=function(e,t,r,a,s){return function(o,h,f){var l=o;if(f>0){var m=0;for(l=o;m{"use strict";var n=r(60185),i=r(52344),a=r(42148),s=r(74587),c=r(7888),u=r(69725),o=r(82293),h=r(94039),f=h.escapeFacetValue,l=h.unescapeFacetValue,m=r(10210);function d(e){var t={};return e.forEach((function(e,r){t[e]=r})),t}function p(e,t,r){t&&t[r]&&(e.stats=t[r])}function v(e,t,r){var a=t[0];this._rawResults=t;var o=this;Object.keys(a).forEach((function(e){o[e]=a[e]})),Object.keys(r||{}).forEach((function(e){o[e]=r[e]})),this.processingTimeMS=t.reduce((function(e,t){return void 
0===t.processingTimeMS?e:e+t.processingTimeMS}),0),this.disjunctiveFacets=[],this.hierarchicalFacets=e.hierarchicalFacets.map((function(){return[]})),this.facets=[];var h=e.getRefinedDisjunctiveFacets(),f=d(e.facets),v=d(e.disjunctiveFacets),g=1,y=a.facets||{};Object.keys(y).forEach((function(t){var r,n,i=y[t],s=(r=e.hierarchicalFacets,n=t,c(r,(function(e){return(e.attributes||[]).indexOf(n)>-1})));if(s){var h=s.attributes.indexOf(t),l=u(e.hierarchicalFacets,(function(e){return e.name===s.name}));o.hierarchicalFacets[l][h]={attribute:t,data:i,exhaustive:a.exhaustiveFacetsCount}}else{var m,d=-1!==e.disjunctiveFacets.indexOf(t),g=-1!==e.facets.indexOf(t);d&&(m=v[t],o.disjunctiveFacets[m]={name:t,data:i,exhaustive:a.exhaustiveFacetsCount},p(o.disjunctiveFacets[m],a.facets_stats,t)),g&&(m=f[t],o.facets[m]={name:t,data:i,exhaustive:a.exhaustiveFacetsCount},p(o.facets[m],a.facets_stats,t))}})),this.hierarchicalFacets=s(this.hierarchicalFacets),h.forEach((function(r){var s=t[g],c=s&&s.facets?s.facets:{},h=e.getHierarchicalFacetByName(r);Object.keys(c).forEach((function(t){var r,f=c[t];if(h){r=u(e.hierarchicalFacets,(function(e){return e.name===h.name}));var m=u(o.hierarchicalFacets[r],(function(e){return e.attribute===t}));if(-1===m)return;o.hierarchicalFacets[r][m].data=n({},o.hierarchicalFacets[r][m].data,f)}else{r=v[t];var d=a.facets&&a.facets[t]||{};o.disjunctiveFacets[r]={name:t,data:i({},f,d),exhaustive:s.exhaustiveFacetsCount},p(o.disjunctiveFacets[r],s.facets_stats,t),e.disjunctiveFacetsRefinements[t]&&e.disjunctiveFacetsRefinements[t].forEach((function(n){!o.disjunctiveFacets[r].data[n]&&e.disjunctiveFacetsRefinements[t].indexOf(l(n))>-1&&(o.disjunctiveFacets[r].data[n]=0)}))}})),g++})),e.getRefinedHierarchicalFacets().forEach((function(r){var n=e.getHierarchicalFacetByName(r),a=e._getHierarchicalFacetSeparator(n),s=e.getHierarchicalRefinement(r);0===s.length||s[0].split(a).length<2||t.slice(g).forEach((function(t){var r=t&&t.facets?t.facets:{};Object.keys(r).forEach((function(t){var c=r[t],h=u(e.hierarchicalFacets,(function(e){return e.name===n.name})),f=u(o.hierarchicalFacets[h],(function(e){return e.attribute===t}));if(-1!==f){var l={};if(s.length>0){var m=s[0].split(a)[0];l[m]=o.hierarchicalFacets[h][f].data[m]}o.hierarchicalFacets[h][f].data=i(l,c,o.hierarchicalFacets[h][f].data)}})),g++}))})),Object.keys(e.facetsExcludes).forEach((function(t){var r=e.facetsExcludes[t],n=f[t];o.facets[n]={name:t,data:y[t],exhaustive:a.exhaustiveFacetsCount},r.forEach((function(e){o.facets[n]=o.facets[n]||{name:t},o.facets[n].data=o.facets[n].data||{},o.facets[n].data[e]=0}))})),this.hierarchicalFacets=this.hierarchicalFacets.map(m(e)),this.facets=s(this.facets),this.disjunctiveFacets=s(this.disjunctiveFacets),this._state=e}function g(e,t){function r(e){return e.name===t}if(e._state.isConjunctiveFacet(t)){var n=c(e.facets,r);return n?Object.keys(n.data).map((function(r){var i=f(r);return{name:r,escapedValue:i,count:n.data[r],isRefined:e._state.isFacetRefined(t,i),isExcluded:e._state.isExcludeRefined(t,r)}})):[]}if(e._state.isDisjunctiveFacet(t)){var i=c(e.disjunctiveFacets,r);return i?Object.keys(i.data).map((function(r){var n=f(r);return{name:r,escapedValue:n,count:i.data[r],isRefined:e._state.isDisjunctiveFacetRefined(t,n)}})):[]}if(e._state.isHierarchicalFacet(t)){var a=c(e.hierarchicalFacets,r);if(!a)return a;var s=e._state.getHierarchicalFacetByName(t),u=l(e._state.getHierarchicalRefinement(t)[0]||"").split(e._state._getHierarchicalFacetSeparator(s));return u.unshift(t),y(a,u,0),a}}function 
y(e,t,r){e.isRefined=e.name===t[r],e.data&&e.data.forEach((function(e){y(e,t,r+1)}))}function R(e,t,r,n){if(n=n||0,Array.isArray(t))return e(t,r[n]);if(!t.data||0===t.data.length)return t;var a=t.data.map((function(t){return R(e,t,r,n+1)})),s=e(a,r[n]);return i({data:s},t)}function F(e,t){var r=c(e,(function(e){return e.name===t}));return r&&r.stats}function b(e,t,r,n,i){var a=c(i,(function(e){return e.name===r})),s=a&&a.data&&a.data[n]?a.data[n]:0,u=a&&a.exhaustive||!1;return{type:t,attributeName:r,name:n,count:s,exhaustive:u}}v.prototype.getFacetByName=function(e){function t(t){return t.name===e}return c(this.facets,t)||c(this.disjunctiveFacets,t)||c(this.hierarchicalFacets,t)},v.DEFAULT_SORT=["isRefined:desc","count:desc","name:asc"],v.prototype.getFacetValues=function(e,t){var r=g(this,e);if(r){var n,s=i({},t,{sortBy:v.DEFAULT_SORT,facetOrdering:!(t&&t.sortBy)}),c=this;if(Array.isArray(r))n=[e];else n=c._state.getHierarchicalFacetByName(r.name).attributes;return R((function(e,t){if(s.facetOrdering){var r=function(e,t){return e.renderingContent&&e.renderingContent.facetOrdering&&e.renderingContent.facetOrdering.values&&e.renderingContent.facetOrdering.values[t]}(c,t);if(Boolean(r))return function(e,t){var r=[],n=[],i=(t.order||[]).reduce((function(e,t,r){return e[t]=r,e}),{});e.forEach((function(e){var t=e.path||e.name;void 0!==i[t]?r[i[t]]=e:n.push(e)})),r=r.filter((function(e){return e}));var s,c=t.sortRemainingBy;return"hidden"===c?r:(s="alpha"===c?[["path","name"],["asc","asc"]]:[["count"],["desc"]],r.concat(a(n,s[0],s[1])))}(e,r)}if(Array.isArray(s.sortBy)){var n=o(s.sortBy,v.DEFAULT_SORT);return a(e,n[0],n[1])}if("function"==typeof s.sortBy)return function(e,t){return t.sort(e)}(s.sortBy,e);throw new Error("options.sortBy is optional but if defined it must be either an array of string (predicates) or a sorting function")}),r,n)}},v.prototype.getFacetStats=function(e){return this._state.isConjunctiveFacet(e)?F(this.facets,e):this._state.isDisjunctiveFacet(e)?F(this.disjunctiveFacets,e):void 0},v.prototype.getRefinements=function(){var e=this._state,t=this,r=[];return Object.keys(e.facetsRefinements).forEach((function(n){e.facetsRefinements[n].forEach((function(i){r.push(b(e,"facet",n,i,t.facets))}))})),Object.keys(e.facetsExcludes).forEach((function(n){e.facetsExcludes[n].forEach((function(i){r.push(b(e,"exclude",n,i,t.facets))}))})),Object.keys(e.disjunctiveFacetsRefinements).forEach((function(n){e.disjunctiveFacetsRefinements[n].forEach((function(i){r.push(b(e,"disjunctive",n,i,t.disjunctiveFacets))}))})),Object.keys(e.hierarchicalFacetsRefinements).forEach((function(n){e.hierarchicalFacetsRefinements[n].forEach((function(i){r.push(function(e,t,r,n){var i=e.getHierarchicalFacetByName(t),a=e._getHierarchicalFacetSeparator(i),s=r.split(a),u=c(n,(function(e){return e.name===t})),o=s.reduce((function(e,t){var r=e&&c(e.data,(function(e){return e.name===t}));return void 0!==r?r:e}),u),h=o&&o.count||0,f=o&&o.exhaustive||!1,l=o&&o.path||"";return{type:"hierarchical",attributeName:t,name:l,count:h,exhaustive:f}}(e,n,i,t.hierarchicalFacets))}))})),Object.keys(e.numericRefinements).forEach((function(t){var n=e.numericRefinements[t];Object.keys(n).forEach((function(e){n[e].forEach((function(n){r.push({type:"numeric",attributeName:t,name:n,numericValue:n,operator:e})}))}))})),e.tagRefinements.forEach((function(e){r.push({type:"tag",attributeName:"_tags",name:e})})),r},e.exports=v},49374:(e,t,r)=>{"use strict";var 
n=r(17775),i=r(23076),a=r(68078),s=r(96394),c=r(17331),u=r(14853),o=r(90116),h=r(49803),f=r(60185),l=r(24336),m=r(94039).escapeFacetValue;function d(e,t,r){"function"==typeof e.addAlgoliaAgent&&e.addAlgoliaAgent("JS Helper ("+l+")"),this.setClient(e);var i=r||{};i.index=t,this.state=n.make(i),this.lastResults=null,this._queryId=0,this._lastQueryIdReceived=-1,this.derivedHelpers=[],this._currentNbQueries=0}function p(e){if(e<0)throw new Error("Page requested below 0.");return this._change({state:this.state.setPage(e),isPageReset:!1}),this}function v(){return this.state.page}u(d,c),d.prototype.search=function(){return this._search({onlyWithDerivedHelpers:!1}),this},d.prototype.searchOnlyWithDerivedHelpers=function(){return this._search({onlyWithDerivedHelpers:!0}),this},d.prototype.getQuery=function(){var e=this.state;return s._getHitsSearchParams(e)},d.prototype.searchOnce=function(e,t){var r=e?this.state.setQueryParameters(e):this.state,n=s._getQueries(r.index,r),a=this;if(this._currentNbQueries++,this.emit("searchOnce",{state:r}),!t)return this.client.search(n).then((function(e){return a._currentNbQueries--,0===a._currentNbQueries&&a.emit("searchQueueEmpty"),{content:new i(r,e.results),state:r,_originalResponse:e}}),(function(e){throw a._currentNbQueries--,0===a._currentNbQueries&&a.emit("searchQueueEmpty"),e}));this.client.search(n).then((function(e){a._currentNbQueries--,0===a._currentNbQueries&&a.emit("searchQueueEmpty"),t(null,new i(r,e.results),r)})).catch((function(e){a._currentNbQueries--,0===a._currentNbQueries&&a.emit("searchQueueEmpty"),t(e,null,r)}))},d.prototype.findAnswers=function(e){console.warn("[algoliasearch-helper] answers is no longer supported");var t=this.state,r=this.derivedHelpers[0];if(!r)return Promise.resolve([]);var n=r.getModifiedState(t),i=f({attributesForPrediction:e.attributesForPrediction,nbHits:e.nbHits},{params:h(s._getHitsSearchParams(n),["attributesToSnippet","hitsPerPage","restrictSearchableAttributes","snippetEllipsisText"])}),a="search for answers was called, but this client does not have a function client.initIndex(index).findAnswers";if("function"!=typeof this.client.initIndex)throw new Error(a);var c=this.client.initIndex(n.index);if("function"!=typeof c.findAnswers)throw new Error(a);return c.findAnswers(n.query,e.queryLanguages,i)},d.prototype.searchForFacetValues=function(e,t,r,n){var i="function"==typeof this.client.searchForFacetValues,a="function"==typeof this.client.initIndex;if(!i&&!a&&"function"!=typeof this.client.search)throw new Error("search for facet values (searchable) was called, but this client does not have a function client.searchForFacetValues or client.initIndex(index).searchForFacetValues");var c=this.state.setQueryParameters(n||{}),u=c.isDisjunctiveFacet(e),o=s.getSearchForFacetQuery(e,t,r,c);this._currentNbQueries++;var h,f=this;return i?h=this.client.searchForFacetValues([{indexName:c.index,params:o}]):a?h=this.client.initIndex(c.index).searchForFacetValues(o):(delete o.facetName,h=this.client.search([{type:"facet",facet:e,indexName:c.index,params:o}]).then((function(e){return e.results[0]}))),this.emit("searchForFacetValues",{state:c,facet:e,query:t}),h.then((function(t){return f._currentNbQueries--,0===f._currentNbQueries&&f.emit("searchQueueEmpty"),(t=Array.isArray(t)?t[0]:t).facetHits.forEach((function(t){t.escapedValue=m(t.value),t.isRefined=u?c.isDisjunctiveFacetRefined(e,t.escapedValue):c.isFacetRefined(e,t.escapedValue)})),t}),(function(e){throw 
f._currentNbQueries--,0===f._currentNbQueries&&f.emit("searchQueueEmpty"),e}))},d.prototype.setQuery=function(e){return this._change({state:this.state.resetPage().setQuery(e),isPageReset:!0}),this},d.prototype.clearRefinements=function(e){return this._change({state:this.state.resetPage().clearRefinements(e),isPageReset:!0}),this},d.prototype.clearTags=function(){return this._change({state:this.state.resetPage().clearTags(),isPageReset:!0}),this},d.prototype.addDisjunctiveFacetRefinement=function(e,t){return this._change({state:this.state.resetPage().addDisjunctiveFacetRefinement(e,t),isPageReset:!0}),this},d.prototype.addDisjunctiveRefine=function(){return this.addDisjunctiveFacetRefinement.apply(this,arguments)},d.prototype.addHierarchicalFacetRefinement=function(e,t){return this._change({state:this.state.resetPage().addHierarchicalFacetRefinement(e,t),isPageReset:!0}),this},d.prototype.addNumericRefinement=function(e,t,r){return this._change({state:this.state.resetPage().addNumericRefinement(e,t,r),isPageReset:!0}),this},d.prototype.addFacetRefinement=function(e,t){return this._change({state:this.state.resetPage().addFacetRefinement(e,t),isPageReset:!0}),this},d.prototype.addRefine=function(){return this.addFacetRefinement.apply(this,arguments)},d.prototype.addFacetExclusion=function(e,t){return this._change({state:this.state.resetPage().addExcludeRefinement(e,t),isPageReset:!0}),this},d.prototype.addExclude=function(){return this.addFacetExclusion.apply(this,arguments)},d.prototype.addTag=function(e){return this._change({state:this.state.resetPage().addTagRefinement(e),isPageReset:!0}),this},d.prototype.removeNumericRefinement=function(e,t,r){return this._change({state:this.state.resetPage().removeNumericRefinement(e,t,r),isPageReset:!0}),this},d.prototype.removeDisjunctiveFacetRefinement=function(e,t){return this._change({state:this.state.resetPage().removeDisjunctiveFacetRefinement(e,t),isPageReset:!0}),this},d.prototype.removeDisjunctiveRefine=function(){return this.removeDisjunctiveFacetRefinement.apply(this,arguments)},d.prototype.removeHierarchicalFacetRefinement=function(e){return this._change({state:this.state.resetPage().removeHierarchicalFacetRefinement(e),isPageReset:!0}),this},d.prototype.removeFacetRefinement=function(e,t){return this._change({state:this.state.resetPage().removeFacetRefinement(e,t),isPageReset:!0}),this},d.prototype.removeRefine=function(){return this.removeFacetRefinement.apply(this,arguments)},d.prototype.removeFacetExclusion=function(e,t){return this._change({state:this.state.resetPage().removeExcludeRefinement(e,t),isPageReset:!0}),this},d.prototype.removeExclude=function(){return this.removeFacetExclusion.apply(this,arguments)},d.prototype.removeTag=function(e){return this._change({state:this.state.resetPage().removeTagRefinement(e),isPageReset:!0}),this},d.prototype.toggleFacetExclusion=function(e,t){return this._change({state:this.state.resetPage().toggleExcludeFacetRefinement(e,t),isPageReset:!0}),this},d.prototype.toggleExclude=function(){return this.toggleFacetExclusion.apply(this,arguments)},d.prototype.toggleRefinement=function(e,t){return this.toggleFacetRefinement(e,t)},d.prototype.toggleFacetRefinement=function(e,t){return this._change({state:this.state.resetPage().toggleFacetRefinement(e,t),isPageReset:!0}),this},d.prototype.toggleRefine=function(){return this.toggleFacetRefinement.apply(this,arguments)},d.prototype.toggleTag=function(e){return 
this._change({state:this.state.resetPage().toggleTagRefinement(e),isPageReset:!0}),this},d.prototype.nextPage=function(){var e=this.state.page||0;return this.setPage(e+1)},d.prototype.previousPage=function(){var e=this.state.page||0;return this.setPage(e-1)},d.prototype.setCurrentPage=p,d.prototype.setPage=p,d.prototype.setIndex=function(e){return this._change({state:this.state.resetPage().setIndex(e),isPageReset:!0}),this},d.prototype.setQueryParameter=function(e,t){return this._change({state:this.state.resetPage().setQueryParameter(e,t),isPageReset:!0}),this},d.prototype.setState=function(e){return this._change({state:n.make(e),isPageReset:!1}),this},d.prototype.overrideStateWithoutTriggeringChangeEvent=function(e){return this.state=new n(e),this},d.prototype.hasRefinements=function(e){return!!o(this.state.getNumericRefinements(e))||(this.state.isConjunctiveFacet(e)?this.state.isFacetRefined(e):this.state.isDisjunctiveFacet(e)?this.state.isDisjunctiveFacetRefined(e):!!this.state.isHierarchicalFacet(e)&&this.state.isHierarchicalFacetRefined(e))},d.prototype.isExcluded=function(e,t){return this.state.isExcludeRefined(e,t)},d.prototype.isDisjunctiveRefined=function(e,t){return this.state.isDisjunctiveFacetRefined(e,t)},d.prototype.hasTag=function(e){return this.state.isTagRefined(e)},d.prototype.isTagRefined=function(){return this.hasTagRefinements.apply(this,arguments)},d.prototype.getIndex=function(){return this.state.index},d.prototype.getCurrentPage=v,d.prototype.getPage=v,d.prototype.getTags=function(){return this.state.tagRefinements},d.prototype.getRefinements=function(e){var t=[];if(this.state.isConjunctiveFacet(e))this.state.getConjunctiveRefinements(e).forEach((function(e){t.push({value:e,type:"conjunctive"})})),this.state.getExcludeRefinements(e).forEach((function(e){t.push({value:e,type:"exclude"})}));else if(this.state.isDisjunctiveFacet(e)){this.state.getDisjunctiveRefinements(e).forEach((function(e){t.push({value:e,type:"disjunctive"})}))}var r=this.state.getNumericRefinements(e);return Object.keys(r).forEach((function(e){var n=r[e];t.push({value:n,operator:e,type:"numeric"})})),t},d.prototype.getNumericRefinement=function(e,t){return this.state.getNumericRefinement(e,t)},d.prototype.getHierarchicalFacetBreadcrumb=function(e){return this.state.getHierarchicalFacetBreadcrumb(e)},d.prototype._search=function(e){var t=this.state,r=[],n=[];e.onlyWithDerivedHelpers||(n=s._getQueries(t.index,t),r.push({state:t,queriesCount:n.length,helper:this}),this.emit("search",{state:t,results:this.lastResults}));var i=this.derivedHelpers.map((function(e){var n=e.getModifiedState(t),i=n.index?s._getQueries(n.index,n):[];return r.push({state:n,queriesCount:i.length,helper:e}),e.emit("search",{state:n,results:e.lastResults}),i})),a=Array.prototype.concat.apply(n,i),c=this._queryId++;if(this._currentNbQueries++,!a.length)return Promise.resolve({results:[]}).then(this._dispatchAlgoliaResponse.bind(this,r,c));try{this.client.search(a).then(this._dispatchAlgoliaResponse.bind(this,r,c)).catch(this._dispatchAlgoliaError.bind(this,c))}catch(u){this.emit("error",{error:u})}},d.prototype._dispatchAlgoliaResponse=function(e,t,r){if(!(t0},d.prototype._change=function(e){var t=e.state,r=e.isPageReset;t!==this.state&&(this.state=t,this.emit("change",{state:this.state,results:this.lastResults,isPageReset:r}))},d.prototype.clearCache=function(){return this.client.clearCache&&this.client.clearCache(),this},d.prototype.setClient=function(e){return this.client===e||("function"==typeof 
e.addAlgoliaAgent&&e.addAlgoliaAgent("JS Helper ("+l+")"),this.client=e),this},d.prototype.getClient=function(){return this.client},d.prototype.derive=function(e){var t=new a(this,e);return this.derivedHelpers.push(t),t},d.prototype.detachDerivedHelper=function(e){var t=this.derivedHelpers.indexOf(e);if(-1===t)throw new Error("Derived helper already detached");this.derivedHelpers.splice(t,1)},d.prototype.hasPendingRequests=function(){return this._currentNbQueries>0},e.exports=d},74587:e=>{"use strict";e.exports=function(e){return Array.isArray(e)?e.filter(Boolean):[]}},52344:e=>{"use strict";e.exports=function(){return Array.prototype.slice.call(arguments).reduceRight((function(e,t){return Object.keys(Object(t)).forEach((function(r){void 0!==t[r]&&(void 0!==e[r]&&delete e[r],e[r]=t[r])})),e}),{})}},94039:e=>{"use strict";e.exports={escapeFacetValue:function(e){return"string"!=typeof e?e:String(e).replace(/^-/,"\\-")},unescapeFacetValue:function(e){return"string"!=typeof e?e:e.replace(/^\\-/,"-")}}},7888:e=>{"use strict";e.exports=function(e,t){if(Array.isArray(e))for(var r=0;r{"use strict";e.exports=function(e,t){if(!Array.isArray(e))return-1;for(var r=0;r{"use strict";var n=r(7888);e.exports=function(e,t){var r=(t||[]).map((function(e){return e.split(":")}));return e.reduce((function(e,t){var i=t.split(":"),a=n(r,(function(e){return e[0]===i[0]}));return i.length>1||!a?(e[0].push(i[0]),e[1].push(i[1]),e):(e[0].push(a[0]),e[1].push(a[1]),e)}),[[],[]])}},14853:e=>{"use strict";e.exports=function(e,t){e.prototype=Object.create(t.prototype,{constructor:{value:e,enumerable:!1,writable:!0,configurable:!0}})}},22686:e=>{"use strict";e.exports=function(e,t){return e.filter((function(r,n){return t.indexOf(r)>-1&&e.indexOf(r)===n}))}},60185:e=>{"use strict";function t(e){return"function"==typeof e||Array.isArray(e)||"[object Object]"===Object.prototype.toString.call(e)}function r(e,n){if(e===n)return e;for(var i in n)if(Object.prototype.hasOwnProperty.call(n,i)&&"__proto__"!==i&&"constructor"!==i){var a=n[i],s=e[i];void 0!==s&&void 0===a||(t(s)&&t(a)?e[i]=r(s,a):e[i]="object"==typeof(c=a)&&null!==c?r(Array.isArray(c)?[]:{},c):c)}var c;return e}e.exports=function(e){t(e)||(e={});for(var n=1,i=arguments.length;n{"use strict";e.exports=function(e){return e&&Object.keys(e).length>0}},49803:e=>{"use strict";e.exports=function(e,t){if(null===e)return{};var r,n,i={},a=Object.keys(e);for(n=0;n=0||(i[r]=e[r]);return i}},42148:e=>{"use strict";function t(e,t){if(e!==t){var r=void 0!==e,n=null===e,i=void 0!==t,a=null===t;if(!a&&e>t||n&&i||!r)return 1;if(!n&&e=n.length?a:"desc"===n[i]?-a:a}return e.index-r.index})),i.map((function(e){return e.value}))}},28023:e=>{"use strict";e.exports=function e(t){if("number"==typeof t)return t;if("string"==typeof t)return parseFloat(t);if(Array.isArray(t))return t.map(e);throw new Error("The value should be a number, a parsable string or an array of those.")}},96394:(e,t,r)=>{"use strict";var n=r(60185);function i(e){return Object.keys(e).sort((function(e,t){return e.localeCompare(t)})).reduce((function(t,r){return t[r]=e[r],t}),{})}var a={_getQueries:function(e,t){var r=[];return r.push({indexName:e,params:a._getHitsSearchParams(t)}),t.getRefinedDisjunctiveFacets().forEach((function(n){r.push({indexName:e,params:a._getDisjunctiveFacetSearchParams(t,n)})})),t.getRefinedHierarchicalFacets().forEach((function(n){var i=t.getHierarchicalFacetByName(n),s=t.getHierarchicalRefinement(n),c=t._getHierarchicalFacetSeparator(i);if(s.length>0&&s[0].split(c).length>1){var 
u=s[0].split(c).slice(0,-1).reduce((function(e,t,r){return e.concat({attribute:i.attributes[r],value:0===r?t:[e[e.length-1].value,t].join(c)})}),[]);u.forEach((function(n,s){var c=a._getDisjunctiveFacetSearchParams(t,n.attribute,0===s);function o(e){return i.attributes.some((function(t){return t===e.split(":")[0]}))}var h=(c.facetFilters||[]).reduce((function(e,t){if(Array.isArray(t)){var r=t.filter((function(e){return!o(e)}));r.length>0&&e.push(r)}return"string"!=typeof t||o(t)||e.push(t),e}),[]),f=u[s-1];c.facetFilters=s>0?h.concat(f.attribute+":"+f.value):h.length>0?h:void 0,r.push({indexName:e,params:c})}))}})),r},_getHitsSearchParams:function(e){var t=e.facets.concat(e.disjunctiveFacets).concat(a._getHitsHierarchicalFacetsAttributes(e)),r=a._getFacetFilters(e),s=a._getNumericFilters(e),c=a._getTagFilters(e),u={facets:t.indexOf("*")>-1?["*"]:t,tagFilters:c};return r.length>0&&(u.facetFilters=r),s.length>0&&(u.numericFilters=s),i(n({},e.getQueryParams(),u))},_getDisjunctiveFacetSearchParams:function(e,t,r){var s=a._getFacetFilters(e,t,r),c=a._getNumericFilters(e,t),u=a._getTagFilters(e),o={hitsPerPage:0,page:0,analytics:!1,clickAnalytics:!1};u.length>0&&(o.tagFilters=u);var h=e.getHierarchicalFacetByName(t);return o.facets=h?a._getDisjunctiveHierarchicalFacetAttribute(e,h,r):t,c.length>0&&(o.numericFilters=c),s.length>0&&(o.facetFilters=s),i(n({},e.getQueryParams(),o))},_getNumericFilters:function(e,t){if(e.numericFilters)return e.numericFilters;var r=[];return Object.keys(e.numericRefinements).forEach((function(n){var i=e.numericRefinements[n]||{};Object.keys(i).forEach((function(e){var a=i[e]||[];t!==n&&a.forEach((function(t){if(Array.isArray(t)){var i=t.map((function(t){return n+e+t}));r.push(i)}else r.push(n+e+t)}))}))})),r},_getTagFilters:function(e){return e.tagFilters?e.tagFilters:e.tagRefinements.join(",")},_getFacetFilters:function(e,t,r){var n=[],i=e.facetsRefinements||{};Object.keys(i).forEach((function(e){(i[e]||[]).forEach((function(t){n.push(e+":"+t)}))}));var a=e.facetsExcludes||{};Object.keys(a).forEach((function(e){(a[e]||[]).forEach((function(t){n.push(e+":-"+t)}))}));var s=e.disjunctiveFacetsRefinements||{};Object.keys(s).forEach((function(e){var r=s[e]||[];if(e!==t&&r&&0!==r.length){var i=[];r.forEach((function(t){i.push(e+":"+t)})),n.push(i)}}));var c=e.hierarchicalFacetsRefinements||{};return Object.keys(c).forEach((function(i){var a=(c[i]||[])[0];if(void 0!==a){var s,u,o=e.getHierarchicalFacetByName(i),h=e._getHierarchicalFacetSeparator(o),f=e._getHierarchicalRootPath(o);if(t===i){if(-1===a.indexOf(h)||!f&&!0===r||f&&f.split(h).length===a.split(h).length)return;f?(u=f.split(h).length-1,a=f):(u=a.split(h).length-2,a=a.slice(0,a.lastIndexOf(h))),s=o.attributes[u]}else u=a.split(h).length-1,s=o.attributes[u];s&&n.push([s+":"+a])}})),n},_getHitsHierarchicalFacetsAttributes:function(e){return e.hierarchicalFacets.reduce((function(t,r){var n=e.getHierarchicalRefinement(r.name)[0];if(!n)return t.push(r.attributes[0]),t;var i=e._getHierarchicalFacetSeparator(r),a=n.split(i).length,s=r.attributes.slice(0,a+1);return t.concat(s)}),[])},_getDisjunctiveHierarchicalFacetAttribute:function(e,t,r){var n=e._getHierarchicalFacetSeparator(t);if(!0===r){var i=e._getHierarchicalRootPath(t),a=0;return i&&(a=i.split(n).length),[t.attributes[a]]}var s=(e.getHierarchicalRefinement(t.name)[0]||"").split(n).length-1;return t.attributes.slice(0,s+1)},getSearchForFacetQuery:function(e,t,r,s){var 
c=s.isDisjunctiveFacet(e)?s.clearRefinements(e):s,u={facetQuery:t,facetName:e};return"number"==typeof r&&(u.maxFacetHits=r),i(n({},a._getHitsSearchParams(c),u))}};e.exports=a},46801:e=>{"use strict";e.exports=function(e){return null!==e&&/^[a-zA-Z0-9_-]{1,64}$/.test(e)}},24336:e=>{"use strict";e.exports="3.13.2"},70290:function(e){e.exports=function(){"use strict";function e(e,t,r){return t in e?Object.defineProperty(e,t,{value:r,enumerable:!0,configurable:!0,writable:!0}):e[t]=r,e}function t(e,t){var r=Object.keys(e);if(Object.getOwnPropertySymbols){var n=Object.getOwnPropertySymbols(e);t&&(n=n.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),r.push.apply(r,n)}return r}function r(r){for(var n=1;n=0||(i[r]=e[r]);return i}(e,t);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,r)&&(i[r]=e[r])}return i}function i(e,t){return function(e){if(Array.isArray(e))return e}(e)||function(e,t){if(Symbol.iterator in Object(e)||"[object Arguments]"===Object.prototype.toString.call(e)){var r=[],n=!0,i=!1,a=void 0;try{for(var s,c=e[Symbol.iterator]();!(n=(s=c.next()).done)&&(r.push(s.value),!t||r.length!==t);n=!0);}catch(e){i=!0,a=e}finally{try{n||null==c.return||c.return()}finally{if(i)throw a}}return r}}(e,t)||function(){throw new TypeError("Invalid attempt to destructure non-iterable instance")}()}function a(e){return function(e){if(Array.isArray(e)){for(var t=0,r=new Array(e.length);t2&&void 0!==arguments[2]?arguments[2]:{miss:function(){return Promise.resolve()}};return Promise.resolve().then((function(){var r=JSON.stringify(e),n=a()[r];return Promise.all([n||t(),void 0!==n])})).then((function(e){var t=i(e,2),n=t[0],a=t[1];return Promise.all([n,a||r.miss(n)])})).then((function(e){return i(e,1)[0]}))},set:function(e,t){return Promise.resolve().then((function(){var i=a();return i[JSON.stringify(e)]=t,n().setItem(r,JSON.stringify(i)),t}))},delete:function(e){return Promise.resolve().then((function(){var t=a();delete t[JSON.stringify(e)],n().setItem(r,JSON.stringify(t))}))},clear:function(){return Promise.resolve().then((function(){n().removeItem(r)}))}}}function c(e){var t=a(e.caches),r=t.shift();return void 0===r?{get:function(e,t){var r=arguments.length>2&&void 0!==arguments[2]?arguments[2]:{miss:function(){return Promise.resolve()}};return t().then((function(e){return Promise.all([e,r.miss(e)])})).then((function(e){return i(e,1)[0]}))},set:function(e,t){return Promise.resolve(t)},delete:function(e){return Promise.resolve()},clear:function(){return Promise.resolve()}}:{get:function(e,n){var i=arguments.length>2&&void 0!==arguments[2]?arguments[2]:{miss:function(){return Promise.resolve()}};return r.get(e,n,i).catch((function(){return c({caches:t}).get(e,n,i)}))},set:function(e,n){return r.set(e,n).catch((function(){return c({caches:t}).set(e,n)}))},delete:function(e){return r.delete(e).catch((function(){return c({caches:t}).delete(e)}))},clear:function(){return r.clear().catch((function(){return c({caches:t}).clear()}))}}}function u(){var e=arguments.length>0&&void 0!==arguments[0]?arguments[0]:{serializable:!0},t={};return{get:function(r,n){var i=arguments.length>2&&void 0!==arguments[2]?arguments[2]:{miss:function(){return Promise.resolve()}},a=JSON.stringify(r);if(a in t)return Promise.resolve(e.serializable?JSON.parse(t[a]):t[a]);var s=n(),c=i&&i.miss||function(){return Promise.resolve()};return s.then((function(e){return c(e)})).then((function(){return s}))},set:function(r,n){return 
t[JSON.stringify(r)]=e.serializable?JSON.stringify(n):n,Promise.resolve(n)},delete:function(e){return delete t[JSON.stringify(e)],Promise.resolve()},clear:function(){return t={},Promise.resolve()}}}function o(e){for(var t=e.length-1;t>0;t--){var r=Math.floor(Math.random()*(t+1)),n=e[t];e[t]=e[r],e[r]=n}return e}function h(e,t){return t?(Object.keys(t).forEach((function(r){e[r]=t[r](e)})),e):e}function f(e){for(var t=arguments.length,r=new Array(t>1?t-1:0),n=1;n0?n:void 0,timeout:r.timeout||t,headers:r.headers||{},queryParameters:r.queryParameters||{},cacheable:r.cacheable}}var d={Read:1,Write:2,Any:3},p=1,v=2,g=3;function y(e){var t=arguments.length>1&&void 0!==arguments[1]?arguments[1]:p;return r(r({},e),{},{status:t,lastUpdate:Date.now()})}function R(e){return"string"==typeof e?{protocol:"https",url:e,accept:d.Any}:{protocol:e.protocol||"https",url:e.url,accept:e.accept||d.Any}}var F="GET",b="POST";function P(e,t){return Promise.all(t.map((function(t){return e.get(t,(function(){return Promise.resolve(y(t))}))}))).then((function(e){var r=e.filter((function(e){return function(e){return e.status===p||Date.now()-e.lastUpdate>12e4}(e)})),n=e.filter((function(e){return function(e){return e.status===g&&Date.now()-e.lastUpdate<=12e4}(e)})),i=[].concat(a(r),a(n));return{getTimeout:function(e,t){return(0===n.length&&0===e?1:n.length+3+e)*t},statelessHosts:i.length>0?i.map((function(e){return R(e)})):t}}))}function j(e,t,n,i){var s=[],c=function(e,t){if(e.method!==F&&(void 0!==e.data||void 0!==t.data)){var n=Array.isArray(e.data)?e.data:r(r({},e.data),t.data);return JSON.stringify(n)}}(n,i),u=function(e,t){var n=r(r({},e.headers),t.headers),i={};return Object.keys(n).forEach((function(e){var t=n[e];i[e.toLowerCase()]=t})),i}(e,i),o=n.method,h=n.method!==F?{}:r(r({},n.data),i.data),f=r(r(r({"x-algolia-agent":e.userAgent.value},e.queryParameters),h),i.queryParameters),l=0,m=function t(r,a){var h=r.pop();if(void 0===h)throw{name:"RetryError",message:"Unreachable hosts - your application id may be incorrect. If the error persists, contact support@algolia.com.",transporterStackTrace:w(s)};var m={data:c,headers:u,method:o,url:E(h,n.path,f),connectTimeout:a(l,e.timeouts.connect),responseTimeout:a(l,i.timeout)},d=function(e){var t={request:m,response:e,host:h,triesLeft:r.length};return s.push(t),t},p={onSuccess:function(e){return function(e){try{return JSON.parse(e.content)}catch(t){throw function(e,t){return{name:"DeserializationError",message:e,response:t}}(t.message,e)}}(e)},onRetry:function(n){var i=d(n);return n.isTimedOut&&l++,Promise.all([e.logger.info("Retryable failure",O(i)),e.hostsCache.set(h,y(h,n.isTimedOut?g:v))]).then((function(){return t(r,a)}))},onFail:function(e){throw d(e),function(e,t){var r=e.content,n=e.status,i=r;try{i=JSON.parse(r).message}catch(e){}return function(e,t,r){return{name:"ApiError",message:e,status:t,transporterStackTrace:r}}(i,n,t)}(e,w(s))}};return e.requester.send(m).then((function(e){return function(e,t){return function(e){var t=e.status;return e.isTimedOut||function(e){var t=e.isTimedOut,r=e.status;return!t&&0==~~r}(e)||2!=~~(t/100)&&4!=~~(t/100)}(e)?t.onRetry(e):2==~~(e.status/100)?t.onSuccess(e):t.onFail(e)}(e,p)}))};return P(e.hostsCache,t).then((function(e){return m(a(e.statelessHosts).reverse(),e.getTimeout)}))}function _(e){var t={value:"Algolia for JavaScript (".concat(e,")"),add:function(e){var r="; ".concat(e.segment).concat(void 0!==e.version?" 
(".concat(e.version,")"):"");return-1===t.value.indexOf(r)&&(t.value="".concat(t.value).concat(r)),t}};return t}function E(e,t,r){var n=x(r),i="".concat(e.protocol,"://").concat(e.url,"/").concat("/"===t.charAt(0)?t.substr(1):t);return n.length&&(i+="?".concat(n)),i}function x(e){return Object.keys(e).map((function(t){return f("%s=%s",t,(r=e[t],"[object Object]"===Object.prototype.toString.call(r)||"[object Array]"===Object.prototype.toString.call(r)?JSON.stringify(e[t]):e[t]));var r})).join("&")}function w(e){return e.map((function(e){return O(e)}))}function O(e){var t=e.request.headers["x-algolia-api-key"]?{"x-algolia-api-key":"*****"}:{};return r(r({},e),{},{request:r(r({},e.request),{},{headers:r(r({},e.request.headers),t)})})}var N=function(e){var t=e.appId,n=function(e,t,r){var n={"x-algolia-api-key":r,"x-algolia-application-id":t};return{headers:function(){return e===l.WithinHeaders?n:{}},queryParameters:function(){return e===l.WithinQueryParameters?n:{}}}}(void 0!==e.authMode?e.authMode:l.WithinHeaders,t,e.apiKey),a=function(e){var t=e.hostsCache,r=e.logger,n=e.requester,a=e.requestsCache,s=e.responsesCache,c=e.timeouts,u=e.userAgent,o=e.hosts,h=e.queryParameters,f={hostsCache:t,logger:r,requester:n,requestsCache:a,responsesCache:s,timeouts:c,userAgent:u,headers:e.headers,queryParameters:h,hosts:o.map((function(e){return R(e)})),read:function(e,t){var r=m(t,f.timeouts.read),n=function(){return j(f,f.hosts.filter((function(e){return 0!=(e.accept&d.Read)})),e,r)};if(!0!==(void 0!==r.cacheable?r.cacheable:e.cacheable))return n();var a={request:e,mappedRequestOptions:r,transporter:{queryParameters:f.queryParameters,headers:f.headers}};return f.responsesCache.get(a,(function(){return f.requestsCache.get(a,(function(){return f.requestsCache.set(a,n()).then((function(e){return Promise.all([f.requestsCache.delete(a),e])}),(function(e){return Promise.all([f.requestsCache.delete(a),Promise.reject(e)])})).then((function(e){var t=i(e,2);return t[0],t[1]}))}))}),{miss:function(e){return f.responsesCache.set(a,e)}})},write:function(e,t){return j(f,f.hosts.filter((function(e){return 0!=(e.accept&d.Write)})),e,m(t,f.timeouts.write))}};return f}(r(r({hosts:[{url:"".concat(t,"-dsn.algolia.net"),accept:d.Read},{url:"".concat(t,".algolia.net"),accept:d.Write}].concat(o([{url:"".concat(t,"-1.algolianet.com")},{url:"".concat(t,"-2.algolianet.com")},{url:"".concat(t,"-3.algolianet.com")}]))},e),{},{headers:r(r(r({},n.headers()),{"content-type":"application/x-www-form-urlencoded"}),e.headers),queryParameters:r(r({},n.queryParameters()),e.queryParameters)}));return h({transporter:a,appId:t,addAlgoliaAgent:function(e,t){a.userAgent.add({segment:e,version:t})},clearCache:function(){return Promise.all([a.requestsCache.clear(),a.responsesCache.clear()]).then((function(){}))}},e.methods)},A=function(e){return function(t,r){return t.method===F?e.transporter.read(t,r):e.transporter.write(t,r)}},H=function(e){return function(t){var r=arguments.length>1&&void 0!==arguments[1]?arguments[1]:{};return h({transporter:e.transporter,appId:e.appId,indexName:t},r.methods)}},S=function(e){return function(t,n){var i=t.map((function(e){return r(r({},e),{},{params:x(e.params||{})})}));return e.transporter.read({method:b,path:"1/indexes/*/queries",data:{requests:i},cacheable:!0},n)}},T=function(e){return function(t,i){return Promise.all(t.map((function(t){var a=t.params,s=a.facetName,c=a.facetQuery,u=n(a,["facetName","facetQuery"]);return 
H(e)(t.indexName,{methods:{searchForFacetValues:k}}).searchForFacetValues(s,c,r(r({},i),u))})))}},Q=function(e){return function(t,r,n){return e.transporter.read({method:b,path:f("1/answers/%s/prediction",e.indexName),data:{query:t,queryLanguages:r},cacheable:!0},n)}},C=function(e){return function(t,r){return e.transporter.read({method:b,path:f("1/indexes/%s/query",e.indexName),data:{query:t},cacheable:!0},r)}},k=function(e){return function(t,r,n){return e.transporter.read({method:b,path:f("1/indexes/%s/facets/%s/query",e.indexName,t),data:{facetQuery:r},cacheable:!0},n)}},I=1,D=2,q=3;function V(e,t,n){var i,a={appId:e,apiKey:t,timeouts:{connect:1,read:2,write:30},requester:{send:function(e){return new Promise((function(t){var r=new XMLHttpRequest;r.open(e.method,e.url,!0),Object.keys(e.headers).forEach((function(t){return r.setRequestHeader(t,e.headers[t])}));var n,i=function(e,n){return setTimeout((function(){r.abort(),t({status:0,content:n,isTimedOut:!0})}),1e3*e)},a=i(e.connectTimeout,"Connection timeout");r.onreadystatechange=function(){r.readyState>r.OPENED&&void 0===n&&(clearTimeout(a),n=i(e.responseTimeout,"Socket timeout"))},r.onerror=function(){0===r.status&&(clearTimeout(a),clearTimeout(n),t({content:r.responseText||"Network request failed",status:r.status,isTimedOut:!1}))},r.onload=function(){clearTimeout(a),clearTimeout(n),t({content:r.responseText,status:r.status,isTimedOut:!1})},r.send(e.data)}))}},logger:(i=q,{debug:function(e,t){return I>=i&&console.debug(e,t),Promise.resolve()},info:function(e,t){return D>=i&&console.info(e,t),Promise.resolve()},error:function(e,t){return console.error(e,t),Promise.resolve()}}),responsesCache:u(),requestsCache:u({serializable:!1}),hostsCache:c({caches:[s({key:"".concat("4.17.2","-").concat(e)}),u()]}),userAgent:_("4.17.2").add({segment:"Browser",version:"lite"}),authMode:l.WithinQueryParameters};return N(r(r(r({},a),n),{},{methods:{search:S,searchForFacetValues:T,multipleQueries:S,multipleSearchForFacetValues:T,customRequest:A,initIndex:function(e){return function(t){return H(e)(t,{methods:{search:C,searchForFacetValues:k,findAnswers:Q}})}}}}))}return V.version="4.17.2",V}()},56675:(e,t,r)=>{"use strict";r.r(t),r.d(t,{default:()=>A});var n=r(67294),i=r(86010),a=r(8131),s=r.n(a),c=r(70290),u=r.n(c),o=r(10412),h=r(35742),f=r(39960),l=r(80143),m=r(52263),d=["zero","one","two","few","many","other"];function p(e){return d.filter((function(t){return e.includes(t)}))}var v={locale:"en",pluralForms:p(["one","other"]),select:function(e){return 1===e?"one":"other"}};function g(){var e=(0,m.Z)().i18n.currentLocale;return(0,n.useMemo)((function(){try{return t=e,r=new Intl.PluralRules(t),{locale:t,pluralForms:p(r.resolvedOptions().pluralCategories),select:function(e){return r.select(e)}}}catch(n){return console.error('Failed to use Intl.PluralRules for locale "'+e+'".\nDocusaurus will fallback to the default (English) implementation.\nError: '+n.message+"\n"),v}var t,r}),[e])}function y(){var e=g();return{selectMessage:function(t,r){return function(e,t,r){var n=e.split("|");if(1===n.length)return n[0];n.length>r.pluralForms.length&&console.error("For locale="+r.locale+", a maximum of "+r.pluralForms.length+" plural forms are expected ("+r.pluralForms.join(",")+"), but the message contains "+n.length+": "+e);var i=r.select(t),a=r.pluralForms.indexOf(i);return n[Math.min(a,n.length-1)]}(r,t,e)}}}var R=r(66177),F=r(69688),b=r(10833),P=r(82128),j=r(95999),_=r(6278),E=r(239),x=r(7452);const 
w={searchQueryInput:"searchQueryInput_u2C7",searchVersionInput:"searchVersionInput_m0Ui",searchResultsColumn:"searchResultsColumn_JPFH",algoliaLogo:"algoliaLogo_rT1R",algoliaLogoPathFill:"algoliaLogoPathFill_WdUC",searchResultItem:"searchResultItem_Tv2o",searchResultItemHeading:"searchResultItemHeading_KbCB",searchResultItemPath:"searchResultItemPath_lhe1",searchResultItemSummary:"searchResultItemSummary_AEaO",searchQueryColumn:"searchQueryColumn_RTkw",searchVersionColumn:"searchVersionColumn_ypXd",searchLogoColumn:"searchLogoColumn_rJIA",loadingSpinner:"loadingSpinner_XVxU","loading-spin":"loading-spin_vzvp",loader:"loader_vvXV"};function O(e){var t=e.docsSearchVersionsHelpers,r=Object.entries(t.allDocsData).filter((function(e){return e[1].versions.length>1}));return n.createElement("div",{className:(0,i.Z)("col","col--3","padding-left--none",w.searchVersionColumn)},r.map((function(e){var i=e[0],a=e[1],s=r.length>1?i+": ":"";return n.createElement("select",{key:i,onChange:function(e){return t.setSearchVersion(i,e.target.value)},defaultValue:t.searchVersions[i],className:w.searchVersionInput},a.versions.map((function(e,t){return n.createElement("option",{key:t,label:""+s+e.label,value:e.name})})))})))}function N(){var e,t,r,a,c,d,p=(0,m.Z)().i18n.currentLocale,v=(0,_.L)().algolia,g=v.appId,b=v.apiKey,N=v.indexName,A=(0,E.l)(),H=(e=y().selectMessage,function(t){return e(t,(0,j.I)({id:"theme.SearchPage.documentsFound.plurals",description:'Pluralized label for "{count} documents found". Use as much plural forms (separated by "|") as your language support (see https://www.unicode.org/cldr/cldr-aux/charts/34/supplemental/language_plural_rules.html)',message:"One document found|{count} documents found"},{count:t}))}),S=(t=(0,l._r)(),r=(0,n.useState)((function(){return Object.entries(t).reduce((function(e,t){var r,n=t[0],i=t[1];return Object.assign({},e,((r={})[n]=i.versions[0].name,r))}),{})})),a=r[0],c=r[1],d=Object.values(t).some((function(e){return e.versions.length>1})),{allDocsData:t,versioningEnabled:d,searchVersions:a,setSearchVersion:function(e,t){return c((function(r){var n;return Object.assign({},r,((n={})[e]=t,n))}))}}),T=(0,R.K)(),Q=T[0],C=T[1],k={items:[],query:null,totalResults:null,totalPages:null,lastPage:null,hasMore:null,loading:null},I=(0,n.useReducer)((function(e,t){switch(t.type){case"reset":return k;case"loading":return Object.assign({},e,{loading:!0});case"update":return Q!==t.value.query?e:Object.assign({},t.value,{items:0===t.value.lastPage?t.value.items:e.items.concat(t.value.items)});case"advance":var r=e.totalPages>e.lastPage+1;return Object.assign({},e,{lastPage:r?e.lastPage+1:e.lastPage,hasMore:r});default:return e}}),k),D=I[0],q=I[1],V=u()(g,b),L=s()(V,N,{hitsPerPage:15,advancedSyntax:!0,disjunctiveFacets:["language","docusaurus_tag"]});L.on("result",(function(e){var t=e.results,r=t.query,n=t.hits,i=t.page,a=t.nbHits,s=t.nbPages;if(""!==r&&Array.isArray(n)){var c=function(e){return e.replace(/algolia-docsearch-suggestion--highlight/g,"search-result-match")},u=n.map((function(e){var t=e.url,r=e._highlightResult.hierarchy,n=e._snippetResult,i=void 0===n?{}:n,a=Object.keys(r).map((function(e){return c(r[e].value)}));return{title:a.pop(),url:A(t),summary:i.content?c(i.content.value)+"...":"",breadcrumbs:a}}));q({type:"update",value:{items:u,query:r,totalResults:a,totalPages:s,lastPage:i,hasMore:s>i+1,loading:!1}})}else q({type:"reset"})}));var B=(0,n.useState)(null),z=B[0],M=B[1],J=(0,n.useRef)(0),W=(0,n.useRef)(o.Z.canUseIntersectionObserver&&new 
IntersectionObserver((function(e){var t=e[0],r=t.isIntersecting,n=t.boundingClientRect.y;r&&J.current>n&&q({type:"advance"}),J.current=n}),{threshold:1})),U=function(){return Q?(0,j.I)({id:"theme.SearchPage.existingResultsTitle",message:'Search results for "{query}"',description:"The search page title for non-empty query"},{query:Q}):(0,j.I)({id:"theme.SearchPage.emptyResultsTitle",message:"Search the documentation",description:"The search page title for empty query"})},Z=(0,F.zX)((function(e){void 0===e&&(e=0),L.addDisjunctiveFacetRefinement("docusaurus_tag","default"),L.addDisjunctiveFacetRefinement("language",p),Object.entries(S.searchVersions).forEach((function(e){var t=e[0],r=e[1];L.addDisjunctiveFacetRefinement("docusaurus_tag","docs-"+t+"-"+r)})),L.setQuery(Q).setPage(e).search()}));return(0,n.useEffect)((function(){if(z){var e=W.current;return e?(e.observe(z),function(){return e.unobserve(z)}):function(){return!0}}}),[z]),(0,n.useEffect)((function(){q({type:"reset"}),Q&&(q({type:"loading"}),setTimeout((function(){Z()}),300))}),[Q,S.searchVersions,Z]),(0,n.useEffect)((function(){D.lastPage&&0!==D.lastPage&&Z(D.lastPage)}),[Z,D.lastPage]),n.createElement(x.Z,null,n.createElement(h.Z,null,n.createElement("title",null,(0,P.p)(U())),n.createElement("meta",{property:"robots",content:"noindex, follow"})),n.createElement("div",{className:"container margin-vert--lg"},n.createElement("h1",null,U()),n.createElement("form",{className:"row",onSubmit:function(e){return e.preventDefault()}},n.createElement("div",{className:(0,i.Z)("col",w.searchQueryColumn,{"col--9":S.versioningEnabled,"col--12":!S.versioningEnabled})},n.createElement("input",{type:"search",name:"q",className:w.searchQueryInput,placeholder:(0,j.I)({id:"theme.SearchPage.inputPlaceholder",message:"Type your search here",description:"The placeholder for search page input"}),"aria-label":(0,j.I)({id:"theme.SearchPage.inputLabel",message:"Search",description:"The ARIA label for search page input"}),onChange:function(e){return C(e.target.value)},value:Q,autoComplete:"off",autoFocus:!0})),S.versioningEnabled&&n.createElement(O,{docsSearchVersionsHelpers:S})),n.createElement("div",{className:"row"},n.createElement("div",{className:(0,i.Z)("col","col--8",w.searchResultsColumn)},!!D.totalResults&&H(D.totalResults)),n.createElement("div",{className:(0,i.Z)("col","col--4","text--right",w.searchLogoColumn)},n.createElement("a",{target:"_blank",rel:"noopener noreferrer",href:"https://www.algolia.com/","aria-label":(0,j.I)({id:"theme.SearchPage.algoliaLabel",message:"Search by Algolia",description:"The ARIA label for Algolia mention"})},n.createElement("svg",{viewBox:"0 0 168 24",className:w.algoliaLogo},n.createElement("g",{fill:"none"},n.createElement("path",{className:w.algoliaLogoPathFill,d:"M120.925 18.804c-4.386.02-4.386-3.54-4.386-4.106l-.007-13.336 2.675-.424v13.254c0 .322 0 2.358 1.718 2.364v2.248zm-10.846-2.18c.821 0 1.43-.047 1.855-.129v-2.719a6.334 6.334 0 0 0-1.574-.199 5.7 5.7 0 0 0-.897.069 2.699 2.699 0 0 0-.814.24c-.24.116-.439.28-.582.491-.15.212-.219.335-.219.656 0 .628.219.991.616 1.23s.938.362 1.615.362zm-.233-9.7c.883 0 1.629.109 2.231.328.602.218 1.088.525 1.444.915.363.396.609.922.76 1.483.157.56.232 1.175.232 1.85v6.874a32.5 32.5 0 0 1-1.868.314c-.834.123-1.772.185-2.813.185-.69 0-1.327-.069-1.895-.198a4.001 4.001 0 0 1-1.471-.636 3.085 3.085 0 0 1-.951-1.134c-.226-.465-.343-1.12-.343-1.803 0-.656.13-1.073.384-1.525a3.24 3.24 0 0 1 1.047-1.106c.445-.287.95-.492 1.532-.615a8.8 8.8 0 0 1 1.82-.185 8.404 8.404 0 0 1 
1.972.24v-.438c0-.307-.035-.6-.11-.874a1.88 1.88 0 0 0-.384-.73 1.784 1.784 0 0 0-.724-.493 3.164 3.164 0 0 0-1.143-.205c-.616 0-1.177.075-1.69.164a7.735 7.735 0 0 0-1.26.307l-.321-2.192c.335-.117.834-.233 1.478-.349a10.98 10.98 0 0 1 2.073-.178zm52.842 9.626c.822 0 1.43-.048 1.854-.13V13.7a6.347 6.347 0 0 0-1.574-.199c-.294 0-.595.021-.896.069a2.7 2.7 0 0 0-.814.24 1.46 1.46 0 0 0-.582.491c-.15.212-.218.335-.218.656 0 .628.218.991.615 1.23.404.245.938.362 1.615.362zm-.226-9.694c.883 0 1.629.108 2.231.327.602.219 1.088.526 1.444.915.355.39.609.923.759 1.483a6.8 6.8 0 0 1 .233 1.852v6.873c-.41.088-1.034.19-1.868.314-.834.123-1.772.184-2.813.184-.69 0-1.327-.068-1.895-.198a4.001 4.001 0 0 1-1.471-.635 3.085 3.085 0 0 1-.951-1.134c-.226-.465-.343-1.12-.343-1.804 0-.656.13-1.073.384-1.524.26-.45.608-.82 1.047-1.107.445-.286.95-.491 1.532-.614a8.803 8.803 0 0 1 2.751-.13c.329.034.671.096 1.04.185v-.437a3.3 3.3 0 0 0-.109-.875 1.873 1.873 0 0 0-.384-.731 1.784 1.784 0 0 0-.724-.492 3.165 3.165 0 0 0-1.143-.205c-.616 0-1.177.075-1.69.164a7.75 7.75 0 0 0-1.26.307l-.321-2.193c.335-.116.834-.232 1.478-.348a11.633 11.633 0 0 1 2.073-.177zm-8.034-1.271a1.626 1.626 0 0 1-1.628-1.62c0-.895.725-1.62 1.628-1.62.904 0 1.63.725 1.63 1.62 0 .895-.733 1.62-1.63 1.62zm1.348 13.22h-2.689V7.27l2.69-.423v11.956zm-4.714 0c-4.386.02-4.386-3.54-4.386-4.107l-.008-13.336 2.676-.424v13.254c0 .322 0 2.358 1.718 2.364v2.248zm-8.698-5.903c0-1.156-.253-2.119-.746-2.788-.493-.677-1.183-1.01-2.067-1.01-.882 0-1.574.333-2.065 1.01-.493.676-.733 1.632-.733 2.788 0 1.168.246 1.953.74 2.63.492.683 1.183 1.018 2.066 1.018.882 0 1.574-.342 2.067-1.019.492-.683.738-1.46.738-2.63zm2.737-.007c0 .902-.13 1.584-.397 2.33a5.52 5.52 0 0 1-1.128 1.906 4.986 4.986 0 0 1-1.752 1.223c-.685.286-1.739.45-2.265.45-.528-.006-1.574-.157-2.252-.45a5.096 5.096 0 0 1-1.744-1.223c-.487-.527-.863-1.162-1.137-1.906a6.345 6.345 0 0 1-.41-2.33c0-.902.123-1.77.397-2.508a5.554 5.554 0 0 1 1.15-1.892 5.133 5.133 0 0 1 1.75-1.216c.679-.287 1.425-.423 2.232-.423.808 0 1.553.142 2.237.423a4.88 4.88 0 0 1 1.753 1.216 5.644 5.644 0 0 1 1.135 1.892c.287.738.431 1.606.431 2.508zm-20.138 0c0 1.12.246 2.363.738 2.882.493.52 1.13.78 1.91.78.424 0 .828-.062 1.204-.178.377-.116.677-.253.917-.417V9.33a10.476 10.476 0 0 0-1.766-.226c-.971-.028-1.71.37-2.23 1.004-.513.636-.773 1.75-.773 2.788zm7.438 5.274c0 1.824-.466 3.156-1.404 4.004-.936.846-2.367 1.27-4.296 1.27-.705 0-2.17-.137-3.34-.396l.431-2.118c.98.205 2.272.26 2.95.26 1.074 0 1.84-.219 2.299-.656.459-.437.684-1.086.684-1.948v-.437a8.07 8.07 0 0 1-1.047.397c-.43.13-.93.198-1.492.198-.739 0-1.41-.116-2.018-.349a4.206 4.206 0 0 1-1.567-1.025c-.431-.45-.774-1.017-1.013-1.694-.24-.677-.363-1.885-.363-2.773 0-.834.13-1.88.384-2.577.26-.696.629-1.298 1.129-1.796.493-.498 1.095-.881 1.8-1.162a6.605 6.605 0 0 1 2.428-.457c.87 0 1.67.109 2.45.24.78.129 1.444.265 1.985.415V18.17zM6.972 6.677v1.627c-.712-.446-1.52-.67-2.425-.67-.585 0-1.045.13-1.38.391a1.24 1.24 0 0 0-.502 1.03c0 .425.164.765.494 1.02.33.256.835.532 1.516.83.447.192.795.356 1.045.495.25.138.537.332.862.582.324.25.563.548.718.894.154.345.23.741.23 1.188 0 .947-.334 1.691-1.004 2.234-.67.542-1.537.814-2.601.814-1.18 0-2.16-.229-2.936-.686v-1.708c.84.628 1.814.942 2.92.942.585 0 1.048-.136 1.388-.407.34-.271.51-.646.51-1.125 0-.287-.1-.55-.302-.79-.203-.24-.42-.42-.655-.542-.234-.123-.585-.29-1.053-.503a61.27 61.27 0 0 1-.582-.271 13.67 13.67 0 0 1-.55-.287 4.275 4.275 0 0 1-.567-.351 6.92 6.92 0 0 
1-.455-.4c-.18-.17-.31-.34-.39-.51-.08-.17-.155-.37-.224-.598a2.553 2.553 0 0 1-.104-.742c0-.915.333-1.638.998-2.17.664-.532 1.523-.798 2.576-.798.968 0 1.793.17 2.473.51zm7.468 5.696v-.287c-.022-.607-.187-1.088-.495-1.444-.309-.357-.75-.535-1.324-.535-.532 0-.99.194-1.373.583-.382.388-.622.949-.717 1.683h3.909zm1.005 2.792v1.404c-.596.34-1.383.51-2.362.51-1.255 0-2.255-.377-3-1.132-.744-.755-1.116-1.744-1.116-2.968 0-1.297.34-2.316 1.021-3.055.68-.74 1.548-1.11 2.6-1.11 1.033 0 1.852.323 2.458.966.606.644.91 1.572.91 2.784 0 .33-.033.676-.096 1.038h-5.314c.107.702.405 1.239.894 1.611.49.372 1.106.558 1.85.558.862 0 1.58-.202 2.155-.606zm6.605-1.77h-1.212c-.596 0-1.045.116-1.349.35-.303.234-.454.532-.454.894 0 .372.117.664.35.877.235.213.575.32 1.022.32.51 0 .912-.142 1.204-.424.293-.281.44-.651.44-1.108v-.91zm-4.068-2.554V9.325c.627-.361 1.457-.542 2.489-.542 2.116 0 3.175 1.026 3.175 3.08V17h-1.548v-.957c-.415.68-1.143 1.02-2.186 1.02-.766 0-1.38-.22-1.843-.661-.462-.442-.694-1.003-.694-1.684 0-.776.293-1.38.878-1.81.585-.431 1.404-.647 2.457-.647h1.34V11.8c0-.554-.133-.971-.399-1.253-.266-.282-.707-.423-1.324-.423a4.07 4.07 0 0 0-2.345.718zm9.333-1.93v1.42c.394-1 1.101-1.5 2.123-1.5.148 0 .313.016.494.048v1.531a1.885 1.885 0 0 0-.75-.143c-.542 0-.989.24-1.34.718-.351.479-.527 1.048-.527 1.707V17h-1.563V8.91h1.563zm5.01 4.084c.022.82.272 1.492.75 2.019.479.526 1.15.79 2.01.79.639 0 1.235-.176 1.788-.527v1.404c-.521.319-1.186.479-1.995.479-1.265 0-2.276-.4-3.031-1.197-.755-.798-1.133-1.792-1.133-2.984 0-1.16.38-2.151 1.14-2.975.761-.825 1.79-1.237 3.088-1.237.702 0 1.346.149 1.93.447v1.436a3.242 3.242 0 0 0-1.77-.495c-.84 0-1.513.266-2.019.798-.505.532-.758 1.213-.758 2.042zM40.24 5.72v4.579c.458-1 1.293-1.5 2.505-1.5.787 0 1.42.245 1.899.734.479.49.718 1.17.718 2.042V17h-1.564v-5.106c0-.553-.14-.98-.422-1.284-.282-.303-.652-.455-1.11-.455-.531 0-1.002.202-1.411.606-.41.405-.615 1.022-.615 1.851V17h-1.563V5.72h1.563zm14.966 10.02c.596 0 1.096-.253 1.5-.758.404-.506.606-1.157.606-1.955 0-.915-.202-1.62-.606-2.114-.404-.495-.92-.742-1.548-.742-.553 0-1.05.224-1.491.67-.442.447-.662 1.133-.662 2.058 0 .958.212 1.67.638 2.138.425.469.946.703 1.563.703zM53.004 5.72v4.42c.574-.894 1.388-1.341 2.44-1.341 1.022 0 1.857.383 2.506 1.149.649.766.973 1.781.973 3.047 0 1.138-.309 2.109-.925 2.912-.617.803-1.463 1.205-2.537 1.205-1.075 0-1.894-.447-2.457-1.34V17h-1.58V5.72h1.58zm9.908 11.104l-3.223-7.913h1.739l1.005 2.632 1.26 3.415c.096-.32.48-1.458 1.15-3.415l.909-2.632h1.66l-2.92 7.866c-.777 2.074-1.963 3.11-3.559 3.11a2.92 2.92 0 0 1-.734-.079v-1.34c.17.042.351.064.543.064 1.032 0 1.755-.57 2.17-1.708z"}),n.createElement("path",{fill:"#5468FF",d:"M78.988.938h16.594a2.968 2.968 0 0 1 2.966 2.966V20.5a2.967 2.967 0 0 1-2.966 2.964H78.988a2.967 2.967 0 0 1-2.966-2.964V3.897A2.961 2.961 0 0 1 78.988.938z"}),n.createElement("path",{fill:"white",d:"M89.632 5.967v-.772a.978.978 0 0 0-.978-.977h-2.28a.978.978 0 0 0-.978.977v.793c0 .088.082.15.171.13a7.127 7.127 0 0 1 1.984-.28c.65 0 1.295.088 1.917.259.082.02.164-.04.164-.13m-6.248 1.01l-.39-.389a.977.977 0 0 0-1.382 0l-.465.465a.973.973 0 0 0 0 1.38l.383.383c.062.061.15.047.205-.014.226-.307.472-.601.746-.874.281-.28.568-.526.883-.751.068-.042.075-.137.02-.2m4.16 2.453v3.341c0 .096.104.165.192.117l2.97-1.537c.068-.034.089-.117.055-.184a3.695 3.695 0 0 0-3.08-1.866c-.068 0-.136.054-.136.13m0 8.048a4.489 4.489 0 0 1-4.49-4.482 4.488 4.488 0 0 1 4.49-4.482 4.488 4.488 0 0 1 4.489 4.482 4.484 4.484 0 0 1-4.49 4.482m0-10.85a6.363 6.363 0 1 0 0 12.729 6.37 6.37 
0 0 0 6.372-6.368 6.358 6.358 0 0 0-6.371-6.36"})))))),D.items.length>0?n.createElement("main",null,D.items.map((function(e,t){var r=e.title,a=e.url,s=e.summary,c=e.breadcrumbs;return n.createElement("article",{key:t,className:w.searchResultItem},n.createElement("h2",{className:w.searchResultItemHeading},n.createElement(f.Z,{to:a,dangerouslySetInnerHTML:{__html:r}})),c.length>0&&n.createElement("nav",{"aria-label":"breadcrumbs"},n.createElement("ul",{className:(0,i.Z)("breadcrumbs",w.searchResultItemPath)},c.map((function(e,t){return n.createElement("li",{key:t,className:"breadcrumbs__item",dangerouslySetInnerHTML:{__html:e}})})))),s&&n.createElement("p",{className:w.searchResultItemSummary,dangerouslySetInnerHTML:{__html:s}}))}))):[Q&&!D.loading&&n.createElement("p",{key:"no-results"},n.createElement(j.Z,{id:"theme.SearchPage.noResultsText",description:"The paragraph for empty search result"},"No results were found")),!!D.loading&&n.createElement("div",{key:"spinner",className:w.loadingSpinner})],D.hasMore&&n.createElement("div",{className:w.loader,ref:M},n.createElement(j.Z,{id:"theme.SearchPage.fetchingNewResults",description:"The paragraph for fetching new search results"},"Fetching new results..."))))}function A(){return n.createElement(b.FG,{className:"search-page-wrapper"},n.createElement(N,null))}}}]); \ No newline at end of file diff --git a/assets/js/1a4e3797.c063a301.js b/assets/js/1a4e3797.c063a301.js new file mode 100644 index 00000000000..00d450e1534 --- /dev/null +++ b/assets/js/1a4e3797.c063a301.js @@ -0,0 +1,2 @@ +/*! For license information please see 1a4e3797.c063a301.js.LICENSE.txt */ +(self.webpackChunk_cumulus_website=self.webpackChunk_cumulus_website||[]).push([[97920],{17331:e=>{function t(){this._events=this._events||{},this._maxListeners=this._maxListeners||void 0}function r(e){return"function"==typeof e}function n(e){return"object"==typeof e&&null!==e}function i(e){return void 0===e}e.exports=t,t.prototype._events=void 0,t.prototype._maxListeners=void 0,t.defaultMaxListeners=10,t.prototype.setMaxListeners=function(e){if("number"!=typeof e||e<0||isNaN(e))throw TypeError("n must be a positive number");return this._maxListeners=e,this},t.prototype.emit=function(e){var t,a,s,c,u,o;if(this._events||(this._events={}),"error"===e&&(!this._events.error||n(this._events.error)&&!this._events.error.length)){if((t=arguments[1])instanceof Error)throw t;var h=new Error('Uncaught, unspecified "error" event. ('+t+")");throw h.context=t,h}if(i(a=this._events[e]))return!1;if(r(a))switch(arguments.length){case 1:a.call(this);break;case 2:a.call(this,arguments[1]);break;case 3:a.call(this,arguments[1],arguments[2]);break;default:c=Array.prototype.slice.call(arguments,1),a.apply(this,c)}else if(n(a))for(c=Array.prototype.slice.call(arguments,1),s=(o=a.slice()).length,u=0;u0&&this._events[e].length>s&&(this._events[e].warned=!0,console.error("(node) warning: possible EventEmitter memory leak detected. %d listeners added. 
Use emitter.setMaxListeners() to increase limit.",this._events[e].length),"function"==typeof console.trace&&console.trace()),this},t.prototype.on=t.prototype.addListener,t.prototype.once=function(e,t){if(!r(t))throw TypeError("listener must be a function");var n=!1;function i(){this.removeListener(e,i),n||(n=!0,t.apply(this,arguments))}return i.listener=t,this.on(e,i),this},t.prototype.removeListener=function(e,t){var i,a,s,c;if(!r(t))throw TypeError("listener must be a function");if(!this._events||!this._events[e])return this;if(s=(i=this._events[e]).length,a=-1,i===t||r(i.listener)&&i.listener===t)delete this._events[e],this._events.removeListener&&this.emit("removeListener",e,t);else if(n(i)){for(c=s;c-- >0;)if(i[c]===t||i[c].listener&&i[c].listener===t){a=c;break}if(a<0)return this;1===i.length?(i.length=0,delete this._events[e]):i.splice(a,1),this._events.removeListener&&this.emit("removeListener",e,t)}return this},t.prototype.removeAllListeners=function(e){var t,n;if(!this._events)return this;if(!this._events.removeListener)return 0===arguments.length?this._events={}:this._events[e]&&delete this._events[e],this;if(0===arguments.length){for(t in this._events)"removeListener"!==t&&this.removeAllListeners(t);return this.removeAllListeners("removeListener"),this._events={},this}if(r(n=this._events[e]))this.removeListener(e,n);else if(n)for(;n.length;)this.removeListener(e,n[n.length-1]);return delete this._events[e],this},t.prototype.listeners=function(e){return this._events&&this._events[e]?r(this._events[e])?[this._events[e]]:this._events[e].slice():[]},t.prototype.listenerCount=function(e){if(this._events){var t=this._events[e];if(r(t))return 1;if(t)return t.length}return 0},t.listenerCount=function(e,t){return e.listenerCount(t)}},8131:(e,t,r)=>{"use strict";var n=r(49374),i=r(17775),a=r(23076);function s(e,t,r){return new n(e,t,r)}s.version=r(24336),s.AlgoliaSearchHelper=n,s.SearchParameters=i,s.SearchResults=a,e.exports=s},68078:(e,t,r)=>{"use strict";var n=r(17331);function i(e,t){this.main=e,this.fn=t,this.lastResults=null}r(14853)(i,n),i.prototype.detach=function(){this.removeAllListeners(),this.main.detachDerivedHelper(this)},i.prototype.getModifiedState=function(e){return this.fn(e)},e.exports=i},82437:(e,t,r)=>{"use strict";var n=r(52344),i=r(49803),a=r(90116),s={addRefinement:function(e,t,r){if(s.isRefined(e,t,r))return e;var i=""+r,a=e[t]?e[t].concat(i):[i],c={};return c[t]=a,n({},c,e)},removeRefinement:function(e,t,r){if(void 0===r)return s.clearRefinement(e,(function(e,r){return t===r}));var n=""+r;return s.clearRefinement(e,(function(e,r){return t===r&&n===e}))},toggleRefinement:function(e,t,r){if(void 0===r)throw new Error("toggleRefinement should be used with a value");return s.isRefined(e,t,r)?s.removeRefinement(e,t,r):s.addRefinement(e,t,r)},clearRefinement:function(e,t,r){if(void 0===t)return a(e)?{}:e;if("string"==typeof t)return i(e,[t]);if("function"==typeof t){var n=!1,s=Object.keys(e).reduce((function(i,a){var s=e[a]||[],c=s.filter((function(e){return!t(e,a,r)}));return c.length!==s.length&&(n=!0),i[a]=c,i}),{});return n?s:e}},isRefined:function(e,t,r){var n=Boolean(e[t])&&e[t].length>0;if(void 0===r||!n)return n;var i=""+r;return-1!==e[t].indexOf(i)}};e.exports=s},17775:(e,t,r)=>{"use strict";var n=r(60185),i=r(52344),a=r(22686),s=r(7888),c=r(28023),u=r(49803),o=r(90116),h=r(46801),f=r(82437);function l(e,t){return Array.isArray(e)&&Array.isArray(t)?e.length===t.length&&e.every((function(e,r){return l(t[r],e)})):e===t}function m(e){var 
t=e?m._parseNumbers(e):{};void 0===t.userToken||h(t.userToken)||console.warn("[algoliasearch-helper] The `userToken` parameter is invalid. This can lead to wrong analytics.\n - Format: [a-zA-Z0-9_-]{1,64}"),this.facets=t.facets||[],this.disjunctiveFacets=t.disjunctiveFacets||[],this.hierarchicalFacets=t.hierarchicalFacets||[],this.facetsRefinements=t.facetsRefinements||{},this.facetsExcludes=t.facetsExcludes||{},this.disjunctiveFacetsRefinements=t.disjunctiveFacetsRefinements||{},this.numericRefinements=t.numericRefinements||{},this.tagRefinements=t.tagRefinements||[],this.hierarchicalFacetsRefinements=t.hierarchicalFacetsRefinements||{};var r=this;Object.keys(t).forEach((function(e){var n=-1!==m.PARAMETERS.indexOf(e),i=void 0!==t[e];!n&&i&&(r[e]=t[e])}))}m.PARAMETERS=Object.keys(new m),m._parseNumbers=function(e){if(e instanceof m)return e;var t={};if(["aroundPrecision","aroundRadius","getRankingInfo","minWordSizefor2Typos","minWordSizefor1Typo","page","maxValuesPerFacet","distinct","minimumAroundRadius","hitsPerPage","minProximity"].forEach((function(r){var n=e[r];if("string"==typeof n){var i=parseFloat(n);t[r]=isNaN(i)?n:i}})),Array.isArray(e.insideBoundingBox)&&(t.insideBoundingBox=e.insideBoundingBox.map((function(e){return Array.isArray(e)?e.map((function(e){return parseFloat(e)})):e}))),e.numericRefinements){var r={};Object.keys(e.numericRefinements).forEach((function(t){var n=e.numericRefinements[t]||{};r[t]={},Object.keys(n).forEach((function(e){var i=n[e].map((function(e){return Array.isArray(e)?e.map((function(e){return"string"==typeof e?parseFloat(e):e})):"string"==typeof e?parseFloat(e):e}));r[t][e]=i}))})),t.numericRefinements=r}return n({},e,t)},m.make=function(e){var t=new m(e);return(e.hierarchicalFacets||[]).forEach((function(e){if(e.rootPath){var r=t.getHierarchicalRefinement(e.name);r.length>0&&0!==r[0].indexOf(e.rootPath)&&(t=t.clearRefinements(e.name)),0===(r=t.getHierarchicalRefinement(e.name)).length&&(t=t.toggleHierarchicalFacetRefinement(e.name,e.rootPath))}})),t},m.validate=function(e,t){var r=t||{};return e.tagFilters&&r.tagRefinements&&r.tagRefinements.length>0?new Error("[Tags] Cannot switch from the managed tag API to the advanced API. It is probably an error, if it is really what you want, you should first clear the tags with clearTags method."):e.tagRefinements.length>0&&r.tagFilters?new Error("[Tags] Cannot switch from the advanced tag API to the managed API. It is probably an error, if it is not, you should first clear the tags with clearTags method."):e.numericFilters&&r.numericRefinements&&o(r.numericRefinements)?new Error("[Numeric filters] Can't switch from the advanced to the managed API. It is probably an error, if this is really what you want, you have to first clear the numeric filters."):o(e.numericRefinements)&&r.numericFilters?new Error("[Numeric filters] Can't switch from the managed API to the advanced. 
It is probably an error, if this is really what you want, you have to first clear the numeric filters."):null},m.prototype={constructor:m,clearRefinements:function(e){var t={numericRefinements:this._clearNumericRefinements(e),facetsRefinements:f.clearRefinement(this.facetsRefinements,e,"conjunctiveFacet"),facetsExcludes:f.clearRefinement(this.facetsExcludes,e,"exclude"),disjunctiveFacetsRefinements:f.clearRefinement(this.disjunctiveFacetsRefinements,e,"disjunctiveFacet"),hierarchicalFacetsRefinements:f.clearRefinement(this.hierarchicalFacetsRefinements,e,"hierarchicalFacet")};return t.numericRefinements===this.numericRefinements&&t.facetsRefinements===this.facetsRefinements&&t.facetsExcludes===this.facetsExcludes&&t.disjunctiveFacetsRefinements===this.disjunctiveFacetsRefinements&&t.hierarchicalFacetsRefinements===this.hierarchicalFacetsRefinements?this:this.setQueryParameters(t)},clearTags:function(){return void 0===this.tagFilters&&0===this.tagRefinements.length?this:this.setQueryParameters({tagFilters:void 0,tagRefinements:[]})},setIndex:function(e){return e===this.index?this:this.setQueryParameters({index:e})},setQuery:function(e){return e===this.query?this:this.setQueryParameters({query:e})},setPage:function(e){return e===this.page?this:this.setQueryParameters({page:e})},setFacets:function(e){return this.setQueryParameters({facets:e})},setDisjunctiveFacets:function(e){return this.setQueryParameters({disjunctiveFacets:e})},setHitsPerPage:function(e){return this.hitsPerPage===e?this:this.setQueryParameters({hitsPerPage:e})},setTypoTolerance:function(e){return this.typoTolerance===e?this:this.setQueryParameters({typoTolerance:e})},addNumericRefinement:function(e,t,r){var i=c(r);if(this.isNumericRefined(e,t,i))return this;var a=n({},this.numericRefinements);return a[e]=n({},a[e]),a[e][t]?(a[e][t]=a[e][t].slice(),a[e][t].push(i)):a[e][t]=[i],this.setQueryParameters({numericRefinements:a})},getConjunctiveRefinements:function(e){return this.isConjunctiveFacet(e)&&this.facetsRefinements[e]||[]},getDisjunctiveRefinements:function(e){return this.isDisjunctiveFacet(e)&&this.disjunctiveFacetsRefinements[e]||[]},getHierarchicalRefinement:function(e){return this.hierarchicalFacetsRefinements[e]||[]},getExcludeRefinements:function(e){return this.isConjunctiveFacet(e)&&this.facetsExcludes[e]||[]},removeNumericRefinement:function(e,t,r){var n=r;return void 0!==n?this.isNumericRefined(e,t,n)?this.setQueryParameters({numericRefinements:this._clearNumericRefinements((function(r,i){return i===e&&r.op===t&&l(r.val,c(n))}))}):this:void 0!==t?this.isNumericRefined(e,t)?this.setQueryParameters({numericRefinements:this._clearNumericRefinements((function(r,n){return n===e&&r.op===t}))}):this:this.isNumericRefined(e)?this.setQueryParameters({numericRefinements:this._clearNumericRefinements((function(t,r){return r===e}))}):this},getNumericRefinements:function(e){return this.numericRefinements[e]||{}},getNumericRefinement:function(e,t){return this.numericRefinements[e]&&this.numericRefinements[e][t]},_clearNumericRefinements:function(e){if(void 0===e)return o(this.numericRefinements)?{}:this.numericRefinements;if("string"==typeof e)return u(this.numericRefinements,[e]);if("function"==typeof e){var t=!1,r=this.numericRefinements,n=Object.keys(r).reduce((function(n,i){var a=r[i],s={};return a=a||{},Object.keys(a).forEach((function(r){var n=a[r]||[],c=[];n.forEach((function(t){e({val:t,op:r},i,"numeric")||c.push(t)})),c.length!==n.length&&(t=!0),s[r]=c})),n[i]=s,n}),{});return 
t?n:this.numericRefinements}},addFacet:function(e){return this.isConjunctiveFacet(e)?this:this.setQueryParameters({facets:this.facets.concat([e])})},addDisjunctiveFacet:function(e){return this.isDisjunctiveFacet(e)?this:this.setQueryParameters({disjunctiveFacets:this.disjunctiveFacets.concat([e])})},addHierarchicalFacet:function(e){if(this.isHierarchicalFacet(e.name))throw new Error("Cannot declare two hierarchical facets with the same name: `"+e.name+"`");return this.setQueryParameters({hierarchicalFacets:this.hierarchicalFacets.concat([e])})},addFacetRefinement:function(e,t){if(!this.isConjunctiveFacet(e))throw new Error(e+" is not defined in the facets attribute of the helper configuration");return f.isRefined(this.facetsRefinements,e,t)?this:this.setQueryParameters({facetsRefinements:f.addRefinement(this.facetsRefinements,e,t)})},addExcludeRefinement:function(e,t){if(!this.isConjunctiveFacet(e))throw new Error(e+" is not defined in the facets attribute of the helper configuration");return f.isRefined(this.facetsExcludes,e,t)?this:this.setQueryParameters({facetsExcludes:f.addRefinement(this.facetsExcludes,e,t)})},addDisjunctiveFacetRefinement:function(e,t){if(!this.isDisjunctiveFacet(e))throw new Error(e+" is not defined in the disjunctiveFacets attribute of the helper configuration");return f.isRefined(this.disjunctiveFacetsRefinements,e,t)?this:this.setQueryParameters({disjunctiveFacetsRefinements:f.addRefinement(this.disjunctiveFacetsRefinements,e,t)})},addTagRefinement:function(e){if(this.isTagRefined(e))return this;var t={tagRefinements:this.tagRefinements.concat(e)};return this.setQueryParameters(t)},removeFacet:function(e){return this.isConjunctiveFacet(e)?this.clearRefinements(e).setQueryParameters({facets:this.facets.filter((function(t){return t!==e}))}):this},removeDisjunctiveFacet:function(e){return this.isDisjunctiveFacet(e)?this.clearRefinements(e).setQueryParameters({disjunctiveFacets:this.disjunctiveFacets.filter((function(t){return t!==e}))}):this},removeHierarchicalFacet:function(e){return this.isHierarchicalFacet(e)?this.clearRefinements(e).setQueryParameters({hierarchicalFacets:this.hierarchicalFacets.filter((function(t){return t.name!==e}))}):this},removeFacetRefinement:function(e,t){if(!this.isConjunctiveFacet(e))throw new Error(e+" is not defined in the facets attribute of the helper configuration");return f.isRefined(this.facetsRefinements,e,t)?this.setQueryParameters({facetsRefinements:f.removeRefinement(this.facetsRefinements,e,t)}):this},removeExcludeRefinement:function(e,t){if(!this.isConjunctiveFacet(e))throw new Error(e+" is not defined in the facets attribute of the helper configuration");return f.isRefined(this.facetsExcludes,e,t)?this.setQueryParameters({facetsExcludes:f.removeRefinement(this.facetsExcludes,e,t)}):this},removeDisjunctiveFacetRefinement:function(e,t){if(!this.isDisjunctiveFacet(e))throw new Error(e+" is not defined in the disjunctiveFacets attribute of the helper configuration");return f.isRefined(this.disjunctiveFacetsRefinements,e,t)?this.setQueryParameters({disjunctiveFacetsRefinements:f.removeRefinement(this.disjunctiveFacetsRefinements,e,t)}):this},removeTagRefinement:function(e){if(!this.isTagRefined(e))return this;var t={tagRefinements:this.tagRefinements.filter((function(t){return t!==e}))};return this.setQueryParameters(t)},toggleRefinement:function(e,t){return this.toggleFacetRefinement(e,t)},toggleFacetRefinement:function(e,t){if(this.isHierarchicalFacet(e))return 
this.toggleHierarchicalFacetRefinement(e,t);if(this.isConjunctiveFacet(e))return this.toggleConjunctiveFacetRefinement(e,t);if(this.isDisjunctiveFacet(e))return this.toggleDisjunctiveFacetRefinement(e,t);throw new Error("Cannot refine the undeclared facet "+e+"; it should be added to the helper options facets, disjunctiveFacets or hierarchicalFacets")},toggleConjunctiveFacetRefinement:function(e,t){if(!this.isConjunctiveFacet(e))throw new Error(e+" is not defined in the facets attribute of the helper configuration");return this.setQueryParameters({facetsRefinements:f.toggleRefinement(this.facetsRefinements,e,t)})},toggleExcludeFacetRefinement:function(e,t){if(!this.isConjunctiveFacet(e))throw new Error(e+" is not defined in the facets attribute of the helper configuration");return this.setQueryParameters({facetsExcludes:f.toggleRefinement(this.facetsExcludes,e,t)})},toggleDisjunctiveFacetRefinement:function(e,t){if(!this.isDisjunctiveFacet(e))throw new Error(e+" is not defined in the disjunctiveFacets attribute of the helper configuration");return this.setQueryParameters({disjunctiveFacetsRefinements:f.toggleRefinement(this.disjunctiveFacetsRefinements,e,t)})},toggleHierarchicalFacetRefinement:function(e,t){if(!this.isHierarchicalFacet(e))throw new Error(e+" is not defined in the hierarchicalFacets attribute of the helper configuration");var r=this._getHierarchicalFacetSeparator(this.getHierarchicalFacetByName(e)),n={};return void 0!==this.hierarchicalFacetsRefinements[e]&&this.hierarchicalFacetsRefinements[e].length>0&&(this.hierarchicalFacetsRefinements[e][0]===t||0===this.hierarchicalFacetsRefinements[e][0].indexOf(t+r))?-1===t.indexOf(r)?n[e]=[]:n[e]=[t.slice(0,t.lastIndexOf(r))]:n[e]=[t],this.setQueryParameters({hierarchicalFacetsRefinements:i({},n,this.hierarchicalFacetsRefinements)})},addHierarchicalFacetRefinement:function(e,t){if(this.isHierarchicalFacetRefined(e))throw new Error(e+" is already refined.");if(!this.isHierarchicalFacet(e))throw new Error(e+" is not defined in the hierarchicalFacets attribute of the helper configuration.");var r={};return r[e]=[t],this.setQueryParameters({hierarchicalFacetsRefinements:i({},r,this.hierarchicalFacetsRefinements)})},removeHierarchicalFacetRefinement:function(e){if(!this.isHierarchicalFacetRefined(e))return this;var t={};return t[e]=[],this.setQueryParameters({hierarchicalFacetsRefinements:i({},t,this.hierarchicalFacetsRefinements)})},toggleTagRefinement:function(e){return this.isTagRefined(e)?this.removeTagRefinement(e):this.addTagRefinement(e)},isDisjunctiveFacet:function(e){return this.disjunctiveFacets.indexOf(e)>-1},isHierarchicalFacet:function(e){return void 0!==this.getHierarchicalFacetByName(e)},isConjunctiveFacet:function(e){return this.facets.indexOf(e)>-1},isFacetRefined:function(e,t){return!!this.isConjunctiveFacet(e)&&f.isRefined(this.facetsRefinements,e,t)},isExcludeRefined:function(e,t){return!!this.isConjunctiveFacet(e)&&f.isRefined(this.facetsExcludes,e,t)},isDisjunctiveFacetRefined:function(e,t){return!!this.isDisjunctiveFacet(e)&&f.isRefined(this.disjunctiveFacetsRefinements,e,t)},isHierarchicalFacetRefined:function(e,t){if(!this.isHierarchicalFacet(e))return!1;var r=this.getHierarchicalRefinement(e);return t?-1!==r.indexOf(t):r.length>0},isNumericRefined:function(e,t,r){if(void 0===r&&void 0===t)return Boolean(this.numericRefinements[e]);var n=this.numericRefinements[e]&&void 0!==this.numericRefinements[e][t];if(void 0===r||!n)return n;var i,a,u=c(r),o=void 
0!==(i=this.numericRefinements[e][t],a=u,s(i,(function(e){return l(e,a)})));return n&&o},isTagRefined:function(e){return-1!==this.tagRefinements.indexOf(e)},getRefinedDisjunctiveFacets:function(){var e=this,t=a(Object.keys(this.numericRefinements).filter((function(t){return Object.keys(e.numericRefinements[t]).length>0})),this.disjunctiveFacets);return Object.keys(this.disjunctiveFacetsRefinements).filter((function(t){return e.disjunctiveFacetsRefinements[t].length>0})).concat(t).concat(this.getRefinedHierarchicalFacets())},getRefinedHierarchicalFacets:function(){var e=this;return a(this.hierarchicalFacets.map((function(e){return e.name})),Object.keys(this.hierarchicalFacetsRefinements).filter((function(t){return e.hierarchicalFacetsRefinements[t].length>0})))},getUnrefinedDisjunctiveFacets:function(){var e=this.getRefinedDisjunctiveFacets();return this.disjunctiveFacets.filter((function(t){return-1===e.indexOf(t)}))},managedParameters:["index","facets","disjunctiveFacets","facetsRefinements","hierarchicalFacets","facetsExcludes","disjunctiveFacetsRefinements","numericRefinements","tagRefinements","hierarchicalFacetsRefinements"],getQueryParams:function(){var e=this.managedParameters,t={},r=this;return Object.keys(this).forEach((function(n){var i=r[n];-1===e.indexOf(n)&&void 0!==i&&(t[n]=i)})),t},setQueryParameter:function(e,t){if(this[e]===t)return this;var r={};return r[e]=t,this.setQueryParameters(r)},setQueryParameters:function(e){if(!e)return this;var t=m.validate(this,e);if(t)throw t;var r=this,n=m._parseNumbers(e),i=Object.keys(this).reduce((function(e,t){return e[t]=r[t],e}),{}),a=Object.keys(n).reduce((function(e,t){var r=void 0!==e[t],i=void 0!==n[t];return r&&!i?u(e,[t]):(i&&(e[t]=n[t]),e)}),i);return new this.constructor(a)},resetPage:function(){return void 0===this.page?this:this.setPage(0)},_getHierarchicalFacetSortBy:function(e){return e.sortBy||["isRefined:desc","name:asc"]},_getHierarchicalFacetSeparator:function(e){return e.separator||" > "},_getHierarchicalRootPath:function(e){return e.rootPath||null},_getHierarchicalShowParentLevel:function(e){return"boolean"!=typeof e.showParentLevel||e.showParentLevel},getHierarchicalFacetByName:function(e){return s(this.hierarchicalFacets,(function(t){return t.name===e}))},getHierarchicalFacetBreadcrumb:function(e){if(!this.isHierarchicalFacet(e))return[];var t=this.getHierarchicalRefinement(e)[0];if(!t)return[];var r=this._getHierarchicalFacetSeparator(this.getHierarchicalFacetByName(e));return t.split(r).map((function(e){return e.trim()}))},toString:function(){return JSON.stringify(this,null,2)}},e.exports=m},10210:(e,t,r)=>{"use strict";e.exports=function(e){return function(t,r){var s=e.hierarchicalFacets[r],o=e.hierarchicalFacetsRefinements[s.name]&&e.hierarchicalFacetsRefinements[s.name][0]||"",h=e._getHierarchicalFacetSeparator(s),f=e._getHierarchicalRootPath(s),l=e._getHierarchicalShowParentLevel(s),m=a(e._getHierarchicalFacetSortBy(s)),d=t.every((function(e){return e.exhaustive})),p=function(e,t,r,a,s){return function(o,h,f){var l=o;if(f>0){var m=0;for(l=o;m{"use strict";var n=r(60185),i=r(52344),a=r(42148),s=r(74587),c=r(7888),u=r(69725),o=r(82293),h=r(94039),f=h.escapeFacetValue,l=h.unescapeFacetValue,m=r(10210);function d(e){var t={};return e.forEach((function(e,r){t[e]=r})),t}function p(e,t,r){t&&t[r]&&(e.stats=t[r])}function v(e,t,r){var a=t[0];this._rawResults=t;var 
o=this;Object.keys(a).forEach((function(e){o[e]=a[e]})),Object.keys(r||{}).forEach((function(e){o[e]=r[e]})),this.processingTimeMS=t.reduce((function(e,t){return void 0===t.processingTimeMS?e:e+t.processingTimeMS}),0),this.disjunctiveFacets=[],this.hierarchicalFacets=e.hierarchicalFacets.map((function(){return[]})),this.facets=[];var h=e.getRefinedDisjunctiveFacets(),f=d(e.facets),v=d(e.disjunctiveFacets),g=1,y=a.facets||{};Object.keys(y).forEach((function(t){var r,n,i=y[t],s=(r=e.hierarchicalFacets,n=t,c(r,(function(e){return(e.attributes||[]).indexOf(n)>-1})));if(s){var h=s.attributes.indexOf(t),l=u(e.hierarchicalFacets,(function(e){return e.name===s.name}));o.hierarchicalFacets[l][h]={attribute:t,data:i,exhaustive:a.exhaustiveFacetsCount}}else{var m,d=-1!==e.disjunctiveFacets.indexOf(t),g=-1!==e.facets.indexOf(t);d&&(m=v[t],o.disjunctiveFacets[m]={name:t,data:i,exhaustive:a.exhaustiveFacetsCount},p(o.disjunctiveFacets[m],a.facets_stats,t)),g&&(m=f[t],o.facets[m]={name:t,data:i,exhaustive:a.exhaustiveFacetsCount},p(o.facets[m],a.facets_stats,t))}})),this.hierarchicalFacets=s(this.hierarchicalFacets),h.forEach((function(r){var s=t[g],c=s&&s.facets?s.facets:{},h=e.getHierarchicalFacetByName(r);Object.keys(c).forEach((function(t){var r,f=c[t];if(h){r=u(e.hierarchicalFacets,(function(e){return e.name===h.name}));var m=u(o.hierarchicalFacets[r],(function(e){return e.attribute===t}));if(-1===m)return;o.hierarchicalFacets[r][m].data=n({},o.hierarchicalFacets[r][m].data,f)}else{r=v[t];var d=a.facets&&a.facets[t]||{};o.disjunctiveFacets[r]={name:t,data:i({},f,d),exhaustive:s.exhaustiveFacetsCount},p(o.disjunctiveFacets[r],s.facets_stats,t),e.disjunctiveFacetsRefinements[t]&&e.disjunctiveFacetsRefinements[t].forEach((function(n){!o.disjunctiveFacets[r].data[n]&&e.disjunctiveFacetsRefinements[t].indexOf(l(n))>-1&&(o.disjunctiveFacets[r].data[n]=0)}))}})),g++})),e.getRefinedHierarchicalFacets().forEach((function(r){var n=e.getHierarchicalFacetByName(r),a=e._getHierarchicalFacetSeparator(n),s=e.getHierarchicalRefinement(r);0===s.length||s[0].split(a).length<2||t.slice(g).forEach((function(t){var r=t&&t.facets?t.facets:{};Object.keys(r).forEach((function(t){var c=r[t],h=u(e.hierarchicalFacets,(function(e){return e.name===n.name})),f=u(o.hierarchicalFacets[h],(function(e){return e.attribute===t}));if(-1!==f){var l={};if(s.length>0){var m=s[0].split(a)[0];l[m]=o.hierarchicalFacets[h][f].data[m]}o.hierarchicalFacets[h][f].data=i(l,c,o.hierarchicalFacets[h][f].data)}})),g++}))})),Object.keys(e.facetsExcludes).forEach((function(t){var r=e.facetsExcludes[t],n=f[t];o.facets[n]={name:t,data:y[t],exhaustive:a.exhaustiveFacetsCount},r.forEach((function(e){o.facets[n]=o.facets[n]||{name:t},o.facets[n].data=o.facets[n].data||{},o.facets[n].data[e]=0}))})),this.hierarchicalFacets=this.hierarchicalFacets.map(m(e)),this.facets=s(this.facets),this.disjunctiveFacets=s(this.disjunctiveFacets),this._state=e}function g(e,t){function r(e){return e.name===t}if(e._state.isConjunctiveFacet(t)){var n=c(e.facets,r);return n?Object.keys(n.data).map((function(r){var i=f(r);return{name:r,escapedValue:i,count:n.data[r],isRefined:e._state.isFacetRefined(t,i),isExcluded:e._state.isExcludeRefined(t,r)}})):[]}if(e._state.isDisjunctiveFacet(t)){var i=c(e.disjunctiveFacets,r);return i?Object.keys(i.data).map((function(r){var n=f(r);return{name:r,escapedValue:n,count:i.data[r],isRefined:e._state.isDisjunctiveFacetRefined(t,n)}})):[]}if(e._state.isHierarchicalFacet(t)){var a=c(e.hierarchicalFacets,r);if(!a)return a;var 
s=e._state.getHierarchicalFacetByName(t),u=e._state._getHierarchicalFacetSeparator(s),o=l(e._state.getHierarchicalRefinement(t)[0]||"");0===o.indexOf(s.rootPath)&&(o=o.replace(s.rootPath+u,""));var h=o.split(u);return h.unshift(t),y(a,h,0),a}}function y(e,t,r){e.isRefined=e.name===t[r],e.data&&e.data.forEach((function(e){y(e,t,r+1)}))}function R(e,t,r,n){if(n=n||0,Array.isArray(t))return e(t,r[n]);if(!t.data||0===t.data.length)return t;var a=t.data.map((function(t){return R(e,t,r,n+1)})),s=e(a,r[n]);return i({data:s},t)}function F(e,t){var r=c(e,(function(e){return e.name===t}));return r&&r.stats}function b(e,t,r,n,i){var a=c(i,(function(e){return e.name===r})),s=a&&a.data&&a.data[n]?a.data[n]:0,u=a&&a.exhaustive||!1;return{type:t,attributeName:r,name:n,count:s,exhaustive:u}}v.prototype.getFacetByName=function(e){function t(t){return t.name===e}return c(this.facets,t)||c(this.disjunctiveFacets,t)||c(this.hierarchicalFacets,t)},v.DEFAULT_SORT=["isRefined:desc","count:desc","name:asc"],v.prototype.getFacetValues=function(e,t){var r=g(this,e);if(r){var n,s=i({},t,{sortBy:v.DEFAULT_SORT,facetOrdering:!(t&&t.sortBy)}),c=this;if(Array.isArray(r))n=[e];else n=c._state.getHierarchicalFacetByName(r.name).attributes;return R((function(e,t){if(s.facetOrdering){var r=function(e,t){return e.renderingContent&&e.renderingContent.facetOrdering&&e.renderingContent.facetOrdering.values&&e.renderingContent.facetOrdering.values[t]}(c,t);if(r)return function(e,t){var r=[],n=[],i=(t.order||[]).reduce((function(e,t,r){return e[t]=r,e}),{});e.forEach((function(e){var t=e.path||e.name;void 0!==i[t]?r[i[t]]=e:n.push(e)})),r=r.filter((function(e){return e}));var s,c=t.sortRemainingBy;return"hidden"===c?r:(s="alpha"===c?[["path","name"],["asc","asc"]]:[["count"],["desc"]],r.concat(a(n,s[0],s[1])))}(e,r)}if(Array.isArray(s.sortBy)){var n=o(s.sortBy,v.DEFAULT_SORT);return a(e,n[0],n[1])}if("function"==typeof s.sortBy)return function(e,t){return t.sort(e)}(s.sortBy,e);throw new Error("options.sortBy is optional but if defined it must be either an array of string (predicates) or a sorting function")}),r,n)}},v.prototype.getFacetStats=function(e){return this._state.isConjunctiveFacet(e)?F(this.facets,e):this._state.isDisjunctiveFacet(e)?F(this.disjunctiveFacets,e):void 0},v.prototype.getRefinements=function(){var e=this._state,t=this,r=[];return Object.keys(e.facetsRefinements).forEach((function(n){e.facetsRefinements[n].forEach((function(i){r.push(b(e,"facet",n,i,t.facets))}))})),Object.keys(e.facetsExcludes).forEach((function(n){e.facetsExcludes[n].forEach((function(i){r.push(b(e,"exclude",n,i,t.facets))}))})),Object.keys(e.disjunctiveFacetsRefinements).forEach((function(n){e.disjunctiveFacetsRefinements[n].forEach((function(i){r.push(b(e,"disjunctive",n,i,t.disjunctiveFacets))}))})),Object.keys(e.hierarchicalFacetsRefinements).forEach((function(n){e.hierarchicalFacetsRefinements[n].forEach((function(i){r.push(function(e,t,r,n){var i=e.getHierarchicalFacetByName(t),a=e._getHierarchicalFacetSeparator(i),s=r.split(a),u=c(n,(function(e){return e.name===t})),o=s.reduce((function(e,t){var r=e&&c(e.data,(function(e){return e.name===t}));return void 0!==r?r:e}),u),h=o&&o.count||0,f=o&&o.exhaustive||!1,l=o&&o.path||"";return{type:"hierarchical",attributeName:t,name:l,count:h,exhaustive:f}}(e,n,i,t.hierarchicalFacets))}))})),Object.keys(e.numericRefinements).forEach((function(t){var 
n=e.numericRefinements[t];Object.keys(n).forEach((function(e){n[e].forEach((function(n){r.push({type:"numeric",attributeName:t,name:n,numericValue:n,operator:e})}))}))})),e.tagRefinements.forEach((function(e){r.push({type:"tag",attributeName:"_tags",name:e})})),r},e.exports=v},49374:(e,t,r)=>{"use strict";var n=r(17775),i=r(23076),a=r(68078),s=r(96394),c=r(17331),u=r(14853),o=r(90116),h=r(49803),f=r(60185),l=r(24336),m=r(94039).escapeFacetValue;function d(e,t,r){"function"==typeof e.addAlgoliaAgent&&e.addAlgoliaAgent("JS Helper ("+l+")"),this.setClient(e);var i=r||{};i.index=t,this.state=n.make(i),this.lastResults=null,this._queryId=0,this._lastQueryIdReceived=-1,this.derivedHelpers=[],this._currentNbQueries=0}function p(e){if(e<0)throw new Error("Page requested below 0.");return this._change({state:this.state.setPage(e),isPageReset:!1}),this}function v(){return this.state.page}u(d,c),d.prototype.search=function(){return this._search({onlyWithDerivedHelpers:!1}),this},d.prototype.searchOnlyWithDerivedHelpers=function(){return this._search({onlyWithDerivedHelpers:!0}),this},d.prototype.getQuery=function(){var e=this.state;return s._getHitsSearchParams(e)},d.prototype.searchOnce=function(e,t){var r=e?this.state.setQueryParameters(e):this.state,n=s._getQueries(r.index,r),a=this;if(this._currentNbQueries++,this.emit("searchOnce",{state:r}),!t)return this.client.search(n).then((function(e){return a._currentNbQueries--,0===a._currentNbQueries&&a.emit("searchQueueEmpty"),{content:new i(r,e.results),state:r,_originalResponse:e}}),(function(e){throw a._currentNbQueries--,0===a._currentNbQueries&&a.emit("searchQueueEmpty"),e}));this.client.search(n).then((function(e){a._currentNbQueries--,0===a._currentNbQueries&&a.emit("searchQueueEmpty"),t(null,new i(r,e.results),r)})).catch((function(e){a._currentNbQueries--,0===a._currentNbQueries&&a.emit("searchQueueEmpty"),t(e,null,r)}))},d.prototype.findAnswers=function(e){console.warn("[algoliasearch-helper] answers is no longer supported");var t=this.state,r=this.derivedHelpers[0];if(!r)return Promise.resolve([]);var n=r.getModifiedState(t),i=f({attributesForPrediction:e.attributesForPrediction,nbHits:e.nbHits},{params:h(s._getHitsSearchParams(n),["attributesToSnippet","hitsPerPage","restrictSearchableAttributes","snippetEllipsisText"])}),a="search for answers was called, but this client does not have a function client.initIndex(index).findAnswers";if("function"!=typeof this.client.initIndex)throw new Error(a);var c=this.client.initIndex(n.index);if("function"!=typeof c.findAnswers)throw new Error(a);return c.findAnswers(n.query,e.queryLanguages,i)},d.prototype.searchForFacetValues=function(e,t,r,n){var i="function"==typeof this.client.searchForFacetValues,a="function"==typeof this.client.initIndex;if(!i&&!a&&"function"!=typeof this.client.search)throw new Error("search for facet values (searchable) was called, but this client does not have a function client.searchForFacetValues or client.initIndex(index).searchForFacetValues");var c=this.state.setQueryParameters(n||{}),u=c.isDisjunctiveFacet(e),o=s.getSearchForFacetQuery(e,t,r,c);this._currentNbQueries++;var h,f=this;return i?h=this.client.searchForFacetValues([{indexName:c.index,params:o}]):a?h=this.client.initIndex(c.index).searchForFacetValues(o):(delete o.facetName,h=this.client.search([{type:"facet",facet:e,indexName:c.index,params:o}]).then((function(e){return e.results[0]}))),this.emit("searchForFacetValues",{state:c,facet:e,query:t}),h.then((function(t){return 
f._currentNbQueries--,0===f._currentNbQueries&&f.emit("searchQueueEmpty"),(t=Array.isArray(t)?t[0]:t).facetHits.forEach((function(t){t.escapedValue=m(t.value),t.isRefined=u?c.isDisjunctiveFacetRefined(e,t.escapedValue):c.isFacetRefined(e,t.escapedValue)})),t}),(function(e){throw f._currentNbQueries--,0===f._currentNbQueries&&f.emit("searchQueueEmpty"),e}))},d.prototype.setQuery=function(e){return this._change({state:this.state.resetPage().setQuery(e),isPageReset:!0}),this},d.prototype.clearRefinements=function(e){return this._change({state:this.state.resetPage().clearRefinements(e),isPageReset:!0}),this},d.prototype.clearTags=function(){return this._change({state:this.state.resetPage().clearTags(),isPageReset:!0}),this},d.prototype.addDisjunctiveFacetRefinement=function(e,t){return this._change({state:this.state.resetPage().addDisjunctiveFacetRefinement(e,t),isPageReset:!0}),this},d.prototype.addDisjunctiveRefine=function(){return this.addDisjunctiveFacetRefinement.apply(this,arguments)},d.prototype.addHierarchicalFacetRefinement=function(e,t){return this._change({state:this.state.resetPage().addHierarchicalFacetRefinement(e,t),isPageReset:!0}),this},d.prototype.addNumericRefinement=function(e,t,r){return this._change({state:this.state.resetPage().addNumericRefinement(e,t,r),isPageReset:!0}),this},d.prototype.addFacetRefinement=function(e,t){return this._change({state:this.state.resetPage().addFacetRefinement(e,t),isPageReset:!0}),this},d.prototype.addRefine=function(){return this.addFacetRefinement.apply(this,arguments)},d.prototype.addFacetExclusion=function(e,t){return this._change({state:this.state.resetPage().addExcludeRefinement(e,t),isPageReset:!0}),this},d.prototype.addExclude=function(){return this.addFacetExclusion.apply(this,arguments)},d.prototype.addTag=function(e){return this._change({state:this.state.resetPage().addTagRefinement(e),isPageReset:!0}),this},d.prototype.removeNumericRefinement=function(e,t,r){return this._change({state:this.state.resetPage().removeNumericRefinement(e,t,r),isPageReset:!0}),this},d.prototype.removeDisjunctiveFacetRefinement=function(e,t){return this._change({state:this.state.resetPage().removeDisjunctiveFacetRefinement(e,t),isPageReset:!0}),this},d.prototype.removeDisjunctiveRefine=function(){return this.removeDisjunctiveFacetRefinement.apply(this,arguments)},d.prototype.removeHierarchicalFacetRefinement=function(e){return this._change({state:this.state.resetPage().removeHierarchicalFacetRefinement(e),isPageReset:!0}),this},d.prototype.removeFacetRefinement=function(e,t){return this._change({state:this.state.resetPage().removeFacetRefinement(e,t),isPageReset:!0}),this},d.prototype.removeRefine=function(){return this.removeFacetRefinement.apply(this,arguments)},d.prototype.removeFacetExclusion=function(e,t){return this._change({state:this.state.resetPage().removeExcludeRefinement(e,t),isPageReset:!0}),this},d.prototype.removeExclude=function(){return this.removeFacetExclusion.apply(this,arguments)},d.prototype.removeTag=function(e){return this._change({state:this.state.resetPage().removeTagRefinement(e),isPageReset:!0}),this},d.prototype.toggleFacetExclusion=function(e,t){return this._change({state:this.state.resetPage().toggleExcludeFacetRefinement(e,t),isPageReset:!0}),this},d.prototype.toggleExclude=function(){return this.toggleFacetExclusion.apply(this,arguments)},d.prototype.toggleRefinement=function(e,t){return this.toggleFacetRefinement(e,t)},d.prototype.toggleFacetRefinement=function(e,t){return 
this._change({state:this.state.resetPage().toggleFacetRefinement(e,t),isPageReset:!0}),this},d.prototype.toggleRefine=function(){return this.toggleFacetRefinement.apply(this,arguments)},d.prototype.toggleTag=function(e){return this._change({state:this.state.resetPage().toggleTagRefinement(e),isPageReset:!0}),this},d.prototype.nextPage=function(){var e=this.state.page||0;return this.setPage(e+1)},d.prototype.previousPage=function(){var e=this.state.page||0;return this.setPage(e-1)},d.prototype.setCurrentPage=p,d.prototype.setPage=p,d.prototype.setIndex=function(e){return this._change({state:this.state.resetPage().setIndex(e),isPageReset:!0}),this},d.prototype.setQueryParameter=function(e,t){return this._change({state:this.state.resetPage().setQueryParameter(e,t),isPageReset:!0}),this},d.prototype.setState=function(e){return this._change({state:n.make(e),isPageReset:!1}),this},d.prototype.overrideStateWithoutTriggeringChangeEvent=function(e){return this.state=new n(e),this},d.prototype.hasRefinements=function(e){return!!o(this.state.getNumericRefinements(e))||(this.state.isConjunctiveFacet(e)?this.state.isFacetRefined(e):this.state.isDisjunctiveFacet(e)?this.state.isDisjunctiveFacetRefined(e):!!this.state.isHierarchicalFacet(e)&&this.state.isHierarchicalFacetRefined(e))},d.prototype.isExcluded=function(e,t){return this.state.isExcludeRefined(e,t)},d.prototype.isDisjunctiveRefined=function(e,t){return this.state.isDisjunctiveFacetRefined(e,t)},d.prototype.hasTag=function(e){return this.state.isTagRefined(e)},d.prototype.isTagRefined=function(){return this.hasTagRefinements.apply(this,arguments)},d.prototype.getIndex=function(){return this.state.index},d.prototype.getCurrentPage=v,d.prototype.getPage=v,d.prototype.getTags=function(){return this.state.tagRefinements},d.prototype.getRefinements=function(e){var t=[];if(this.state.isConjunctiveFacet(e))this.state.getConjunctiveRefinements(e).forEach((function(e){t.push({value:e,type:"conjunctive"})})),this.state.getExcludeRefinements(e).forEach((function(e){t.push({value:e,type:"exclude"})}));else if(this.state.isDisjunctiveFacet(e)){this.state.getDisjunctiveRefinements(e).forEach((function(e){t.push({value:e,type:"disjunctive"})}))}var r=this.state.getNumericRefinements(e);return Object.keys(r).forEach((function(e){var n=r[e];t.push({value:n,operator:e,type:"numeric"})})),t},d.prototype.getNumericRefinement=function(e,t){return this.state.getNumericRefinement(e,t)},d.prototype.getHierarchicalFacetBreadcrumb=function(e){return this.state.getHierarchicalFacetBreadcrumb(e)},d.prototype._search=function(e){var t=this.state,r=[],n=[];e.onlyWithDerivedHelpers||(n=s._getQueries(t.index,t),r.push({state:t,queriesCount:n.length,helper:this}),this.emit("search",{state:t,results:this.lastResults}));var i=this.derivedHelpers.map((function(e){var n=e.getModifiedState(t),i=n.index?s._getQueries(n.index,n):[];return r.push({state:n,queriesCount:i.length,helper:e}),e.emit("search",{state:n,results:e.lastResults}),i})),a=Array.prototype.concat.apply(n,i),c=this._queryId++;if(this._currentNbQueries++,!a.length)return Promise.resolve({results:[]}).then(this._dispatchAlgoliaResponse.bind(this,r,c));try{this.client.search(a).then(this._dispatchAlgoliaResponse.bind(this,r,c)).catch(this._dispatchAlgoliaError.bind(this,c))}catch(u){this.emit("error",{error:u})}},d.prototype._dispatchAlgoliaResponse=function(e,t,r){if(!(t0},d.prototype._change=function(e){var 
t=e.state,r=e.isPageReset;t!==this.state&&(this.state=t,this.emit("change",{state:this.state,results:this.lastResults,isPageReset:r}))},d.prototype.clearCache=function(){return this.client.clearCache&&this.client.clearCache(),this},d.prototype.setClient=function(e){return this.client===e||("function"==typeof e.addAlgoliaAgent&&e.addAlgoliaAgent("JS Helper ("+l+")"),this.client=e),this},d.prototype.getClient=function(){return this.client},d.prototype.derive=function(e){var t=new a(this,e);return this.derivedHelpers.push(t),t},d.prototype.detachDerivedHelper=function(e){var t=this.derivedHelpers.indexOf(e);if(-1===t)throw new Error("Derived helper already detached");this.derivedHelpers.splice(t,1)},d.prototype.hasPendingRequests=function(){return this._currentNbQueries>0},e.exports=d},74587:e=>{"use strict";e.exports=function(e){return Array.isArray(e)?e.filter(Boolean):[]}},52344:e=>{"use strict";e.exports=function(){return Array.prototype.slice.call(arguments).reduceRight((function(e,t){return Object.keys(Object(t)).forEach((function(r){void 0!==t[r]&&(void 0!==e[r]&&delete e[r],e[r]=t[r])})),e}),{})}},94039:e=>{"use strict";e.exports={escapeFacetValue:function(e){return"string"!=typeof e?e:String(e).replace(/^-/,"\\-")},unescapeFacetValue:function(e){return"string"!=typeof e?e:e.replace(/^\\-/,"-")}}},7888:e=>{"use strict";e.exports=function(e,t){if(Array.isArray(e))for(var r=0;r{"use strict";e.exports=function(e,t){if(!Array.isArray(e))return-1;for(var r=0;r{"use strict";var n=r(7888);e.exports=function(e,t){var r=(t||[]).map((function(e){return e.split(":")}));return e.reduce((function(e,t){var i=t.split(":"),a=n(r,(function(e){return e[0]===i[0]}));return i.length>1||!a?(e[0].push(i[0]),e[1].push(i[1]),e):(e[0].push(a[0]),e[1].push(a[1]),e)}),[[],[]])}},14853:e=>{"use strict";e.exports=function(e,t){e.prototype=Object.create(t.prototype,{constructor:{value:e,enumerable:!1,writable:!0,configurable:!0}})}},22686:e=>{"use strict";e.exports=function(e,t){return e.filter((function(r,n){return t.indexOf(r)>-1&&e.indexOf(r)===n}))}},60185:e=>{"use strict";function t(e){return"function"==typeof e||Array.isArray(e)||"[object Object]"===Object.prototype.toString.call(e)}function r(e,n){if(e===n)return e;for(var i in n)if(Object.prototype.hasOwnProperty.call(n,i)&&"__proto__"!==i&&"constructor"!==i){var a=n[i],s=e[i];void 0!==s&&void 0===a||(t(s)&&t(a)?e[i]=r(s,a):e[i]="object"==typeof(c=a)&&null!==c?r(Array.isArray(c)?[]:{},c):c)}var c;return e}e.exports=function(e){t(e)||(e={});for(var n=1,i=arguments.length;n{"use strict";e.exports=function(e){return e&&Object.keys(e).length>0}},49803:e=>{"use strict";e.exports=function(e,t){if(null===e)return{};var r,n,i={},a=Object.keys(e);for(n=0;n=0||(i[r]=e[r]);return i}},42148:e=>{"use strict";function t(e,t){if(e!==t){var r=void 0!==e,n=null===e,i=void 0!==t,a=null===t;if(!a&&e>t||n&&i||!r)return 1;if(!n&&e=n.length?a:"desc"===n[i]?-a:a}return e.index-r.index})),i.map((function(e){return e.value}))}},28023:e=>{"use strict";e.exports=function e(t){if("number"==typeof t)return t;if("string"==typeof t)return parseFloat(t);if(Array.isArray(t))return t.map(e);throw new Error("The value should be a number, a parsable string or an array of those.")}},96394:(e,t,r)=>{"use strict";var n=r(60185);function i(e){return Object.keys(e).sort((function(e,t){return e.localeCompare(t)})).reduce((function(t,r){return t[r]=e[r],t}),{})}var a={_getQueries:function(e,t){var r=[];return 
r.push({indexName:e,params:a._getHitsSearchParams(t)}),t.getRefinedDisjunctiveFacets().forEach((function(n){r.push({indexName:e,params:a._getDisjunctiveFacetSearchParams(t,n)})})),t.getRefinedHierarchicalFacets().forEach((function(n){var i=t.getHierarchicalFacetByName(n),s=t.getHierarchicalRefinement(n),c=t._getHierarchicalFacetSeparator(i);if(s.length>0&&s[0].split(c).length>1){var u=s[0].split(c).slice(0,-1).reduce((function(e,t,r){return e.concat({attribute:i.attributes[r],value:0===r?t:[e[e.length-1].value,t].join(c)})}),[]);u.forEach((function(n,s){var c=a._getDisjunctiveFacetSearchParams(t,n.attribute,0===s);function o(e){return i.attributes.some((function(t){return t===e.split(":")[0]}))}var h=(c.facetFilters||[]).reduce((function(e,t){if(Array.isArray(t)){var r=t.filter((function(e){return!o(e)}));r.length>0&&e.push(r)}return"string"!=typeof t||o(t)||e.push(t),e}),[]),f=u[s-1];c.facetFilters=s>0?h.concat(f.attribute+":"+f.value):h.length>0?h:void 0,r.push({indexName:e,params:c})}))}})),r},_getHitsSearchParams:function(e){var t=e.facets.concat(e.disjunctiveFacets).concat(a._getHitsHierarchicalFacetsAttributes(e)),r=a._getFacetFilters(e),s=a._getNumericFilters(e),c=a._getTagFilters(e),u={facets:t.indexOf("*")>-1?["*"]:t,tagFilters:c};return r.length>0&&(u.facetFilters=r),s.length>0&&(u.numericFilters=s),i(n({},e.getQueryParams(),u))},_getDisjunctiveFacetSearchParams:function(e,t,r){var s=a._getFacetFilters(e,t,r),c=a._getNumericFilters(e,t),u=a._getTagFilters(e),o={hitsPerPage:0,page:0,analytics:!1,clickAnalytics:!1};u.length>0&&(o.tagFilters=u);var h=e.getHierarchicalFacetByName(t);return o.facets=h?a._getDisjunctiveHierarchicalFacetAttribute(e,h,r):t,c.length>0&&(o.numericFilters=c),s.length>0&&(o.facetFilters=s),i(n({},e.getQueryParams(),o))},_getNumericFilters:function(e,t){if(e.numericFilters)return e.numericFilters;var r=[];return Object.keys(e.numericRefinements).forEach((function(n){var i=e.numericRefinements[n]||{};Object.keys(i).forEach((function(e){var a=i[e]||[];t!==n&&a.forEach((function(t){if(Array.isArray(t)){var i=t.map((function(t){return n+e+t}));r.push(i)}else r.push(n+e+t)}))}))})),r},_getTagFilters:function(e){return e.tagFilters?e.tagFilters:e.tagRefinements.join(",")},_getFacetFilters:function(e,t,r){var n=[],i=e.facetsRefinements||{};Object.keys(i).forEach((function(e){(i[e]||[]).forEach((function(t){n.push(e+":"+t)}))}));var a=e.facetsExcludes||{};Object.keys(a).forEach((function(e){(a[e]||[]).forEach((function(t){n.push(e+":-"+t)}))}));var s=e.disjunctiveFacetsRefinements||{};Object.keys(s).forEach((function(e){var r=s[e]||[];if(e!==t&&r&&0!==r.length){var i=[];r.forEach((function(t){i.push(e+":"+t)})),n.push(i)}}));var c=e.hierarchicalFacetsRefinements||{};return Object.keys(c).forEach((function(i){var a=(c[i]||[])[0];if(void 0!==a){var s,u,o=e.getHierarchicalFacetByName(i),h=e._getHierarchicalFacetSeparator(o),f=e._getHierarchicalRootPath(o);if(t===i){if(-1===a.indexOf(h)||!f&&!0===r||f&&f.split(h).length===a.split(h).length)return;f?(u=f.split(h).length-1,a=f):(u=a.split(h).length-2,a=a.slice(0,a.lastIndexOf(h))),s=o.attributes[u]}else u=a.split(h).length-1,s=o.attributes[u];s&&n.push([s+":"+a])}})),n},_getHitsHierarchicalFacetsAttributes:function(e){return e.hierarchicalFacets.reduce((function(t,r){var n=e.getHierarchicalRefinement(r.name)[0];if(!n)return t.push(r.attributes[0]),t;var i=e._getHierarchicalFacetSeparator(r),a=n.split(i).length,s=r.attributes.slice(0,a+1);return 
t.concat(s)}),[])},_getDisjunctiveHierarchicalFacetAttribute:function(e,t,r){var n=e._getHierarchicalFacetSeparator(t);if(!0===r){var i=e._getHierarchicalRootPath(t),a=0;return i&&(a=i.split(n).length),[t.attributes[a]]}var s=(e.getHierarchicalRefinement(t.name)[0]||"").split(n).length-1;return t.attributes.slice(0,s+1)},getSearchForFacetQuery:function(e,t,r,s){var c=s.isDisjunctiveFacet(e)?s.clearRefinements(e):s,u={facetQuery:t,facetName:e};return"number"==typeof r&&(u.maxFacetHits=r),i(n({},a._getHitsSearchParams(c),u))}};e.exports=a},46801:e=>{"use strict";e.exports=function(e){return null!==e&&/^[a-zA-Z0-9_-]{1,64}$/.test(e)}},24336:e=>{"use strict";e.exports="3.13.3"},70290:function(e){e.exports=function(){"use strict";function e(e,t,r){return t in e?Object.defineProperty(e,t,{value:r,enumerable:!0,configurable:!0,writable:!0}):e[t]=r,e}function t(e,t){var r=Object.keys(e);if(Object.getOwnPropertySymbols){var n=Object.getOwnPropertySymbols(e);t&&(n=n.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),r.push.apply(r,n)}return r}function r(r){for(var n=1;n=0||(i[r]=e[r]);return i}(e,t);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,r)&&(i[r]=e[r])}return i}function i(e,t){return function(e){if(Array.isArray(e))return e}(e)||function(e,t){if(Symbol.iterator in Object(e)||"[object Arguments]"===Object.prototype.toString.call(e)){var r=[],n=!0,i=!1,a=void 0;try{for(var s,c=e[Symbol.iterator]();!(n=(s=c.next()).done)&&(r.push(s.value),!t||r.length!==t);n=!0);}catch(e){i=!0,a=e}finally{try{n||null==c.return||c.return()}finally{if(i)throw a}}return r}}(e,t)||function(){throw new TypeError("Invalid attempt to destructure non-iterable instance")}()}function a(e){return function(e){if(Array.isArray(e)){for(var t=0,r=new Array(e.length);t2&&void 0!==arguments[2]?arguments[2]:{miss:function(){return Promise.resolve()}};return Promise.resolve().then((function(){var r=JSON.stringify(e),n=a()[r];return Promise.all([n||t(),void 0!==n])})).then((function(e){var t=i(e,2),n=t[0],a=t[1];return Promise.all([n,a||r.miss(n)])})).then((function(e){return i(e,1)[0]}))},set:function(e,t){return Promise.resolve().then((function(){var i=a();return i[JSON.stringify(e)]=t,n().setItem(r,JSON.stringify(i)),t}))},delete:function(e){return Promise.resolve().then((function(){var t=a();delete t[JSON.stringify(e)],n().setItem(r,JSON.stringify(t))}))},clear:function(){return Promise.resolve().then((function(){n().removeItem(r)}))}}}function c(e){var t=a(e.caches),r=t.shift();return void 0===r?{get:function(e,t){var r=arguments.length>2&&void 0!==arguments[2]?arguments[2]:{miss:function(){return Promise.resolve()}};return t().then((function(e){return Promise.all([e,r.miss(e)])})).then((function(e){return i(e,1)[0]}))},set:function(e,t){return Promise.resolve(t)},delete:function(e){return Promise.resolve()},clear:function(){return Promise.resolve()}}:{get:function(e,n){var i=arguments.length>2&&void 0!==arguments[2]?arguments[2]:{miss:function(){return Promise.resolve()}};return r.get(e,n,i).catch((function(){return c({caches:t}).get(e,n,i)}))},set:function(e,n){return r.set(e,n).catch((function(){return c({caches:t}).set(e,n)}))},delete:function(e){return r.delete(e).catch((function(){return c({caches:t}).delete(e)}))},clear:function(){return r.clear().catch((function(){return c({caches:t}).clear()}))}}}function u(){var e=arguments.length>0&&void 
0!==arguments[0]?arguments[0]:{serializable:!0},t={};return{get:function(r,n){var i=arguments.length>2&&void 0!==arguments[2]?arguments[2]:{miss:function(){return Promise.resolve()}},a=JSON.stringify(r);if(a in t)return Promise.resolve(e.serializable?JSON.parse(t[a]):t[a]);var s=n(),c=i&&i.miss||function(){return Promise.resolve()};return s.then((function(e){return c(e)})).then((function(){return s}))},set:function(r,n){return t[JSON.stringify(r)]=e.serializable?JSON.stringify(n):n,Promise.resolve(n)},delete:function(e){return delete t[JSON.stringify(e)],Promise.resolve()},clear:function(){return t={},Promise.resolve()}}}function o(e){for(var t=e.length-1;t>0;t--){var r=Math.floor(Math.random()*(t+1)),n=e[t];e[t]=e[r],e[r]=n}return e}function h(e,t){return t?(Object.keys(t).forEach((function(r){e[r]=t[r](e)})),e):e}function f(e){for(var t=arguments.length,r=new Array(t>1?t-1:0),n=1;n0?n:void 0,timeout:r.timeout||t,headers:r.headers||{},queryParameters:r.queryParameters||{},cacheable:r.cacheable}}var d={Read:1,Write:2,Any:3},p=1,v=2,g=3;function y(e){var t=arguments.length>1&&void 0!==arguments[1]?arguments[1]:p;return r(r({},e),{},{status:t,lastUpdate:Date.now()})}function R(e){return"string"==typeof e?{protocol:"https",url:e,accept:d.Any}:{protocol:e.protocol||"https",url:e.url,accept:e.accept||d.Any}}var F="GET",b="POST";function P(e,t){return Promise.all(t.map((function(t){return e.get(t,(function(){return Promise.resolve(y(t))}))}))).then((function(e){var r=e.filter((function(e){return function(e){return e.status===p||Date.now()-e.lastUpdate>12e4}(e)})),n=e.filter((function(e){return function(e){return e.status===g&&Date.now()-e.lastUpdate<=12e4}(e)})),i=[].concat(a(r),a(n));return{getTimeout:function(e,t){return(0===n.length&&0===e?1:n.length+3+e)*t},statelessHosts:i.length>0?i.map((function(e){return R(e)})):t}}))}function j(e,t,n,i){var s=[],c=function(e,t){if(e.method!==F&&(void 0!==e.data||void 0!==t.data)){var n=Array.isArray(e.data)?e.data:r(r({},e.data),t.data);return JSON.stringify(n)}}(n,i),u=function(e,t){var n=r(r({},e.headers),t.headers),i={};return Object.keys(n).forEach((function(e){var t=n[e];i[e.toLowerCase()]=t})),i}(e,i),o=n.method,h=n.method!==F?{}:r(r({},n.data),i.data),f=r(r(r({"x-algolia-agent":e.userAgent.value},e.queryParameters),h),i.queryParameters),l=0,m=function t(r,a){var h=r.pop();if(void 0===h)throw{name:"RetryError",message:"Unreachable hosts - your application id may be incorrect. 
If the error persists, contact support@algolia.com.",transporterStackTrace:O(s)};var m={data:c,headers:u,method:o,url:E(h,n.path,f),connectTimeout:a(l,e.timeouts.connect),responseTimeout:a(l,i.timeout)},d=function(e){var t={request:m,response:e,host:h,triesLeft:r.length};return s.push(t),t},p={onSuccess:function(e){return function(e){try{return JSON.parse(e.content)}catch(t){throw function(e,t){return{name:"DeserializationError",message:e,response:t}}(t.message,e)}}(e)},onRetry:function(n){var i=d(n);return n.isTimedOut&&l++,Promise.all([e.logger.info("Retryable failure",w(i)),e.hostsCache.set(h,y(h,n.isTimedOut?g:v))]).then((function(){return t(r,a)}))},onFail:function(e){throw d(e),function(e,t){var r=e.content,n=e.status,i=r;try{i=JSON.parse(r).message}catch(e){}return function(e,t,r){return{name:"ApiError",message:e,status:t,transporterStackTrace:r}}(i,n,t)}(e,O(s))}};return e.requester.send(m).then((function(e){return function(e,t){return function(e){var t=e.status;return e.isTimedOut||function(e){var t=e.isTimedOut,r=e.status;return!t&&0==~~r}(e)||2!=~~(t/100)&&4!=~~(t/100)}(e)?t.onRetry(e):2==~~(e.status/100)?t.onSuccess(e):t.onFail(e)}(e,p)}))};return P(e.hostsCache,t).then((function(e){return m(a(e.statelessHosts).reverse(),e.getTimeout)}))}function _(e){var t={value:"Algolia for JavaScript (".concat(e,")"),add:function(e){var r="; ".concat(e.segment).concat(void 0!==e.version?" (".concat(e.version,")"):"");return-1===t.value.indexOf(r)&&(t.value="".concat(t.value).concat(r)),t}};return t}function E(e,t,r){var n=x(r),i="".concat(e.protocol,"://").concat(e.url,"/").concat("/"===t.charAt(0)?t.substr(1):t);return n.length&&(i+="?".concat(n)),i}function x(e){return Object.keys(e).map((function(t){return f("%s=%s",t,(r=e[t],"[object Object]"===Object.prototype.toString.call(r)||"[object Array]"===Object.prototype.toString.call(r)?JSON.stringify(e[t]):e[t]));var r})).join("&")}function O(e){return e.map((function(e){return w(e)}))}function w(e){var t=e.request.headers["x-algolia-api-key"]?{"x-algolia-api-key":"*****"}:{};return r(r({},e),{},{request:r(r({},e.request),{},{headers:r(r({},e.request.headers),t)})})}var N=function(e){var t=e.appId,n=function(e,t,r){var n={"x-algolia-api-key":r,"x-algolia-application-id":t};return{headers:function(){return e===l.WithinHeaders?n:{}},queryParameters:function(){return e===l.WithinQueryParameters?n:{}}}}(void 0!==e.authMode?e.authMode:l.WithinHeaders,t,e.apiKey),a=function(e){var t=e.hostsCache,r=e.logger,n=e.requester,a=e.requestsCache,s=e.responsesCache,c=e.timeouts,u=e.userAgent,o=e.hosts,h=e.queryParameters,f={hostsCache:t,logger:r,requester:n,requestsCache:a,responsesCache:s,timeouts:c,userAgent:u,headers:e.headers,queryParameters:h,hosts:o.map((function(e){return R(e)})),read:function(e,t){var r=m(t,f.timeouts.read),n=function(){return j(f,f.hosts.filter((function(e){return 0!=(e.accept&d.Read)})),e,r)};if(!0!==(void 0!==r.cacheable?r.cacheable:e.cacheable))return n();var a={request:e,mappedRequestOptions:r,transporter:{queryParameters:f.queryParameters,headers:f.headers}};return f.responsesCache.get(a,(function(){return f.requestsCache.get(a,(function(){return f.requestsCache.set(a,n()).then((function(e){return Promise.all([f.requestsCache.delete(a),e])}),(function(e){return Promise.all([f.requestsCache.delete(a),Promise.reject(e)])})).then((function(e){var t=i(e,2);return t[0],t[1]}))}))}),{miss:function(e){return f.responsesCache.set(a,e)}})},write:function(e,t){return j(f,f.hosts.filter((function(e){return 
0!=(e.accept&d.Write)})),e,m(t,f.timeouts.write))}};return f}(r(r({hosts:[{url:"".concat(t,"-dsn.algolia.net"),accept:d.Read},{url:"".concat(t,".algolia.net"),accept:d.Write}].concat(o([{url:"".concat(t,"-1.algolianet.com")},{url:"".concat(t,"-2.algolianet.com")},{url:"".concat(t,"-3.algolianet.com")}]))},e),{},{headers:r(r(r({},n.headers()),{"content-type":"application/x-www-form-urlencoded"}),e.headers),queryParameters:r(r({},n.queryParameters()),e.queryParameters)}));return h({transporter:a,appId:t,addAlgoliaAgent:function(e,t){a.userAgent.add({segment:e,version:t})},clearCache:function(){return Promise.all([a.requestsCache.clear(),a.responsesCache.clear()]).then((function(){}))}},e.methods)},A=function(e){return function(t,r){return t.method===F?e.transporter.read(t,r):e.transporter.write(t,r)}},H=function(e){return function(t){var r=arguments.length>1&&void 0!==arguments[1]?arguments[1]:{};return h({transporter:e.transporter,appId:e.appId,indexName:t},r.methods)}},S=function(e){return function(t,n){var i=t.map((function(e){return r(r({},e),{},{params:x(e.params||{})})}));return e.transporter.read({method:b,path:"1/indexes/*/queries",data:{requests:i},cacheable:!0},n)}},T=function(e){return function(t,i){return Promise.all(t.map((function(t){var a=t.params,s=a.facetName,c=a.facetQuery,u=n(a,["facetName","facetQuery"]);return H(e)(t.indexName,{methods:{searchForFacetValues:k}}).searchForFacetValues(s,c,r(r({},i),u))})))}},Q=function(e){return function(t,r,n){return e.transporter.read({method:b,path:f("1/answers/%s/prediction",e.indexName),data:{query:t,queryLanguages:r},cacheable:!0},n)}},C=function(e){return function(t,r){return e.transporter.read({method:b,path:f("1/indexes/%s/query",e.indexName),data:{query:t},cacheable:!0},r)}},k=function(e){return function(t,r,n){return e.transporter.read({method:b,path:f("1/indexes/%s/facets/%s/query",e.indexName,t),data:{facetQuery:r},cacheable:!0},n)}},I=1,D=2,q=3;function V(e,t,n){var i,a={appId:e,apiKey:t,timeouts:{connect:1,read:2,write:30},requester:{send:function(e){return new Promise((function(t){var r=new XMLHttpRequest;r.open(e.method,e.url,!0),Object.keys(e.headers).forEach((function(t){return r.setRequestHeader(t,e.headers[t])}));var n,i=function(e,n){return setTimeout((function(){r.abort(),t({status:0,content:n,isTimedOut:!0})}),1e3*e)},a=i(e.connectTimeout,"Connection timeout");r.onreadystatechange=function(){r.readyState>r.OPENED&&void 0===n&&(clearTimeout(a),n=i(e.responseTimeout,"Socket timeout"))},r.onerror=function(){0===r.status&&(clearTimeout(a),clearTimeout(n),t({content:r.responseText||"Network request failed",status:r.status,isTimedOut:!1}))},r.onload=function(){clearTimeout(a),clearTimeout(n),t({content:r.responseText,status:r.status,isTimedOut:!1})},r.send(e.data)}))}},logger:(i=q,{debug:function(e,t){return I>=i&&console.debug(e,t),Promise.resolve()},info:function(e,t){return D>=i&&console.info(e,t),Promise.resolve()},error:function(e,t){return console.error(e,t),Promise.resolve()}}),responsesCache:u(),requestsCache:u({serializable:!1}),hostsCache:c({caches:[s({key:"".concat("4.17.2","-").concat(e)}),u()]}),userAgent:_("4.17.2").add({segment:"Browser",version:"lite"}),authMode:l.WithinQueryParameters};return N(r(r(r({},a),n),{},{methods:{search:S,searchForFacetValues:T,multipleQueries:S,multipleSearchForFacetValues:T,customRequest:A,initIndex:function(e){return function(t){return H(e)(t,{methods:{search:C,searchForFacetValues:k,findAnswers:Q}})}}}}))}return V.version="4.17.2",V}()},56675:(e,t,r)=>{"use 
strict";r.r(t),r.d(t,{default:()=>A});var n=r(67294),i=r(86010),a=r(8131),s=r.n(a),c=r(70290),u=r.n(c),o=r(10412),h=r(35742),f=r(39960),l=r(80143),m=r(52263),d=["zero","one","two","few","many","other"];function p(e){return d.filter((function(t){return e.includes(t)}))}var v={locale:"en",pluralForms:p(["one","other"]),select:function(e){return 1===e?"one":"other"}};function g(){var e=(0,m.Z)().i18n.currentLocale;return(0,n.useMemo)((function(){try{return t=e,r=new Intl.PluralRules(t),{locale:t,pluralForms:p(r.resolvedOptions().pluralCategories),select:function(e){return r.select(e)}}}catch(n){return console.error('Failed to use Intl.PluralRules for locale "'+e+'".\nDocusaurus will fallback to the default (English) implementation.\nError: '+n.message+"\n"),v}var t,r}),[e])}function y(){var e=g();return{selectMessage:function(t,r){return function(e,t,r){var n=e.split("|");if(1===n.length)return n[0];n.length>r.pluralForms.length&&console.error("For locale="+r.locale+", a maximum of "+r.pluralForms.length+" plural forms are expected ("+r.pluralForms.join(",")+"), but the message contains "+n.length+": "+e);var i=r.select(t),a=r.pluralForms.indexOf(i);return n[Math.min(a,n.length-1)]}(r,t,e)}}}var R=r(66177),F=r(69688),b=r(10833),P=r(82128),j=r(95999),_=r(6278),E=r(239),x=r(7452);const O={searchQueryInput:"searchQueryInput_u2C7",searchVersionInput:"searchVersionInput_m0Ui",searchResultsColumn:"searchResultsColumn_JPFH",algoliaLogo:"algoliaLogo_rT1R",algoliaLogoPathFill:"algoliaLogoPathFill_WdUC",searchResultItem:"searchResultItem_Tv2o",searchResultItemHeading:"searchResultItemHeading_KbCB",searchResultItemPath:"searchResultItemPath_lhe1",searchResultItemSummary:"searchResultItemSummary_AEaO",searchQueryColumn:"searchQueryColumn_RTkw",searchVersionColumn:"searchVersionColumn_ypXd",searchLogoColumn:"searchLogoColumn_rJIA",loadingSpinner:"loadingSpinner_XVxU","loading-spin":"loading-spin_vzvp",loader:"loader_vvXV"};function w(e){var t=e.docsSearchVersionsHelpers,r=Object.entries(t.allDocsData).filter((function(e){return e[1].versions.length>1}));return n.createElement("div",{className:(0,i.Z)("col","col--3","padding-left--none",O.searchVersionColumn)},r.map((function(e){var i=e[0],a=e[1],s=r.length>1?i+": ":"";return n.createElement("select",{key:i,onChange:function(e){return t.setSearchVersion(i,e.target.value)},defaultValue:t.searchVersions[i],className:O.searchVersionInput},a.versions.map((function(e,t){return n.createElement("option",{key:t,label:""+s+e.label,value:e.name})})))})))}function N(){var e,t,r,a,c,d,p=(0,m.Z)().i18n.currentLocale,v=(0,_.L)().algolia,g=v.appId,b=v.apiKey,N=v.indexName,A=(0,E.l)(),H=(e=y().selectMessage,function(t){return e(t,(0,j.I)({id:"theme.SearchPage.documentsFound.plurals",description:'Pluralized label for "{count} documents found". 
Use as much plural forms (separated by "|") as your language support (see https://www.unicode.org/cldr/cldr-aux/charts/34/supplemental/language_plural_rules.html)',message:"One document found|{count} documents found"},{count:t}))}),S=(t=(0,l._r)(),r=(0,n.useState)((function(){return Object.entries(t).reduce((function(e,t){var r,n=t[0],i=t[1];return Object.assign({},e,((r={})[n]=i.versions[0].name,r))}),{})})),a=r[0],c=r[1],d=Object.values(t).some((function(e){return e.versions.length>1})),{allDocsData:t,versioningEnabled:d,searchVersions:a,setSearchVersion:function(e,t){return c((function(r){var n;return Object.assign({},r,((n={})[e]=t,n))}))}}),T=(0,R.K)(),Q=T[0],C=T[1],k={items:[],query:null,totalResults:null,totalPages:null,lastPage:null,hasMore:null,loading:null},I=(0,n.useReducer)((function(e,t){switch(t.type){case"reset":return k;case"loading":return Object.assign({},e,{loading:!0});case"update":return Q!==t.value.query?e:Object.assign({},t.value,{items:0===t.value.lastPage?t.value.items:e.items.concat(t.value.items)});case"advance":var r=e.totalPages>e.lastPage+1;return Object.assign({},e,{lastPage:r?e.lastPage+1:e.lastPage,hasMore:r});default:return e}}),k),D=I[0],q=I[1],V=u()(g,b),L=s()(V,N,{hitsPerPage:15,advancedSyntax:!0,disjunctiveFacets:["language","docusaurus_tag"]});L.on("result",(function(e){var t=e.results,r=t.query,n=t.hits,i=t.page,a=t.nbHits,s=t.nbPages;if(""!==r&&Array.isArray(n)){var c=function(e){return e.replace(/algolia-docsearch-suggestion--highlight/g,"search-result-match")},u=n.map((function(e){var t=e.url,r=e._highlightResult.hierarchy,n=e._snippetResult,i=void 0===n?{}:n,a=Object.keys(r).map((function(e){return c(r[e].value)}));return{title:a.pop(),url:A(t),summary:i.content?c(i.content.value)+"...":"",breadcrumbs:a}}));q({type:"update",value:{items:u,query:r,totalResults:a,totalPages:s,lastPage:i,hasMore:s>i+1,loading:!1}})}else q({type:"reset"})}));var B=(0,n.useState)(null),z=B[0],M=B[1],J=(0,n.useRef)(0),W=(0,n.useRef)(o.Z.canUseIntersectionObserver&&new IntersectionObserver((function(e){var t=e[0],r=t.isIntersecting,n=t.boundingClientRect.y;r&&J.current>n&&q({type:"advance"}),J.current=n}),{threshold:1})),U=function(){return Q?(0,j.I)({id:"theme.SearchPage.existingResultsTitle",message:'Search results for "{query}"',description:"The search page title for non-empty query"},{query:Q}):(0,j.I)({id:"theme.SearchPage.emptyResultsTitle",message:"Search the documentation",description:"The search page title for empty query"})},Z=(0,F.zX)((function(e){void 0===e&&(e=0),L.addDisjunctiveFacetRefinement("docusaurus_tag","default"),L.addDisjunctiveFacetRefinement("language",p),Object.entries(S.searchVersions).forEach((function(e){var t=e[0],r=e[1];L.addDisjunctiveFacetRefinement("docusaurus_tag","docs-"+t+"-"+r)})),L.setQuery(Q).setPage(e).search()}));return(0,n.useEffect)((function(){if(z){var e=W.current;return e?(e.observe(z),function(){return e.unobserve(z)}):function(){return!0}}}),[z]),(0,n.useEffect)((function(){q({type:"reset"}),Q&&(q({type:"loading"}),setTimeout((function(){Z()}),300))}),[Q,S.searchVersions,Z]),(0,n.useEffect)((function(){D.lastPage&&0!==D.lastPage&&Z(D.lastPage)}),[Z,D.lastPage]),n.createElement(x.Z,null,n.createElement(h.Z,null,n.createElement("title",null,(0,P.p)(U())),n.createElement("meta",{property:"robots",content:"noindex, follow"})),n.createElement("div",{className:"container margin-vert--lg"},n.createElement("h1",null,U()),n.createElement("form",{className:"row",onSubmit:function(e){return 
e.preventDefault()}},n.createElement("div",{className:(0,i.Z)("col",O.searchQueryColumn,{"col--9":S.versioningEnabled,"col--12":!S.versioningEnabled})},n.createElement("input",{type:"search",name:"q",className:O.searchQueryInput,placeholder:(0,j.I)({id:"theme.SearchPage.inputPlaceholder",message:"Type your search here",description:"The placeholder for search page input"}),"aria-label":(0,j.I)({id:"theme.SearchPage.inputLabel",message:"Search",description:"The ARIA label for search page input"}),onChange:function(e){return C(e.target.value)},value:Q,autoComplete:"off",autoFocus:!0})),S.versioningEnabled&&n.createElement(w,{docsSearchVersionsHelpers:S})),n.createElement("div",{className:"row"},n.createElement("div",{className:(0,i.Z)("col","col--8",O.searchResultsColumn)},!!D.totalResults&&H(D.totalResults)),n.createElement("div",{className:(0,i.Z)("col","col--4","text--right",O.searchLogoColumn)},n.createElement("a",{target:"_blank",rel:"noopener noreferrer",href:"https://www.algolia.com/","aria-label":(0,j.I)({id:"theme.SearchPage.algoliaLabel",message:"Search by Algolia",description:"The ARIA label for Algolia mention"})},n.createElement("svg",{viewBox:"0 0 168 24",className:O.algoliaLogo},n.createElement("g",{fill:"none"},n.createElement("path",{className:O.algoliaLogoPathFill,d:"M120.925 18.804c-4.386.02-4.386-3.54-4.386-4.106l-.007-13.336 2.675-.424v13.254c0 .322 0 2.358 1.718 2.364v2.248zm-10.846-2.18c.821 0 1.43-.047 1.855-.129v-2.719a6.334 6.334 0 0 0-1.574-.199 5.7 5.7 0 0 0-.897.069 2.699 2.699 0 0 0-.814.24c-.24.116-.439.28-.582.491-.15.212-.219.335-.219.656 0 .628.219.991.616 1.23s.938.362 1.615.362zm-.233-9.7c.883 0 1.629.109 2.231.328.602.218 1.088.525 1.444.915.363.396.609.922.76 1.483.157.56.232 1.175.232 1.85v6.874a32.5 32.5 0 0 1-1.868.314c-.834.123-1.772.185-2.813.185-.69 0-1.327-.069-1.895-.198a4.001 4.001 0 0 1-1.471-.636 3.085 3.085 0 0 1-.951-1.134c-.226-.465-.343-1.12-.343-1.803 0-.656.13-1.073.384-1.525a3.24 3.24 0 0 1 1.047-1.106c.445-.287.95-.492 1.532-.615a8.8 8.8 0 0 1 1.82-.185 8.404 8.404 0 0 1 1.972.24v-.438c0-.307-.035-.6-.11-.874a1.88 1.88 0 0 0-.384-.73 1.784 1.784 0 0 0-.724-.493 3.164 3.164 0 0 0-1.143-.205c-.616 0-1.177.075-1.69.164a7.735 7.735 0 0 0-1.26.307l-.321-2.192c.335-.117.834-.233 1.478-.349a10.98 10.98 0 0 1 2.073-.178zm52.842 9.626c.822 0 1.43-.048 1.854-.13V13.7a6.347 6.347 0 0 0-1.574-.199c-.294 0-.595.021-.896.069a2.7 2.7 0 0 0-.814.24 1.46 1.46 0 0 0-.582.491c-.15.212-.218.335-.218.656 0 .628.218.991.615 1.23.404.245.938.362 1.615.362zm-.226-9.694c.883 0 1.629.108 2.231.327.602.219 1.088.526 1.444.915.355.39.609.923.759 1.483a6.8 6.8 0 0 1 .233 1.852v6.873c-.41.088-1.034.19-1.868.314-.834.123-1.772.184-2.813.184-.69 0-1.327-.068-1.895-.198a4.001 4.001 0 0 1-1.471-.635 3.085 3.085 0 0 1-.951-1.134c-.226-.465-.343-1.12-.343-1.804 0-.656.13-1.073.384-1.524.26-.45.608-.82 1.047-1.107.445-.286.95-.491 1.532-.614a8.803 8.803 0 0 1 2.751-.13c.329.034.671.096 1.04.185v-.437a3.3 3.3 0 0 0-.109-.875 1.873 1.873 0 0 0-.384-.731 1.784 1.784 0 0 0-.724-.492 3.165 3.165 0 0 0-1.143-.205c-.616 0-1.177.075-1.69.164a7.75 7.75 0 0 0-1.26.307l-.321-2.193c.335-.116.834-.232 1.478-.348a11.633 11.633 0 0 1 2.073-.177zm-8.034-1.271a1.626 1.626 0 0 1-1.628-1.62c0-.895.725-1.62 1.628-1.62.904 0 1.63.725 1.63 1.62 0 .895-.733 1.62-1.63 1.62zm1.348 13.22h-2.689V7.27l2.69-.423v11.956zm-4.714 0c-4.386.02-4.386-3.54-4.386-4.107l-.008-13.336 2.676-.424v13.254c0 .322 0 2.358 1.718 
2.364v2.248zm-8.698-5.903c0-1.156-.253-2.119-.746-2.788-.493-.677-1.183-1.01-2.067-1.01-.882 0-1.574.333-2.065 1.01-.493.676-.733 1.632-.733 2.788 0 1.168.246 1.953.74 2.63.492.683 1.183 1.018 2.066 1.018.882 0 1.574-.342 2.067-1.019.492-.683.738-1.46.738-2.63zm2.737-.007c0 .902-.13 1.584-.397 2.33a5.52 5.52 0 0 1-1.128 1.906 4.986 4.986 0 0 1-1.752 1.223c-.685.286-1.739.45-2.265.45-.528-.006-1.574-.157-2.252-.45a5.096 5.096 0 0 1-1.744-1.223c-.487-.527-.863-1.162-1.137-1.906a6.345 6.345 0 0 1-.41-2.33c0-.902.123-1.77.397-2.508a5.554 5.554 0 0 1 1.15-1.892 5.133 5.133 0 0 1 1.75-1.216c.679-.287 1.425-.423 2.232-.423.808 0 1.553.142 2.237.423a4.88 4.88 0 0 1 1.753 1.216 5.644 5.644 0 0 1 1.135 1.892c.287.738.431 1.606.431 2.508zm-20.138 0c0 1.12.246 2.363.738 2.882.493.52 1.13.78 1.91.78.424 0 .828-.062 1.204-.178.377-.116.677-.253.917-.417V9.33a10.476 10.476 0 0 0-1.766-.226c-.971-.028-1.71.37-2.23 1.004-.513.636-.773 1.75-.773 2.788zm7.438 5.274c0 1.824-.466 3.156-1.404 4.004-.936.846-2.367 1.27-4.296 1.27-.705 0-2.17-.137-3.34-.396l.431-2.118c.98.205 2.272.26 2.95.26 1.074 0 1.84-.219 2.299-.656.459-.437.684-1.086.684-1.948v-.437a8.07 8.07 0 0 1-1.047.397c-.43.13-.93.198-1.492.198-.739 0-1.41-.116-2.018-.349a4.206 4.206 0 0 1-1.567-1.025c-.431-.45-.774-1.017-1.013-1.694-.24-.677-.363-1.885-.363-2.773 0-.834.13-1.88.384-2.577.26-.696.629-1.298 1.129-1.796.493-.498 1.095-.881 1.8-1.162a6.605 6.605 0 0 1 2.428-.457c.87 0 1.67.109 2.45.24.78.129 1.444.265 1.985.415V18.17zM6.972 6.677v1.627c-.712-.446-1.52-.67-2.425-.67-.585 0-1.045.13-1.38.391a1.24 1.24 0 0 0-.502 1.03c0 .425.164.765.494 1.02.33.256.835.532 1.516.83.447.192.795.356 1.045.495.25.138.537.332.862.582.324.25.563.548.718.894.154.345.23.741.23 1.188 0 .947-.334 1.691-1.004 2.234-.67.542-1.537.814-2.601.814-1.18 0-2.16-.229-2.936-.686v-1.708c.84.628 1.814.942 2.92.942.585 0 1.048-.136 1.388-.407.34-.271.51-.646.51-1.125 0-.287-.1-.55-.302-.79-.203-.24-.42-.42-.655-.542-.234-.123-.585-.29-1.053-.503a61.27 61.27 0 0 1-.582-.271 13.67 13.67 0 0 1-.55-.287 4.275 4.275 0 0 1-.567-.351 6.92 6.92 0 0 1-.455-.4c-.18-.17-.31-.34-.39-.51-.08-.17-.155-.37-.224-.598a2.553 2.553 0 0 1-.104-.742c0-.915.333-1.638.998-2.17.664-.532 1.523-.798 2.576-.798.968 0 1.793.17 2.473.51zm7.468 5.696v-.287c-.022-.607-.187-1.088-.495-1.444-.309-.357-.75-.535-1.324-.535-.532 0-.99.194-1.373.583-.382.388-.622.949-.717 1.683h3.909zm1.005 2.792v1.404c-.596.34-1.383.51-2.362.51-1.255 0-2.255-.377-3-1.132-.744-.755-1.116-1.744-1.116-2.968 0-1.297.34-2.316 1.021-3.055.68-.74 1.548-1.11 2.6-1.11 1.033 0 1.852.323 2.458.966.606.644.91 1.572.91 2.784 0 .33-.033.676-.096 1.038h-5.314c.107.702.405 1.239.894 1.611.49.372 1.106.558 1.85.558.862 0 1.58-.202 2.155-.606zm6.605-1.77h-1.212c-.596 0-1.045.116-1.349.35-.303.234-.454.532-.454.894 0 .372.117.664.35.877.235.213.575.32 1.022.32.51 0 .912-.142 1.204-.424.293-.281.44-.651.44-1.108v-.91zm-4.068-2.554V9.325c.627-.361 1.457-.542 2.489-.542 2.116 0 3.175 1.026 3.175 3.08V17h-1.548v-.957c-.415.68-1.143 1.02-2.186 1.02-.766 0-1.38-.22-1.843-.661-.462-.442-.694-1.003-.694-1.684 0-.776.293-1.38.878-1.81.585-.431 1.404-.647 2.457-.647h1.34V11.8c0-.554-.133-.971-.399-1.253-.266-.282-.707-.423-1.324-.423a4.07 4.07 0 0 0-2.345.718zm9.333-1.93v1.42c.394-1 1.101-1.5 2.123-1.5.148 0 .313.016.494.048v1.531a1.885 1.885 0 0 0-.75-.143c-.542 0-.989.24-1.34.718-.351.479-.527 1.048-.527 1.707V17h-1.563V8.91h1.563zm5.01 4.084c.022.82.272 1.492.75 2.019.479.526 1.15.79 2.01.79.639 0 1.235-.176 
1.788-.527v1.404c-.521.319-1.186.479-1.995.479-1.265 0-2.276-.4-3.031-1.197-.755-.798-1.133-1.792-1.133-2.984 0-1.16.38-2.151 1.14-2.975.761-.825 1.79-1.237 3.088-1.237.702 0 1.346.149 1.93.447v1.436a3.242 3.242 0 0 0-1.77-.495c-.84 0-1.513.266-2.019.798-.505.532-.758 1.213-.758 2.042zM40.24 5.72v4.579c.458-1 1.293-1.5 2.505-1.5.787 0 1.42.245 1.899.734.479.49.718 1.17.718 2.042V17h-1.564v-5.106c0-.553-.14-.98-.422-1.284-.282-.303-.652-.455-1.11-.455-.531 0-1.002.202-1.411.606-.41.405-.615 1.022-.615 1.851V17h-1.563V5.72h1.563zm14.966 10.02c.596 0 1.096-.253 1.5-.758.404-.506.606-1.157.606-1.955 0-.915-.202-1.62-.606-2.114-.404-.495-.92-.742-1.548-.742-.553 0-1.05.224-1.491.67-.442.447-.662 1.133-.662 2.058 0 .958.212 1.67.638 2.138.425.469.946.703 1.563.703zM53.004 5.72v4.42c.574-.894 1.388-1.341 2.44-1.341 1.022 0 1.857.383 2.506 1.149.649.766.973 1.781.973 3.047 0 1.138-.309 2.109-.925 2.912-.617.803-1.463 1.205-2.537 1.205-1.075 0-1.894-.447-2.457-1.34V17h-1.58V5.72h1.58zm9.908 11.104l-3.223-7.913h1.739l1.005 2.632 1.26 3.415c.096-.32.48-1.458 1.15-3.415l.909-2.632h1.66l-2.92 7.866c-.777 2.074-1.963 3.11-3.559 3.11a2.92 2.92 0 0 1-.734-.079v-1.34c.17.042.351.064.543.064 1.032 0 1.755-.57 2.17-1.708z"}),n.createElement("path",{fill:"#5468FF",d:"M78.988.938h16.594a2.968 2.968 0 0 1 2.966 2.966V20.5a2.967 2.967 0 0 1-2.966 2.964H78.988a2.967 2.967 0 0 1-2.966-2.964V3.897A2.961 2.961 0 0 1 78.988.938z"}),n.createElement("path",{fill:"white",d:"M89.632 5.967v-.772a.978.978 0 0 0-.978-.977h-2.28a.978.978 0 0 0-.978.977v.793c0 .088.082.15.171.13a7.127 7.127 0 0 1 1.984-.28c.65 0 1.295.088 1.917.259.082.02.164-.04.164-.13m-6.248 1.01l-.39-.389a.977.977 0 0 0-1.382 0l-.465.465a.973.973 0 0 0 0 1.38l.383.383c.062.061.15.047.205-.014.226-.307.472-.601.746-.874.281-.28.568-.526.883-.751.068-.042.075-.137.02-.2m4.16 2.453v3.341c0 .096.104.165.192.117l2.97-1.537c.068-.034.089-.117.055-.184a3.695 3.695 0 0 0-3.08-1.866c-.068 0-.136.054-.136.13m0 8.048a4.489 4.489 0 0 1-4.49-4.482 4.488 4.488 0 0 1 4.49-4.482 4.488 4.488 0 0 1 4.489 4.482 4.484 4.484 0 0 1-4.49 4.482m0-10.85a6.363 6.363 0 1 0 0 12.729 6.37 6.37 0 0 0 6.372-6.368 6.358 6.358 0 0 0-6.371-6.36"})))))),D.items.length>0?n.createElement("main",null,D.items.map((function(e,t){var r=e.title,a=e.url,s=e.summary,c=e.breadcrumbs;return n.createElement("article",{key:t,className:O.searchResultItem},n.createElement("h2",{className:O.searchResultItemHeading},n.createElement(f.Z,{to:a,dangerouslySetInnerHTML:{__html:r}})),c.length>0&&n.createElement("nav",{"aria-label":"breadcrumbs"},n.createElement("ul",{className:(0,i.Z)("breadcrumbs",O.searchResultItemPath)},c.map((function(e,t){return n.createElement("li",{key:t,className:"breadcrumbs__item",dangerouslySetInnerHTML:{__html:e}})})))),s&&n.createElement("p",{className:O.searchResultItemSummary,dangerouslySetInnerHTML:{__html:s}}))}))):[Q&&!D.loading&&n.createElement("p",{key:"no-results"},n.createElement(j.Z,{id:"theme.SearchPage.noResultsText",description:"The paragraph for empty search result"},"No results were found")),!!D.loading&&n.createElement("div",{key:"spinner",className:O.loadingSpinner})],D.hasMore&&n.createElement("div",{className:O.loader,ref:M},n.createElement(j.Z,{id:"theme.SearchPage.fetchingNewResults",description:"The paragraph for fetching new search results"},"Fetching new results..."))))}function A(){return n.createElement(b.FG,{className:"search-page-wrapper"},n.createElement(N,null))}}}]); \ No newline at end of file diff --git a/assets/js/1a4e3797.b5f1ebc2.js.LICENSE.txt 
b/assets/js/1a4e3797.c063a301.js.LICENSE.txt similarity index 100% rename from assets/js/1a4e3797.b5f1ebc2.js.LICENSE.txt rename to assets/js/1a4e3797.c063a301.js.LICENSE.txt diff --git a/assets/js/runtime~main.d42f0a3c.js b/assets/js/runtime~main.949cc1c3.js similarity index 99% rename from assets/js/runtime~main.d42f0a3c.js rename to assets/js/runtime~main.949cc1c3.js index 21bf7c3b3d2..c5fe12932d3 100644 --- a/assets/js/runtime~main.d42f0a3c.js +++ b/assets/js/runtime~main.949cc1c3.js @@ -1 +1 @@ -(()=>{"use strict";var e,b,f,c,a,d={},t={};function r(e){var b=t[e];if(void 0!==b)return b.exports;var f=t[e]={id:e,loaded:!1,exports:{}};return d[e].call(f.exports,f,f.exports,r),f.loaded=!0,f.exports}r.m=d,r.c=t,e=[],r.O=(b,f,c,a)=>{if(!f){var d=1/0;for(i=0;i=a)&&Object.keys(r.O).every((e=>r.O[e](f[o])))?f.splice(o--,1):(t=!1,a0&&e[i-1][2]>a;i--)e[i]=e[i-1];e[i]=[f,c,a]},r.n=e=>{var b=e&&e.__esModule?()=>e.default:()=>e;return r.d(b,{a:b}),b},f=Object.getPrototypeOf?e=>Object.getPrototypeOf(e):e=>e.__proto__,r.t=function(e,c){if(1&c&&(e=this(e)),8&c)return e;if("object"==typeof e&&e){if(4&c&&e.__esModule)return e;if(16&c&&"function"==typeof e.then)return e}var a=Object.create(null);r.r(a);var d={};b=b||[null,f({}),f([]),f(f)];for(var t=2&c&&e;"object"==typeof t&&!~b.indexOf(t);t=f(t))Object.getOwnPropertyNames(t).forEach((b=>d[b]=()=>e[b]));return d.default=()=>e,r.d(a,d),a},r.d=(e,b)=>{for(var f in b)r.o(b,f)&&!r.o(e,f)&&Object.defineProperty(e,f,{enumerable:!0,get:b[f]})},r.f={},r.e=e=>Promise.all(Object.keys(r.f).reduce(((b,f)=>(r.f[f](e,b),b)),[])),r.u=e=>"assets/js/"+({19:"906e49ec",21:"f5e3827c",71:"ab971afc",99:"49c587c2",172:"21730a31",224:"a93c3367",250:"23d30d6b",291:"5da0ca7c",467:"a5bcb3f1",513:"9216ce7b",596:"b564874a",803:"c63e6bd5",899:"54d8bddc",1116:"0109100f",1365:"66ffc608",1387:"be2f7876",1523:"ef01e1dd",1647:"7981506b",1652:"902d2d1d",1664:"9ecb4d01",1671:"eee57cd1",1940:"a971b35f",2044:"0149cacd",2097:"40d51a61",2196:"e1e17943",2312:"ac4bed99",2427:"fa423b6e",2638:"935116ff",2656:"80631bfd",2905:"60b67194",2916:"7bb83d6b",2989:"9ef1e345",3044:"3bedcc76",3102:"7174660f",3145:"d4d22ad8",3191:"d0a0235c",3197:"ec28562d",3216:"92b043a3",3283:"0b092b5c",3326:"5e94ba2e",3397:"b7343c9b",3398:"1e070b7c",3650:"020a22ba",3667:"6c1d24e1",3914:"7b7fec6b",3919:"c81517c7",4125:"feba251b",4151:"5017cef7",4195:"3c93ed7e",4244:"21cfb395",4328:"38680a69",4504:"f28093e3",4513:"ffa15017",4585:"f2dc10f7",4631:"9654b394",4874:"a3db1255",4882:"4482beb5",4929:"75600d79",5061:"e54b1e77",5129:"d87811ce",5313:"aa4fa4fb",5352:"622596e9",5383:"f2a3bf8e",5512:"631dea17",5714:"d16a2606",5909:"66e9ea68",5920:"d613e1f8",5981:"85e709dc",6027:"391378fa",6151:"39579801",6386:"a8ef1ed2",6443:"5b4a63ac",6517:"31c3e3d7",6537:"0c99e969",6553:"efc338fe",6695:"111e23e1",6734:"85954f48",6799:"d1284c82",6822:"f49551b9",6824:"cc519fb4",6968:"9e6b2559",6971:"f38fa80d",6978:"7d9c461e",7078:"ff96de6e",7091:"bd1a8573",7092:"30a13577",7108:"365726b0",7120:"7e91f3e1",7155:"bb4987bb",7162:"97ce6959",7318:"b7e69c77",7451:"d8c5fc94",7485:"81f033b8",7500:"bd0e022f",7874:"32d13eb8",8023:"43de05a8",8135:"7d280bdc",8145:"93015a15",8188:"39bddd84",8210:"c565b8da",8313:"d163ea32",8328:"fa8af309",8407:"b4473d93",8482:"f983631a",8638:"2b8a5969",8671:"1b93ff3d",8809:"aa01ca6a",8882:"6fdd5bc4",8906:"6d2c1101",9028:"de8a7b18",9119:"debbc0e2",9225:"e5523a26",9235:"407bcc70",9365:"541bc80d",9444:"0ffc31bc",9542:"cf4d312e",9550:"36edbaa2",9615:"9db3bdac",9817:"14eb3368",9836:"13af1bdb",9907:"bfd6b54b",9947:"7097fbbc",10109:"70cd875c",1022
8:"cdebfca4",10270:"a7da438d",10436:"05fa5837",10497:"6773ef05",10650:"e8b1baf4",10918:"caf7e36c",10987:"26e1978a",11174:"ba17e21b",11203:"c98c0daa",11311:"2913cae6",11321:"a15a0d8e",11326:"65031edd",11342:"4d2bb41f",11398:"ba5e62dd",11656:"885bf670",11875:"3d446fd0",12228:"b63d08bc",12442:"1fb2401b",12549:"31eb4af1",12555:"885da4ef",12558:"58ac1d26",12560:"a6d8b730",12567:"5aabd190",13253:"b48b6b77",13280:"f65f22ef",13351:"6d905346",13460:"70808dd0",13588:"55920b47",13595:"3be6e3bd",13617:"c642f758",13718:"1ac29206",13896:"8bd7a1a3",13924:"bd0b26a5",13979:"911bbfa4",13995:"0b0df062",14061:"3105dae0",14088:"03902e07",14095:"31585cea",14143:"f71ac404",14299:"af9acd56",14369:"c099652b",14386:"83cbebfb",14396:"d3fe7aed",14549:"79db63f1",14610:"40a26966",14670:"9dd89af2",14713:"763f2b13",14840:"39ed38cd",14908:"f6310963",15196:"4338ab08",15497:"f9c66408",15658:"a97b7821",15888:"a466ebec",15970:"271906a0",15994:"26134010",16022:"21edad34",16038:"62127933",16058:"891a9b8f",16071:"6ecc8728",16153:"7d0b3c01",16161:"ff3504dd",16379:"66fe7120",16528:"fdbb9241",16635:"a9347149",16672:"e1bbb98e",16685:"46551803",16876:"df878b79",16973:"251d94d8",17275:"fe34d639",17283:"45e19d44",17457:"f77885f5",17511:"528fc62e",17726:"c003d460",17757:"00c88225",17785:"3e697946",17883:"b530e783",17887:"996d98f3",17989:"9a71d807",18025:"7f9f61f2",18050:"abc9098e",18084:"29dde6c8",18100:"d19aead5",18143:"2730c631",18156:"5bebce7d",18186:"074a0372",18318:"4e07c49f",18559:"18ccacf6",18734:"07645771",18746:"8282a203",18883:"31793acc",18892:"4d58aa3f",18928:"6e11cc87",18998:"bc4716d5",19177:"584b298a",19204:"79617745",19212:"86c7426e",19305:"8a064b88",19408:"b7ec56b9",19427:"126e88af",19493:"6f49328c",19504:"83fe529b",19531:"974829b4",19625:"84eafbb4",19671:"3729e987",19709:"c5593510",19733:"2edcde3e",19806:"dc7ad1ac",19832:"17f9c41b",19876:"760eed0f",19939:"8781c463",19962:"6bf1075e",20040:"b73dbab9",20061:"ef0f9e32",20169:"c9664647",20303:"9b7bae35",20602:"1d014bb1",20689:"cbbe4dac",20707:"120dd2fd",20764:"6d0dfc8d",20911:"be3ddcfb",20917:"e01a2739",20983:"4d0df69e",21015:"653c19c7",21134:"ae5e6a48",21143:"6a89e0dd",21190:"95169675",21207:"0e728709",21228:"0feddf78",21379:"e22055a4",21643:"f18d5795",21688:"aa282e34",21823:"c0ef9e49",21983:"7f2bec55",22030:"43a49a39",22087:"af199e5b",22129:"06ceb223",22163:"9bc49845",22238:"1ca03b4b",22456:"0b0f030b",22523:"ef2624d8",22540:"0e8c522c",22583:"66b5d69c",22604:"a8f480dd",22777:"4995f874",22898:"6775be7c",22940:"fe423ebe",22997:"9e91305d",23064:"500c9b63",23228:"ede8882f",23231:"909cadf6",23252:"8113fd14",23310:"c1f9ba1e",23320:"89c49d10",23343:"57b7b037",23435:"23896e06",23522:"2c9f485d",23536:"edbf4496",23545:"2457e7c2",23663:"c7599d12",23714:"332c497c",23804:"99961c3d",23898:"9a02f8a7",24058:"f1d5089f",24066:"92ce2bd2",24101:"15d86f95",24109:"0c48ef63",24158:"b7738a69",24266:"a8565f1f",24282:"395508da",24401:"fbfa5dfc",24467:"7fc9e2ed",24501:"e6de5f28",24946:"fd378320",24986:"7d30361b",25079:"10fd89ee",25251:"6c0ce6d0",25283:"8b2f7dd6",25427:"2b1e7b76",25433:"b9b67b35",25451:"59740e69",25513:"f265d6a5",25547:"06673fe1",25579:"75071a94",25833:"9427c683",25898:"bd61737f",26067:"cc7818bb",26084:"85860bdc",26086:"21996883",26201:"a958884d",26291:"3bed40a0",26311:"f35b8c8b",26521:"ec205789",26654:"08985b86",26686:"5667bf50",26695:"3be4d1c2",26858:"ceb6bd62",27109:"bb1d1845",27167:"6d03c6cb",27270:"e022cd8b",27276:"ec2b56b1",27303:"ec11103c",27324:"552bb95e",27554:"92307374",27704:"0f6a2fca",27918:"17896441",27982:"865c04d0",28085:"56405cb8",28134:"916fb87b",28139:"95771e39",28219:
"50e78136",28261:"9ce40ebc",28367:"7f6814ed",28475:"fc338eb2",28476:"2f4d1edb",28514:"db082e36",28516:"8af04d56",28623:"018243f8",28699:"4a0c84c3",28800:"f6ca5dc0",28880:"2f74be58",28882:"8d83f575",28906:"48c7b3a1",28922:"3417a016",29014:"da9049b8",29025:"8f32218b",29050:"44573fa4",29066:"e5977951",29131:"a670ed1c",29191:"6fe0ccd0",29272:"f5da8015",29514:"1be78505",29520:"4ef1f024",29698:"949a554a",29717:"cff5e41a",29782:"16a52e74",29818:"9e530f0a",29831:"3291c538",29864:"b604d5b2",29871:"729f5dd4",29886:"2c86cbaa",29899:"14e9211b",29978:"3d8cf439",29980:"c32e37fe",30062:"26db341a",30216:"04829abe",30295:"ff9e51b7",30419:"26bc6c41",30433:"1dc72111",30454:"019a0579",30470:"73c32a6c",30589:"10b7b761",30677:"4f643cbc",30678:"8dc6ea19",30800:"534db397",30820:"845c1fa7",30834:"8900c226",30837:"6b685afe",30865:"0c9e4d11",30885:"8d5884d6",30979:"683d9354",31009:"c8b95361",31013:"65360910",31023:"d1036fb2",31044:"9b98b06f",31050:"23c664e3",31068:"928e95c7",31089:"5e56d481",31116:"abe8f5f4",31152:"99496549",31187:"8793e9e6",31293:"cc976a0e",31294:"212ceae2",31441:"347c8874",31471:"2d7d2510",31512:"8a75859c",31516:"ee2f6eec",31570:"6c6d8053",31671:"ce861b37",31824:"9f850ab3",32224:"87719f86",32319:"570c64c0",32410:"09e7c68c",32446:"0eb0d7dd",32491:"6167ec10",32567:"5d8d28d6",32652:"f8c45ac9",32689:"a5461ca4",32839:"46d1dc13",32872:"f1c17b7f",32892:"e9268009",32914:"0ef4df13",33023:"9fcb81d2",33076:"a9776c25",33083:"95f7392c",33131:"ad516382",33138:"dd0c884c",33178:"cab767d9",33181:"fa17a3e5",33223:"5af48372",33260:"3deda206",33261:"82dec33c",33329:"9ebfae5b",33407:"765a551b",33725:"586fa356",33889:"5b659de8",33920:"1943e34c",34020:"9b00304e",34077:"3db5eb91",34079:"273b8e1f",34153:"4a797306",34206:"23a156eb",34293:"ff318c38",34294:"f8338e5f",34323:"e48c3912",34407:"c4a71dd9",34458:"f0f4a691",34460:"5c8ad115",34475:"5bea2473",34552:"592e779d",34590:"de061f48",34647:"c93364c6",34656:"813ebe83",34748:"71408d45",34766:"116bb944",34784:"243071a0",34792:"5c392fa5",34800:"99a27b29",34882:"16046cb7",34943:"2c06af7c",34979:"9c12417e",35038:"a2bcabb3",35069:"b269633b",35214:"3576f003",35216:"5334bf47",35387:"907c8c6a",35466:"1cf42300",35577:"09e24d74",35614:"032d72a0",35647:"a2e876c5",35768:"90bfd346",35809:"c30c381e",35874:"df463adb",35879:"41f4b8cc",36009:"b3a22aab",36312:"ade0010f",36495:"b3fdbb6a",36511:"f3d03ec8",36673:"8d4185e0",36773:"91647079",36933:"eb87086a",36935:"63849fd3",36983:"7c43c98e",37021:"6d92a4b5",37055:"ac6b62e9",37058:"6e357be7",37208:"c3a94ed1",37257:"4f4166ed",37316:"229edc10",37362:"6dfd1bfa",37426:"3d7b9a1b",37894:"7779798d",37977:"febe4bf0",38056:"5bb043f7",38104:"b34a9ee0",38230:"80b5c97d",38333:"8e018081",38368:"11414e0b",38450:"a77f15f9",38469:"66cd2d70",38504:"0df0bc38",38591:"e80537c2",38679:"5eece5ec",38768:"38cd2ebb",38792:"2f6d8a46",38819:"0cb88ec0",38873:"179d37d3",39033:"7aabbdee",39177:"3ae213b8",39209:"f55bfda4",39252:"d179e89e",39275:"c0ba661c",39325:"a072c73d",39368:"b7e5badb",39605:"22f9ccca",39645:"e6e9a3aa",39726:"f2c01e3a",39820:"8277cea1",39853:"73c3a5ed",39941:"f8904416",39972:"fa8dc2e8",39978:"5b34f9ea",40097:"f49b74d5",40158:"2d7caf96",40176:"0260d23f",40342:"7ad00ade",40365:"dd8797f2",40665:"c2ef5f99",40830:"eaaaa138",40930:"51cdab7b",40936:"f5d7fbaf",40986:"0cbb6061",41100:"ba1c1ac8",41329:"9444e723",41388:"7945275b",41537:"e6241e03",41750:"78718572",41840:"5b7c576e",41863:"56181a0b",41954:"85db7b61",41958:"e4176d9e",41998:"81192af7",42051:"a8987ce3",42054:"10c43c6e",42059:"bfe6bb1f",42169:"672c9486",42187:"4b66f540",42226:"3ebe5c8a",42263:"909a3395",42288:"2
4647619",42289:"48e254a2",42371:"ff4be603",42436:"4447d079",42465:"c14e35a5",42551:"a2ff0b9e",42609:"ff7c02a9",42620:"b6cfa9b7",42690:"4b481283",42721:"8e0282b7",42728:"910f748a",42757:"699b0913",42930:"608d6ba6",43037:"e2e305b4",43047:"ea09532f",43072:"dc98fcfb",43294:"f6d93f4d",43529:"87186dce",43554:"39befbbe",43635:"0c94161c",43645:"d7039a99",43697:"4b718ce0",43793:"0682e49e",43849:"8b15c55c",43919:"bfada16a",43966:"b99800de",44023:"0e0b668d",44029:"4b0528ed",44118:"5d86b3d6",44152:"f193e9f7",44174:"5b1c4ba7",44393:"dfca4314",44523:"3d99ef33",44765:"0a54392a",44797:"7e328509",44860:"55a23a94",44907:"0f014490",45057:"46f76bef",45091:"dacae080",45114:"593ffe68",45279:"01f7e848",45287:"8b1145e2",45571:"a258685b",45583:"3476fe8e",45593:"d02f7bc4",45732:"f2abaee2",45786:"239111c7",45809:"83d061ac",45878:"9ee45729",46023:"5216f17c",46045:"e0eae934",46074:"fff3ab69",46218:"8d0344ba",46284:"a5b5d55c",46328:"2fc02015",46447:"e7478c24",46838:"46dcda29",46901:"33a34e3b",47062:"cbbdf9a2",47068:"ba73f26c",47082:"cc1f5ce8",47117:"8b6445a0",47276:"524b67e3",47287:"cf1567e8",47463:"e5c3dfde",47568:"3059ed75",47582:"9ee4ebe9",47655:"ee799351",47708:"3ab425d2",47838:"2f0ee63c",47975:"497aa321",47986:"9a7b56f5",48031:"38bd3ddb",48150:"6f6b3e89",48218:"c81622cc",48320:"bf0d24cf",48426:"3ddb8349",48840:"abfd17f9",49096:"70f3cfb0",49169:"d1b82434",49241:"f99bfa77",49270:"8eed67ba",49874:"98a6ff5a",50017:"ab2e7268",50052:"8ac39bbe",50145:"4d028f11",50153:"e51da90c",50193:"486e741e",50240:"7b4c719b",50337:"acb04c32",50362:"7ce5ebd9",50375:"e86c0d05",50437:"9f305eae",50472:"40a0c599",50525:"aba6a826",50773:"7bcf009a",50849:"d7e1d518",50999:"6f93a078",51555:"e91074f3",51574:"12e76d03",51593:"6f219482",51605:"ea82a261",51625:"3b5ffa57",51706:"af6e989f",51768:"e6fe050f",51830:"42a4a45b",51840:"b63fdeeb",51945:"86a7da57",52094:"22a76d89",52126:"08ba51c1",52251:"92bceb62",52286:"79bae4c5",52491:"fcb00301",52499:"3b1e54e9",52573:"2006be57",52586:"a18114c4",52593:"28599d52",52715:"c04dcf0d",52789:"0e46f7bf",52870:"f888d9d8",53237:"1df93b7f",53243:"54230287",53371:"c55f973e",53442:"6cd64148",53675:"3ca132b1",53823:"f20f879f",54125:"b684abf7",54133:"6afbfa44",54178:"1632abda",54210:"4c8d1cae",54250:"b3c952b5",54265:"d6f7d5e2",54363:"fa5bdf0c",54382:"4bae0029",54397:"3034400c",54487:"612ebb8a",54513:"cca83a59",54591:"e0668c88",54756:"130a23fd",54778:"fd67079f",54786:"ed07f994",54794:"dd8be3b2",54855:"32f0f819",55043:"a463ff81",55216:"5560d84e",55239:"661e4fa4",55273:"08472b2d",55335:"746f419e",55478:"1710d498",55552:"8e23b856",55693:"7ec28fd9",55726:"7f536709",55745:"2e18dbc8",55821:"676a3180",55925:"407fa3a0",55962:"f2d325f1",56290:"640fe435",56424:"ac4fb807",56513:"e8d36425",56614:"71f8452f",56750:"f1525ef1",56795:"151869e3",56902:"d4a6dda9",57121:"918ae6ff",57126:"7f039048",57242:"522a40f8",57258:"1b4282d0",57293:"fb218ddd",57341:"3ad7b662",57489:"f251ab77",57598:"e4b4615d",57599:"6e586ee3",57699:"d06effa9",57749:"34660ac5",57780:"163044ef",57820:"a3c98c45",58009:"84c320c1",58042:"4893a1cb",58096:"e345afee",58182:"a045168c",58197:"09e9a7df",58234:"6145eda0",58247:"551b313a",58356:"649a76e7",58564:"ea41aad0",58768:"4f9404e5",58818:"cc6053aa",58822:"cf14af90",58824:"bf2622dd",58914:"68709c70",59051:"8938295e",59060:"de11ece8",59181:"010f8398",59191:"af049e12",59241:"07a6f1c2",59248:"8962034b",59336:"902aff6f",59342:"d959d974",59427:"b43aa387",59442:"0cd38f48",59496:"081ed9af",59506:"c2ed794e",59533:"897798e8",59592:"d243562e",59771:"e56a1a2c",59900:"619d2e79",59982:"f929d4df",59992:"918c9b38",6e4:"78f8003c",60185:"7b2e8
34b",60331:"34d5cc00",60434:"05a720dd",60518:"bb341369",60603:"b8677fbf",60682:"1ebc7fe2",60831:"84960677",60868:"16bb304a",61007:"a79d55be",61200:"4c13f84f",61210:"f8bc4080",61249:"f497508e",61271:"e50573ba",61361:"a529f863",61543:"e31a63b7",61643:"a882bd74",61793:"5e52bbeb",62117:"83a26c48",62235:"912fcb5a",62307:"5c77ea5f",62318:"bdd03912",62319:"8e993d66",62474:"686c1ad3",62493:"6312a106",62523:"54d1c079",62547:"bc1c33e4",62655:"8fca97e0",62867:"34d502be",62948:"c38f23a9",62983:"6dffe7c4",63105:"f29affbe",63387:"877a3c1c",63604:"7bc70741",63663:"cb582f54",63777:"92264b81",63801:"010118f9",63808:"86c8f7cd",64e3:"cacfa11d",64032:"d6990b47",64072:"555f2cec",64100:"e8591f69",64122:"07b92fc6",64485:"e715560c",64525:"6694e7e9",64533:"fd0f74aa",64596:"eee9e2f1",64656:"300bd484",64754:"e74888b9",64766:"5a7e5a43",64824:"dfd588b8",64871:"42d0afac",64900:"610e19f0",64967:"b81f3fb0",65132:"637ec626",65501:"8d3be60d",65592:"ef0f3981",65594:"88dde0bb",65641:"90dccef4",65758:"62041344",65965:"21d2296a",66026:"06b7cd3c",66081:"9d336f66",66187:"cf494ba6",66238:"1fb9ab5c",66256:"5f0246ae",66303:"a5560bad",66336:"87cc8f7c",66462:"1d642165",66465:"9c53d859",66597:"e0052e0c",66958:"12b52520",67010:"49763a0b",67061:"d7124adb",67069:"c7b80b67",67132:"15f4efbb",67343:"2d8700b9",67448:"85ac525a",67583:"18caf9ef",67597:"1693c0b8",67638:"a48eac25",67670:"a23744f9",67908:"d29db0e3",67954:"8d96489a",68034:"8858d0ce",68126:"db9653b1",68258:"f301134a",68689:"8751004c",68757:"5335ef4f",68793:"9b89ba00",68823:"c3ca7a6a",68943:"ae5838f0",69111:"1a54bfd0",69125:"a5f4c814",69209:"193f200e",69234:"212137e2",69254:"140e6a69",69628:"9bfbb8bc",69629:"d5d7628b",69647:"607292ce",69843:"7facae8f",69899:"6d933e1d",70081:"41e02281",70130:"16b47049",70178:"06b3b671",70249:"7bd49e6c",70277:"d72ada40",70367:"1fc4ed50",70504:"f53e2381",70543:"ce79b72a",70614:"65306ecf",70706:"21d3c1c7",70964:"06876062",71081:"99c371aa",71160:"dff7b4e8",71169:"5ed92a05",71287:"167f5be9",71476:"c26ab7d5",71516:"dcf1d6e7",71544:"d93a0aad",71698:"ff9d88b6",71789:"fcfa677e",71811:"39afc900",72070:"59dfcfb5",72189:"7f31124b",72331:"66179fb5",72500:"cf945ce5",72613:"084a18af",72638:"1adeac4a",72740:"150a4d14",72887:"2e2a73ec",72952:"d703ca6f",72957:"d9b3adf3",72978:"2e6d047c",73238:"1eec97be",73300:"4efa0483",73338:"aa02927d",73356:"b26f6fa9",73369:"6594bd70",73452:"63b8176f",73537:"c7953305",73618:"12dcfbad",73725:"8f9c5733",73745:"154cbeb4",73766:"af29c71b",73975:"566ea6d5",74019:"ff35d8ff",74061:"a0541488",74091:"67e63bc0",74132:"ba454016",74136:"ef4e0f5d",74139:"38341509",74332:"769f97b7",74362:"71078103",74441:"4c8fc79c",74465:"e6a17fa0",74480:"4bbc58d4",74578:"80ea5ae7",74794:"016f0e16",74839:"8b87f6f5",75070:"5b34d9eb",75118:"29ff1658",75123:"8d81369e",75203:"61c61e17",75273:"ca1d44bc",75546:"c176dc4f",75567:"43a232e9",75671:"ff078e30",75702:"1ac49947",75897:"64e30bbc",75911:"b92bff04",76029:"d6f3938e",76180:"172c9869",76222:"99ae7254",76240:"02de7b5a",76355:"2363ed29",76360:"4f8fd4be",76369:"c0f7075f",76527:"359e34b0",76793:"d12dbf4d",76878:"198182f0",76895:"8d493a07",76924:"cb870251",77053:"9eb4c1b4",77170:"60e8b504",77413:"4deae4de",77427:"78dc40c2",77527:"56c932ee",77655:"274eaedf",77680:"60043c0d",77887:"be698a2c",77923:"971cbe2f",78035:"1ae50e88",78063:"c80936bd",78072:"ed51eb7d",78114:"516dec85",78233:"41742cda",78248:"df22f3af",78452:"118229e6",78863:"2b5e4b34",78924:"0fcbeed9",78991:"69f3d9b5",79073:"a17fb62b",79164:"95f18dd4",79290:"f83967c4",79298:"10c28d6f",79356:"73a7bd5f",79478:"df5a3016",79479:"27e1a14b",79543:"fbff3b11",79691:"c733e485
",79829:"6b2b8280",79895:"4bedd8c5",79958:"b00a2879",79963:"092519d2",80053:"935f2afb",80132:"18dd253f",80192:"fc8aebe3",80268:"eac8f2ef",80380:"2d35b91c",80709:"83cd8f20",80895:"f97cc188",80940:"abf6a1f1",81135:"0c1ee94a",81477:"2dd65ece",81667:"a39041db",81708:"82a4f002",81835:"6a0b4355",81934:"35fa8025",82132:"919b108c",82250:"51da09c7",82342:"dc130668",82348:"3f6554cb",82360:"d885d629",82423:"fadcaea6",82513:"7bcf7096",82614:"aa395a59",82621:"ba4efbe0",82864:"3ebee193",82982:"bf02c3ce",82989:"39b565ff",83069:"39c8ecdc",83074:"1a42aba3",83147:"a26f7afa",83692:"779753bc",83893:"66716ec1",83957:"10f908b7",84063:"363318d5",84097:"737371dd",84242:"6e366b57",84362:"8da7304d",84477:"3b12bc8a",84500:"6eeb04e2",84513:"6cb122e3",84710:"4cec253a",84745:"42325f5c",84754:"211f58b1",84841:"7e5ee96c",84847:"5c27dd68",84854:"34b19815",84888:"e9d5739e",84941:"21bf64ca",85011:"6e5d074b",85054:"7bc3feb7",85098:"60d04b47",85217:"f8482b2c",85419:"3743f01c",85455:"7e446cc1",85493:"6d480200",85780:"82033eb7",86009:"c9b79676",86018:"ed809cac",86129:"b47406fa",86150:"1db21d86",86333:"08e3aaa9",86356:"4ad39569",86476:"45aa7127",86518:"ce66b6fd",86826:"7dd3be25",86950:"4e1da517",87064:"a6a8af40",87223:"27bd5328",87224:"6fc8d865",87240:"1fdab62e",87304:"9980f90c",87313:"96c0bb00",87316:"a48778d9",87443:"96ec050b",87460:"7668acae",87482:"8faa0fb1",87513:"a291f403",87634:"f807eec9",87667:"c07f2717",87799:"3958a146",87836:"e111f111",87866:"b1998bb1",88179:"e5842021",88187:"1e391540",88252:"c0074ddd",88295:"21ad5224",88338:"e1b9986a",88380:"ac930f6e",88446:"3c725018",88598:"6827856d",88621:"4499569c",88625:"6f59957c",88821:"41db9914",88831:"1c56d006",88879:"32ea4ecb",89002:"4455e85b",89210:"b4028749",89574:"13b69fa8",89780:"46c600d5",89806:"443045da",89852:"1f79049f",89986:"fee1f25c",89987:"d25ffd5f",90046:"d043cc46",90185:"41b3e733",90333:"f2497893",90342:"2c91f584",90392:"f60e43ec",90398:"0b78393d",90431:"dd313590",90451:"0a13c98e",90464:"b2335bc1",90536:"73dfc993",90560:"01fb8b11",90601:"22f40a40",90610:"147b0f6a",90615:"8cd0f4f5",90645:"6601f604",90666:"459a783a",90865:"87e7806e",90896:"4302562a",90976:"6eb0ce42",91178:"9c4bbfc4",91213:"ff0539a2",91231:"28b27838",91274:"76ace0dc",91287:"02ee0502",91304:"872e63de",91406:"4cd7d8af",91425:"1671b3fa",91523:"d692bb25",91628:"c839a5b0",91753:"2f535455",91782:"304ed800",91849:"b2735041",92085:"000c061a",92244:"8c828746",92269:"9c42de85",92393:"db5c8692",92404:"799b872c",92456:"d6360c39",92463:"4d4093bb",92744:"6eebf72d",92775:"7861f6df",92778:"8c31caf6",92786:"ae5bb339",92843:"85c3ba36",92851:"8bfba65b",92964:"14e00221",93023:"b984322c",93071:"61e5c5b8",93151:"e7cbe8da",93176:"f7101d4f",93195:"740eb29c",93308:"b83df1bc",93340:"5d075efb",93346:"f7735fb0",93377:"dd435828",93400:"03e8549c",93590:"dede40b0",93749:"4e6907d6",93832:"917734f8",93837:"cb341380",94114:"c9aea766",94123:"7c8407dd",94136:"91dc98f0",94197:"37aba5d3",94223:"43b891d1",94328:"63f66cb7",94337:"9fdf7324",94401:"6c10648f",94452:"878356ab",94605:"487f7f30",94694:"d3e690ce",94696:"376d31f7",94932:"a233fb97",95020:"b8e39b95",95107:"d666ab7e",95171:"3db8c88b",95281:"bc08bf79",95296:"9936b6c8",95317:"cf282674",95327:"1e173bbe",95329:"5b23c695",95364:"41fbfe2f",95418:"7877b0eb",95441:"e9ef6b31",95561:"0e0f5dd2",95696:"8462ad7a",95745:"edf19300",95801:"e490fd18",95911:"7e254f9d",95945:"90b0cf6d",96055:"8fa500ae",96078:"d6011437",96082:"a322018d",96135:"3061ad92",96188:"f0129862",96361:"ebf2bdda",96426:"64bd79cb",96535:"38e65fdd",96544:"49ea6ca5",96547:"385bc71d",96617:"e23cd647",96684:"a612420b",96772:"b35418cf",
96831:"99ba663e",96945:"09e11ac0",96971:"57973c2b",96979:"7f6f8f16",97065:"6816f4c0",97129:"f3034cf4",97334:"9d4bcb9a",97469:"d91e7ab4",97523:"02fbc840",97547:"902fdb3b",97553:"7ea214d5",97617:"ed97cef0",97782:"b094b997",97816:"7513b789",97826:"16cff1eb",97850:"dd6685df",97920:"1a4e3797",97955:"746bf890",98177:"049dc708",98200:"0e7f2915",98218:"1820eb3b",98272:"b7f629d0",98623:"ced65f67",98740:"d1475ab1",98791:"1a6f209f",98868:"6a913ab1",98939:"3ff950a4",99120:"008b0ccc",99184:"8aecb2ef",99266:"ca443c18",99367:"00125b11",99389:"c2f4aca4",99427:"64758f43",99494:"f2d5637b",99607:"49ea4a42",99669:"32db5af4",99839:"15d4dc80",99871:"5e3def70",99997:"b63b5bb9"}[e]||e)+"."+{19:"73207e89",21:"a3309319",71:"148bf97d",99:"3f2b222d",172:"c102a782",224:"487be67f",250:"e9efb2fd",291:"b2f7c218",467:"3701a1f3",513:"531e85fa",596:"7b2cab9f",803:"9d68a6cc",899:"26bbe5e9",1116:"56b8b25b",1365:"573cbaf6",1387:"23980d09",1523:"ec2891cc",1647:"63a2d108",1652:"94113f44",1664:"180295d2",1671:"62a8f5fd",1940:"3261f2e9",2044:"d2700165",2097:"7188f7ec",2196:"3febcd57",2312:"093e5909",2427:"f701c31d",2638:"118a41f9",2656:"37531cd2",2905:"76c11c59",2916:"2ecd567c",2989:"cf806e65",3044:"712f76ea",3102:"bf11a1b5",3145:"b6e9e373",3191:"74dd2862",3197:"9da021ce",3216:"eacb9804",3283:"cd4afe73",3326:"58beb1bf",3397:"b0ae73af",3398:"63275511",3650:"2e235066",3667:"1f8a4cb5",3914:"54e8dd0f",3919:"41dcf0b8",4125:"96f2442f",4151:"1a6afc47",4195:"7402cd75",4244:"8675cdba",4328:"e0ebdf09",4504:"025ef68d",4513:"558b14b1",4585:"05c4d67b",4631:"1d37eb2a",4874:"39750b03",4882:"d87f6d94",4929:"18e132f8",4972:"a7243668",5061:"509f2053",5129:"479f5110",5313:"5f8b4c43",5352:"578d6913",5383:"306e0d1d",5512:"8f6a2d54",5714:"ad1f0a21",5909:"51facc0d",5920:"8c8aae04",5981:"efaedd7a",6027:"6f5c789e",6151:"7a76de9c",6386:"67f2de5f",6443:"14902fad",6517:"34dee69c",6537:"ba99c5c9",6553:"08568a59",6695:"cfee6a07",6734:"6772bb12",6799:"67367b32",6822:"1511eca7",6824:"6e7a03e3",6968:"a44bcc6e",6971:"873ad005",6978:"6c523e96",7078:"521e19d7",7091:"8c3d2fe2",7092:"cc64c0ff",7108:"d17c6119",7120:"d18d3e66",7155:"f52fceb3",7162:"cf567055",7318:"3835ac08",7451:"334acc4c",7485:"7b41d7f5",7500:"099ef3cd",7874:"7c1cf1bb",8023:"897a9b52",8135:"672f0bde",8145:"5d99a1dd",8188:"3b884b4b",8210:"d7b2d51a",8313:"5d208e75",8328:"d30eb04f",8407:"28cf1826",8482:"cceadeb0",8638:"c9ba8e41",8671:"b4177ac9",8809:"959e39c4",8882:"0bf188b0",8906:"cfd44206",9028:"9d1978ac",9119:"fd788116",9225:"a93f8834",9235:"b22f9f6d",9365:"8b245b69",9444:"ca6f47e0",9542:"4f4a1a23",9550:"83ecb96d",9615:"54fc882e",9817:"83bf0cd5",9836:"c72917a2",9907:"676fbeba",9947:"5cfa1c77",10109:"beb060f2",10228:"aa539b8b",10270:"8ae752b8",10436:"1ec22cec",10497:"809aee5f",10650:"547a5e75",10918:"ef85344b",10987:"c12ef65c",11174:"2c33f6da",11203:"9539d62e",11311:"f40cd0a5",11321:"cd4efea5",11326:"6cbc304c",11342:"2bccdb3b",11398:"f8e68ae8",11656:"ac13cb1c",11875:"d1539320",12228:"d521a8eb",12442:"75a11235",12549:"16f88776",12555:"900d6f87",12558:"0ac14857",12560:"9bb2eb9d",12567:"7d910920",13253:"0ef1443d",13280:"29f73853",13351:"804b2952",13460:"9e91d59d",13588:"d0b6d6aa",13595:"6c1f4a63",13617:"ea5a2c00",13718:"94e8a0fc",13896:"76d63f5f",13924:"ba0bd4a2",13979:"aad133c6",13995:"243e677b",14061:"6004bdb7",14088:"6e4abf52",14095:"18927dce",14143:"70c2608c",14299:"f4efc87a",14369:"b25e3d4c",14386:"acd52792",14396:"fd9bfcdc",14549:"607a549f",14610:"a9ad2a64",14670:"63c0f96a",14713:"8c81902c",14840:"4ae7e4f2",14908:"954b802a",15196:"9dd233ed",15497:"2c78c17f",15658:"e22235f9",15888:"50a93f8c",15970:"360
efba8",15994:"9794e987",16022:"c86edca7",16038:"8f228690",16058:"f0b25bfc",16071:"0df54331",16153:"4800d10d",16161:"3a69d696",16379:"38089655",16528:"f38ed133",16635:"e969de7c",16672:"38ba7418",16685:"3022befc",16876:"19436136",16973:"9fbbd5c9",17275:"b3f920a9",17283:"b641bac3",17457:"ab4ccae6",17511:"fdf99bc4",17726:"fc4837d4",17757:"10a7c58d",17785:"172166c7",17883:"3f659ac3",17887:"fbdcba1d",17989:"52e3fc0e",18025:"8e9620ce",18050:"5b8280aa",18084:"fcd7fdb2",18100:"0a6d52c3",18143:"b09cd8a9",18156:"a975996f",18186:"df8b47fc",18318:"edf202fa",18559:"5c93aa35",18734:"5dd15d0b",18746:"f9ac8609",18883:"90cc608f",18892:"6c8911a8",18894:"74b1ce85",18928:"0701e03d",18998:"d6cefe2f",19177:"f4fb3a86",19204:"25b579ad",19212:"c6205a58",19305:"bf07439c",19408:"893bf9b0",19427:"13389c1c",19493:"0990c5c4",19504:"3cbf15b2",19531:"795dc04c",19625:"acfca89a",19671:"e0c673af",19709:"eaec2d23",19733:"3acc99c9",19806:"837a7ae1",19832:"36958835",19876:"a944f10f",19939:"123b41cb",19962:"a3ecf956",20040:"1320b0ce",20061:"fdab8ea6",20169:"f30c5c13",20303:"ac64adc9",20602:"d2e8db2d",20689:"f0ff8154",20707:"9011dfb7",20764:"705b6a69",20911:"eb39e4b7",20917:"9c9a3e5c",20983:"f78047ac",21015:"1e986630",21134:"a1896a0f",21143:"16349e64",21190:"580de28d",21207:"8e9578f8",21228:"ad3ad6e5",21379:"80aa9c55",21643:"eb35f457",21688:"795caae7",21823:"35e78f99",21983:"a4e4572b",22030:"c38a2655",22087:"817ffdcc",22129:"267916df",22163:"f5657f46",22238:"bdfbafdb",22456:"62957769",22523:"15391619",22540:"57cb9539",22583:"b8adcfe8",22604:"4c410f1b",22777:"991b45b1",22898:"c8aecb21",22940:"873908b0",22997:"f3a4a591",23064:"03b7ec0b",23228:"c0599384",23231:"295984d8",23252:"94c1e97b",23310:"c407c53a",23320:"3c9b69f0",23343:"08e5a4d6",23435:"59082b53",23522:"dcfb4085",23536:"2a58bbac",23545:"4623b3a1",23663:"33ee14f2",23714:"43716ebe",23804:"66d68fe3",23898:"ec519008",24058:"07462b4e",24066:"9d4d9ce3",24101:"d3e3013e",24109:"795d5349",24158:"a91f4481",24266:"0b540723",24282:"d3ef7720",24401:"1ae158f2",24467:"dc4c3279",24501:"178a453f",24946:"8d83115f",24986:"ba127d73",25079:"1dab7340",25251:"3cee59a7",25283:"06e3d89c",25427:"854a38e7",25433:"c48bf181",25451:"a23a897f",25513:"3f77f081",25547:"284c9b9e",25579:"9e7055ec",25833:"71a40566",25898:"94b4215a",26067:"7b3970ce",26084:"861dcdd5",26086:"12738d95",26201:"75d8825c",26291:"bd84899d",26311:"ce5c5ebb",26521:"89b58f07",26654:"4d65993e",26686:"9da74581",26695:"49306f14",26858:"1ce10981",27109:"fbcb735e",27167:"eb8d133f",27270:"02e3dd33",27276:"2fd74dbb",27303:"46258253",27324:"04a7ca69",27554:"c429cc73",27704:"849e54d9",27918:"ca462563",27982:"0533c8f0",28085:"d5cffe43",28134:"3e2ffbbe",28139:"80df3532",28219:"51e8e461",28261:"f279f4e5",28367:"dc6ae3d7",28475:"2000a841",28476:"892e8462",28514:"2d31535a",28516:"d3f4d479",28623:"760e1770",28699:"c9753e68",28800:"7ebb42b4",28880:"777c9b40",28882:"79f11e9e",28906:"63cdbd64",28922:"fa03019f",29014:"63a3f3cc",29025:"caedbded",29050:"15e17037",29066:"92b199e5",29131:"61e3e7a5",29191:"6765a974",29272:"a7da1cef",29514:"902f2c64",29520:"59707016",29698:"4ac96687",29717:"c3facf77",29782:"8ca83049",29818:"be78b6d0",29831:"c421c31a",29864:"7e0679a3",29871:"8a4a1409",29886:"da3cf2c4",29899:"8cb1ad4a",29978:"f29be154",29980:"15805725",30062:"2dbf55d1",30216:"c844cada",30295:"54944412",30419:"be694780",30433:"db6c199c",30454:"5264f7a4",30470:"0ae2450e",30589:"3a397208",30677:"b72f0627",30678:"2a52c41d",30800:"9375989b",30820:"b28f98b9",30834:"ed2abcff",30837:"8f11c961",30865:"8b9d510d",30885:"e871b509",30979:"589a7d3a",31009:"41ecc7f9",31013:"7a5f9
581",31023:"970d7bca",31044:"732db84c",31050:"400af1a1",31068:"1f0b2373",31089:"ec193a0e",31116:"bc1bd6c9",31152:"a086d363",31187:"52f3a337",31293:"17ff3286",31294:"3c2a361c",31441:"c659a961",31471:"d9c87e09",31512:"69ffbcf7",31516:"6f3edbc7",31570:"a9bef6dd",31671:"4d0bd185",31824:"22f60a1f",32224:"19c9db8f",32319:"63655ae9",32410:"f2893ec5",32446:"8ce02657",32491:"bdd9498f",32567:"277cb195",32652:"72910332",32689:"08107660",32839:"5c79dc2a",32872:"855c14fa",32892:"250cb5b2",32914:"70e38801",33023:"2964b050",33076:"ef5beb95",33083:"16e34a1a",33131:"a67a7052",33138:"cbc82116",33178:"555e80f9",33181:"2310c781",33223:"e5f8838d",33260:"5ee9d0c7",33261:"a2e46c4f",33329:"94ab58ef",33407:"3ff66407",33725:"6470c099",33889:"1f400f9d",33920:"32836d68",34020:"309d55c2",34077:"652a00df",34079:"25c81e5a",34153:"f309a901",34206:"8ef010f8",34293:"a7e2d1af",34294:"6e8ac714",34323:"ef923fe8",34407:"f08cd09f",34458:"bedb0cad",34460:"51defee4",34475:"994398a4",34552:"278830b5",34590:"e79f245c",34647:"cb920ca6",34656:"53a0d9e7",34748:"c74caba2",34766:"9716c156",34784:"121bb89d",34792:"13e2220d",34800:"31234350",34882:"53c961aa",34943:"5a2f2d6e",34979:"be9c4116",35038:"29817a1c",35069:"05c8b29d",35214:"3be2021d",35216:"783d15db",35387:"d6c4b7cd",35466:"b8c66a97",35577:"eef0d34f",35614:"0550e592",35647:"8df42e8b",35768:"fa150a9f",35809:"29c1c1a6",35874:"64598624",35879:"603ed18f",36009:"ed99bcf4",36312:"f6211aac",36495:"c4a16cab",36511:"aa66c640",36673:"9639c6ef",36773:"364a602a",36933:"6c6d7692",36935:"232400dd",36983:"87a7744c",37021:"8953fbe7",37055:"8a714c7c",37058:"583e5f2f",37208:"4babdc40",37257:"7b25eb85",37316:"136d87ad",37362:"af7565f6",37426:"a3fce28a",37894:"1d31c5b3",37977:"a605632a",38056:"c9cc2c03",38104:"70b4c07e",38230:"14de8f42",38333:"6d30319e",38368:"99e33615",38450:"971f211e",38469:"c905d16c",38504:"b575cc51",38591:"8b436f7f",38679:"2924f701",38768:"e49628aa",38792:"873b0b4e",38819:"d5786d3c",38873:"aa2dff10",39033:"72f28f6d",39177:"e72fee4a",39209:"ab100076",39252:"a8e9c58b",39275:"68258924",39325:"9e574bbc",39368:"9b3b00b6",39605:"87f4261f",39645:"363e983b",39726:"e601e6d1",39820:"efc15edf",39853:"2eceed3b",39941:"09b3269e",39972:"1af66b9e",39978:"3914825c",40097:"065399f9",40158:"c5447ab8",40176:"a0efce43",40342:"6b02e5f3",40365:"943c9bb3",40665:"43c72f99",40830:"28cfd22e",40930:"2bddabd7",40936:"6f445b64",40986:"f6358136",41100:"59aeb5f1",41329:"327337f0",41388:"15946aa8",41537:"507f5136",41750:"dfbc322c",41840:"319bb3a8",41863:"397dd98b",41954:"7942c49d",41958:"83b83b97",41998:"a08698a0",42051:"e3431432",42054:"9a68e985",42059:"dee04e82",42169:"d7053385",42187:"2cada35c",42226:"85755b59",42263:"1b5d9df4",42288:"5b62f81a",42289:"0c3f570f",42371:"d7fc9caa",42436:"1827f217",42465:"cf7195c1",42551:"e6eb7da2",42609:"0fc5596c",42620:"f4c7af3a",42690:"c8225ded",42721:"697fbb16",42728:"2d6aacf6",42757:"a4a33845",42930:"4a3d4ba3",43037:"f8316728",43047:"4998e689",43072:"2e96fcd7",43294:"0b488eb3",43529:"f1d99b35",43554:"9b15f63b",43635:"382a5fae",43645:"e2041df8",43697:"5f5f48af",43793:"b242fd59",43849:"3d340240",43919:"60913365",43966:"f807cefc",44023:"d8d2c9f3",44029:"a29133e6",44118:"1bc0c1f6",44152:"93b2a9bb",44174:"5bbd8c7c",44393:"619828bb",44523:"c96ddcbc",44765:"404d2c80",44797:"bc343266",44860:"83bf1478",44907:"b7e881e3",45057:"026ec45c",45091:"f47baa55",45114:"c5711e84",45279:"b7f604ea",45287:"7ea5080a",45571:"b3f438b3",45583:"c9734003",45593:"fa50b001",45732:"646f2c6f",45786:"02f6764b",45809:"a28b7ade",45878:"2405e6ca",46023:"02d71bae",46045:"7c8d179b",46074:"85dc8255",46218:"304359f
2",46284:"35783c6a",46328:"e6ad5407",46447:"4229ab59",46838:"615dfccd",46901:"73c5e427",46945:"aca29914",47062:"ecb4b558",47068:"e5db7558",47082:"87413ddb",47117:"925c7a4a",47276:"414f3077",47287:"17c798ef",47463:"ffbf05d7",47568:"2216e773",47582:"98df77d3",47655:"741c773e",47708:"28dd4c52",47838:"1eda5c96",47975:"1d9b9deb",47986:"768f43ef",48031:"62017355",48150:"e745121e",48218:"32acfb7b",48320:"0781165c",48426:"e645d804",48840:"9394dd4f",49096:"bf964682",49169:"7d6d8f24",49241:"a8e10b10",49270:"cdfcac6d",49874:"1c08aa98",50017:"8cb0b147",50052:"32a7f158",50145:"cf12aec2",50153:"9c0d9698",50193:"cd9074e9",50240:"e5ce9a3f",50337:"e1095412",50362:"1653c490",50375:"e03acd0c",50437:"afd1bfa0",50472:"8d82b204",50525:"8ffca8d0",50773:"93035d26",50849:"afc6073d",50999:"7814621e",51555:"8c0ea918",51574:"e553142d",51593:"ccb11905",51605:"0604c821",51625:"9af5a24d",51706:"5c9852e1",51768:"4856aecb",51830:"3c636cdb",51840:"6e2f046c",51945:"084ffea8",52094:"b014c67f",52126:"5ffae957",52251:"71f8dc72",52286:"dfe5dadf",52491:"3832c13f",52499:"3e054d23",52573:"006068eb",52586:"fc99d991",52593:"93febf0e",52715:"246739de",52789:"4e836d21",52870:"58b181cd",53237:"c3a4514f",53243:"5c3c3aa5",53371:"b01e4c10",53442:"ddb338e3",53675:"bd70cb8a",53823:"6530507c",54125:"2f1c7fe0",54133:"45c8fdf2",54178:"89e6c31a",54210:"0ff7d73b",54250:"f3a66d07",54265:"4eb7ccff",54363:"f05db08e",54382:"226534bb",54397:"95ca43ad",54487:"96d9e1eb",54513:"952be9ba",54591:"4c7ab366",54756:"354bdb25",54778:"ab81ffc8",54786:"ad58d36f",54794:"daac75f3",54855:"4f0b894e",55043:"923df72d",55216:"dff3209e",55239:"52636bf0",55273:"27e02213",55335:"40de3b68",55478:"003e195a",55552:"c5b6f07a",55693:"a959df65",55726:"8042dc4a",55745:"2cebe784",55821:"6a3913bc",55925:"5005937c",55962:"40e7c1dc",56290:"d30a97c4",56424:"1a06e672",56513:"5f155b72",56614:"f2830c22",56750:"186f7120",56795:"017a94ef",56902:"f91db1cd",57121:"e7f833f5",57126:"704067cd",57242:"c1396e1a",57258:"3aaaa2c9",57293:"3721a264",57341:"2aa6199b",57489:"3d3e42fc",57598:"7a15184a",57599:"a8e91558",57699:"77e23c32",57749:"aa83a7f5",57780:"44a93095",57820:"d9fe0245",58009:"c5f9c347",58042:"9798f0fe",58096:"637b08bd",58182:"b13442de",58197:"1d43d81a",58234:"d3624c41",58247:"47cd75a6",58356:"1aea63a9",58564:"fc408722",58768:"41323c68",58818:"9671e908",58822:"8f58955f",58824:"3ea8fad3",58914:"796c938e",59051:"ce30c81d",59060:"f4bb9845",59181:"d2bbdf03",59191:"4f95915c",59241:"126c9fc5",59248:"8c5d9aae",59336:"1face31a",59342:"2c3b6d28",59427:"9a0b349f",59442:"d5c2c74b",59496:"0c397f5a",59506:"4a81177d",59533:"915071cd",59592:"b7b4e63c",59771:"e2a99bf6",59900:"1e9304e5",59982:"c299fce0",59992:"70d0be55",6e4:"cc9f4527",60185:"babf0d59",60331:"924ba5f7",60434:"52042dd4",60518:"c0f1e5e8",60603:"5316c07a",60682:"7854ea33",60831:"08063f83",60868:"267fe23b",61007:"dcc4e5af",61200:"6c5f5e8b",61210:"d73ed0a2",61249:"09ef4526",61271:"fdc83c0b",61361:"636b72a8",61426:"be2e971d",61543:"8f27603a",61643:"f9dbfec5",61793:"01718e8f",62117:"a30c0b4b",62235:"58285753",62307:"5a3ba620",62318:"9a886142",62319:"b75eb48b",62474:"dc194af4",62493:"bbbbd1d9",62523:"6baf1734",62547:"fc411513",62655:"f09d9004",62867:"46a98c28",62948:"dbf70f88",62983:"61d5d965",63105:"096987cc",63387:"02f1203e",63604:"66965e35",63663:"c51c3a56",63777:"0219cda6",63801:"70a50fe6",63808:"379d8af0",64e3:"c520b0e8",64032:"b5c596b3",64072:"f3f20053",64100:"7d16a1fd",64122:"3f535a37",64485:"772190e7",64525:"fde13b5a",64533:"dbde94ef",64596:"7e071618",64656:"5c661dee",64754:"1962a975",64766:"9feb1ebe",64824:"68d4a423",64871:"0c878ea9",64
900:"9c15d13e",64967:"8c69c566",65132:"c728a280",65501:"2405319f",65592:"a5b39398",65594:"30989c6f",65641:"3548b448",65758:"5f737d83",65965:"fca751f4",66026:"e4206534",66081:"0b35a4d3",66187:"6bc78a22",66238:"c7bfdb48",66256:"441bc284",66303:"7097325b",66336:"7ee25827",66462:"933f4a86",66465:"711a6a15",66597:"abbb3946",66958:"d3834829",67010:"186c6180",67061:"94f1459a",67069:"a14d8963",67132:"34a01d6f",67343:"0929d0ad",67448:"c3192226",67583:"7dea7c29",67597:"c925248d",67638:"fb736299",67670:"cb8c9278",67908:"7befb180",67954:"4b4df4be",68034:"673d4259",68126:"ab783f9c",68258:"fa76b63f",68689:"3a807ecc",68757:"4b7d9782",68793:"af3c15ab",68823:"5c7b8d2a",68943:"f7a596db",69111:"6f76f4e0",69125:"6052e105",69209:"9e539e38",69234:"6337130f",69254:"270b7bf2",69628:"7988be09",69629:"6d3ab36e",69647:"c5c936f9",69843:"a06150bd",69899:"576d10d9",70081:"47ddcfbd",70130:"be0e7aec",70178:"81e6c33b",70249:"b993646f",70277:"0316f6cf",70367:"1d4d5424",70504:"28a803b5",70543:"fe6b6e6e",70614:"4f9da953",70706:"73960b42",70964:"bc48267c",71081:"31a6d953",71160:"4dddf998",71169:"a0975002",71287:"885b618b",71476:"0485f0c1",71516:"b09b1ddd",71544:"3855ce91",71698:"bc2f9371",71789:"59ccffdc",71811:"264139bc",72070:"7ed11917",72189:"5bbcc325",72331:"6d29448b",72500:"f64f66be",72613:"c769eb3e",72638:"fdb543fa",72740:"556dfa23",72887:"4cdbf544",72952:"e3a8eab8",72957:"960770e3",72978:"aa7363f2",73238:"d2a2d1e9",73300:"e20e9d0a",73338:"04155867",73356:"36d8b7e0",73369:"f8653575",73452:"ed4f95c5",73537:"3e09c354",73618:"f611989b",73725:"5a69a073",73745:"593cd9c9",73766:"8ddc1af8",73975:"a1d7f56e",74019:"eab93d36",74061:"47eacce0",74091:"b78b34ed",74132:"a5970e5e",74136:"f2226b77",74139:"0bcd285c",74332:"0f66d626",74362:"ffd5aac5",74441:"0ef242c1",74465:"3b3b7dbd",74480:"25f1369f",74578:"178c32cd",74794:"8f102688",74839:"577811de",75070:"22d722a3",75118:"bc928be3",75123:"8b8b6076",75203:"8b7d980c",75273:"b16cff36",75546:"49f94163",75567:"355318aa",75671:"5943eb1e",75702:"dfcf6ff3",75897:"b7955be2",75911:"02fe2b4d",76029:"a35fff25",76180:"059fca9c",76222:"22652783",76240:"6c6980d9",76355:"b62d2caf",76360:"8393a254",76369:"9e396c17",76527:"50bbd3b6",76793:"a673c81a",76878:"575a3510",76895:"57085844",76924:"bc5962fb",77053:"16eba6b8",77170:"e863ae3b",77413:"7e5de04f",77427:"add12e61",77527:"78b8c9e0",77655:"0003883b",77680:"595d9465",77887:"088729d7",77923:"fb735e37",78035:"4ead3dce",78063:"5324c9d3",78072:"d4105f8b",78114:"34cd7f60",78233:"2e83e610",78248:"9c0f4314",78452:"8e3ff138",78863:"c85a658c",78924:"f65e527e",78991:"52cf8db2",79073:"9b6a9356",79164:"2bf3dad1",79290:"0c519db4",79298:"4a5cabcd",79356:"37e00d95",79478:"36233afd",79479:"24435308",79543:"a1c76445",79691:"abf6490b",79829:"d0e80522",79895:"85d250b0",79958:"aeb03179",79963:"420ea460",80053:"fe5e4da9",80132:"2e07a3f4",80192:"44c46920",80268:"729162ff",80380:"8a522d91",80709:"fd31ec19",80895:"2b1b02ce",80940:"34e77834",81135:"a0f32953",81477:"81ce2eb6",81667:"b8b8cdb0",81708:"4d7d7762",81835:"efcb7b56",81934:"33bbfee1",82132:"14c8bdbb",82250:"ca3680fe",82342:"74cd685f",82348:"712fccce",82360:"143d911e",82423:"7b71feb1",82513:"4f3b2b51",82614:"2a95d08e",82621:"dfcb91e5",82864:"32d19c2f",82982:"a2616da8",82989:"6a4597e6",83069:"47b3ad61",83074:"c97db8cb",83147:"888c5d69",83692:"db7a78e4",83893:"3d2b86a2",83957:"1b9c82e0",84063:"1a3a757d",84097:"6ef8d9da",84242:"c9097d63",84362:"c6417e92",84477:"d8bfc341",84500:"431c6477",84513:"b6949bb2",84710:"de128b25",84745:"b52aeec7",84754:"dee9fa1c",84841:"93e39080",84847:"215dbaa3",84854:"84dae178",84888:"9d831185",8494
1:"b4c8a6cc",85011:"2f831042",85054:"adf0601d",85098:"279d70d3",85217:"f61e93e0",85419:"46bf7559",85455:"83eea1ee",85493:"119281a5",85780:"b9f5f272",86009:"70baefe3",86018:"31bdd1d7",86129:"a2e712a4",86150:"cc7f840f",86333:"ea59a642",86356:"dde2e98e",86476:"197674e4",86518:"9284ae71",86826:"df69b0fb",86950:"e65ab699",87064:"3cca1274",87223:"deedbe22",87224:"f3e47b26",87240:"58b186a9",87304:"886ecc13",87313:"c8286e0c",87316:"b4e193d1",87443:"9aa85498",87460:"e647bbd4",87482:"df943648",87513:"b2501ac9",87634:"413486e3",87667:"1173ba50",87799:"ce5bac8d",87836:"b8d9a0fe",87866:"7f1c6977",88179:"0286c49a",88187:"138206fb",88252:"6ed6f96c",88295:"58531032",88338:"44d418f3",88380:"8c2767c8",88446:"bedc1525",88598:"44737f48",88621:"66870438",88625:"fd93b3b7",88821:"a3cc23cf",88831:"4d0404d0",88879:"bc39a8d7",89002:"a4594737",89210:"bbe355e5",89574:"a872329f",89780:"cb004132",89806:"8290698b",89852:"0bf03005",89986:"25e5550e",89987:"7ef3f99e",90046:"18bf9d1a",90185:"9e4a08ea",90333:"50f0aaba",90342:"f21131d2",90392:"bfd01257",90398:"52310a46",90431:"af8fe2a5",90451:"9f655912",90464:"b5c9216c",90536:"9d746e67",90560:"490d2235",90601:"e04386b3",90610:"bf6a18e1",90615:"5eb23ee3",90645:"ec412e06",90666:"a4c267df",90865:"988a7db1",90896:"464b25cb",90976:"3c307ae3",91178:"ae626034",91213:"1668605a",91231:"20e516c9",91274:"89828228",91287:"2cac701a",91304:"51bac317",91406:"7f27e2ec",91425:"6224592c",91523:"da34a7fe",91628:"5022e7cf",91753:"c5461936",91782:"d494ecff",91849:"c19b88cf",92085:"da4ef155",92244:"b2497d07",92269:"54c0a36f",92393:"b2bf88aa",92404:"eb13da2d",92456:"90063d24",92463:"3c0a582c",92744:"d29698c0",92775:"6cc03df9",92778:"3df5cdc2",92786:"5e68e413",92843:"ae295bd7",92851:"d6f651a2",92964:"b6045191",93023:"67e73b3f",93071:"060ab57a",93151:"ee800142",93176:"a74fd090",93195:"e70b51ce",93308:"1467e450",93340:"11edc4dc",93346:"1b4782c6",93377:"d70f55df",93400:"5c88336c",93590:"c0ae13c5",93749:"b0c0c0b9",93832:"ee4682e2",93837:"e0b7e94d",94114:"c4a618c6",94123:"1a6f14e8",94136:"acfc65cb",94197:"84ad4950",94223:"0ea8a7ad",94328:"3c453dd8",94337:"b1e4d02c",94401:"310ef608",94452:"890d4cf7",94605:"3a9648f7",94694:"efee88a2",94696:"be43fead",94932:"f4de81f7",95020:"b3c692d6",95107:"84d38a95",95171:"b59fa835",95281:"452f0858",95296:"98493597",95317:"8d86f465",95327:"e8f86b92",95329:"b279cc8c",95364:"31dcafc0",95418:"5d25501d",95441:"589b9d96",95561:"fd1ede1f",95696:"cadd461a",95745:"f8e3a2bf",95801:"0a783e08",95911:"b9556956",95945:"f79f370a",96055:"b6642cba",96078:"531d5024",96082:"b914d28e",96135:"a45b92e2",96188:"6c9f214b",96361:"900f03a7",96426:"d4071849",96535:"d7e854c6",96544:"f92e8b94",96547:"4ffe2ef3",96617:"95351d4b",96684:"5704d749",96772:"c1e0dc45",96831:"c60ddcf0",96945:"98d2ab5d",96971:"ba9e1277",96979:"9303b4cf",97065:"947df423",97129:"00e43607",97334:"f8f4fc7e",97469:"5487219f",97523:"a3b57cd6",97547:"9e232073",97553:"2ab99719",97617:"a15faf99",97782:"c09f3975",97816:"3b68fd86",97826:"d0281c69",97850:"77a305aa",97920:"b5f1ebc2",97955:"ef56c17f",98177:"9bb8538b",98200:"d224c5fb",98218:"290f20df",98272:"69ca16b2",98623:"4774311b",98740:"5e28c252",98791:"f37abbd5",98868:"7936d28a",98939:"dbffc577",99120:"6a0751c9",99184:"9282b3d6",99266:"cb2b1bcd",99367:"c363645f",99389:"b6458274",99427:"dbba9055",99494:"f16b2e8c",99607:"c5ea3f3c",99669:"ce4a8b9f",99839:"19eb08d7",99871:"2f5edb35",99997:"590480c1"}[e]+".js",r.miniCssF=e=>{},r.g=function(){if("object"==typeof globalThis)return globalThis;try{return this||new Function("return this")()}catch(e){if("object"==typeof window)return 
window}}(),r.o=(e,b)=>Object.prototype.hasOwnProperty.call(e,b),c={},a="@cumulus/website:",r.l=(e,b,f,d)=>{if(c[e])c[e].push(b);else{var t,o;if(void 0!==f)for(var n=document.getElementsByTagName("script"),i=0;i{t.onerror=t.onload=null,clearTimeout(s);var a=c[e];if(delete c[e],t.parentNode&&t.parentNode.removeChild(t),a&&a.forEach((e=>e(f))),b)return b(f)},s=setTimeout(l.bind(null,void 0,{type:"timeout",target:t}),12e4);t.onerror=l.bind(null,t.onerror),t.onload=l.bind(null,t.onload),o&&document.head.appendChild(t)}},r.r=e=>{"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},r.p="/cumulus/",r.gca=function(e){return e={17896441:"27918",21996883:"26086",24647619:"42288",26134010:"15994",38341509:"74139",39579801:"6151",46551803:"16685",54230287:"53243",62041344:"65758",62127933:"16038",65360910:"31013",71078103:"74362",78718572:"41750",79617745:"19204",84960677:"60831",91647079:"36773",92307374:"27554",95169675:"21190",99496549:"31152","906e49ec":"19",f5e3827c:"21",ab971afc:"71","49c587c2":"99","21730a31":"172",a93c3367:"224","23d30d6b":"250","5da0ca7c":"291",a5bcb3f1:"467","9216ce7b":"513",b564874a:"596",c63e6bd5:"803","54d8bddc":"899","0109100f":"1116","66ffc608":"1365",be2f7876:"1387",ef01e1dd:"1523","7981506b":"1647","902d2d1d":"1652","9ecb4d01":"1664",eee57cd1:"1671",a971b35f:"1940","0149cacd":"2044","40d51a61":"2097",e1e17943:"2196",ac4bed99:"2312",fa423b6e:"2427","935116ff":"2638","80631bfd":"2656","60b67194":"2905","7bb83d6b":"2916","9ef1e345":"2989","3bedcc76":"3044","7174660f":"3102",d4d22ad8:"3145",d0a0235c:"3191",ec28562d:"3197","92b043a3":"3216","0b092b5c":"3283","5e94ba2e":"3326",b7343c9b:"3397","1e070b7c":"3398","020a22ba":"3650","6c1d24e1":"3667","7b7fec6b":"3914",c81517c7:"3919",feba251b:"4125","5017cef7":"4151","3c93ed7e":"4195","21cfb395":"4244","38680a69":"4328",f28093e3:"4504",ffa15017:"4513",f2dc10f7:"4585","9654b394":"4631",a3db1255:"4874","4482beb5":"4882","75600d79":"4929",e54b1e77:"5061",d87811ce:"5129",aa4fa4fb:"5313","622596e9":"5352",f2a3bf8e:"5383","631dea17":"5512",d16a2606:"5714","66e9ea68":"5909",d613e1f8:"5920","85e709dc":"5981","391378fa":"6027",a8ef1ed2:"6386","5b4a63ac":"6443","31c3e3d7":"6517","0c99e969":"6537",efc338fe:"6553","111e23e1":"6695","85954f48":"6734",d1284c82:"6799",f49551b9:"6822",cc519fb4:"6824","9e6b2559":"6968",f38fa80d:"6971","7d9c461e":"6978",ff96de6e:"7078",bd1a8573:"7091","30a13577":"7092","365726b0":"7108","7e91f3e1":"7120",bb4987bb:"7155","97ce6959":"7162",b7e69c77:"7318",d8c5fc94:"7451","81f033b8":"7485",bd0e022f:"7500","32d13eb8":"7874","43de05a8":"8023","7d280bdc":"8135","93015a15":"8145","39bddd84":"8188",c565b8da:"8210",d163ea32:"8313",fa8af309:"8328",b4473d93:"8407",f983631a:"8482","2b8a5969":"8638","1b93ff3d":"8671",aa01ca6a:"8809","6fdd5bc4":"8882","6d2c1101":"8906",de8a7b18:"9028",debbc0e2:"9119",e5523a26:"9225","407bcc70":"9235","541bc80d":"9365","0ffc31bc":"9444",cf4d312e:"9542","36edbaa2":"9550","9db3bdac":"9615","14eb3368":"9817","13af1bdb":"9836",bfd6b54b:"9907","7097fbbc":"9947","70cd875c":"10109",cdebfca4:"10228",a7da438d:"10270","05fa5837":"10436","6773ef05":"10497",e8b1baf4:"10650",caf7e36c:"10918","26e1978a":"10987",ba17e21b:"11174",c98c0daa:"11203","2913cae6":"11311",a15a0d8e:"11321","65031edd":"11326","4d2bb41f":"11342",ba5e62dd:"11398","885bf670":"11656","3d446fd0":"11875",b63d08bc:"12228","1fb2401b":"12442","31eb4af1":"12549","885da4ef":"12555","58ac1d26":"12558",a6d8b730:"12560","5aabd190":"12567",b48b6b77
:"13253",f65f22ef:"13280","6d905346":"13351","70808dd0":"13460","55920b47":"13588","3be6e3bd":"13595",c642f758:"13617","1ac29206":"13718","8bd7a1a3":"13896",bd0b26a5:"13924","911bbfa4":"13979","0b0df062":"13995","3105dae0":"14061","03902e07":"14088","31585cea":"14095",f71ac404:"14143",af9acd56:"14299",c099652b:"14369","83cbebfb":"14386",d3fe7aed:"14396","79db63f1":"14549","40a26966":"14610","9dd89af2":"14670","763f2b13":"14713","39ed38cd":"14840",f6310963:"14908","4338ab08":"15196",f9c66408:"15497",a97b7821:"15658",a466ebec:"15888","271906a0":"15970","21edad34":"16022","891a9b8f":"16058","6ecc8728":"16071","7d0b3c01":"16153",ff3504dd:"16161","66fe7120":"16379",fdbb9241:"16528",a9347149:"16635",e1bbb98e:"16672",df878b79:"16876","251d94d8":"16973",fe34d639:"17275","45e19d44":"17283",f77885f5:"17457","528fc62e":"17511",c003d460:"17726","00c88225":"17757","3e697946":"17785",b530e783:"17883","996d98f3":"17887","9a71d807":"17989","7f9f61f2":"18025",abc9098e:"18050","29dde6c8":"18084",d19aead5:"18100","2730c631":"18143","5bebce7d":"18156","074a0372":"18186","4e07c49f":"18318","18ccacf6":"18559","07645771":"18734","8282a203":"18746","31793acc":"18883","4d58aa3f":"18892","6e11cc87":"18928",bc4716d5:"18998","584b298a":"19177","86c7426e":"19212","8a064b88":"19305",b7ec56b9:"19408","126e88af":"19427","6f49328c":"19493","83fe529b":"19504","974829b4":"19531","84eafbb4":"19625","3729e987":"19671",c5593510:"19709","2edcde3e":"19733",dc7ad1ac:"19806","17f9c41b":"19832","760eed0f":"19876","8781c463":"19939","6bf1075e":"19962",b73dbab9:"20040",ef0f9e32:"20061",c9664647:"20169","9b7bae35":"20303","1d014bb1":"20602",cbbe4dac:"20689","120dd2fd":"20707","6d0dfc8d":"20764",be3ddcfb:"20911",e01a2739:"20917","4d0df69e":"20983","653c19c7":"21015",ae5e6a48:"21134","6a89e0dd":"21143","0e728709":"21207","0feddf78":"21228",e22055a4:"21379",f18d5795:"21643",aa282e34:"21688",c0ef9e49:"21823","7f2bec55":"21983","43a49a39":"22030",af199e5b:"22087","06ceb223":"22129","9bc49845":"22163","1ca03b4b":"22238","0b0f030b":"22456",ef2624d8:"22523","0e8c522c":"22540","66b5d69c":"22583",a8f480dd:"22604","4995f874":"22777","6775be7c":"22898",fe423ebe:"22940","9e91305d":"22997","500c9b63":"23064",ede8882f:"23228","909cadf6":"23231","8113fd14":"23252",c1f9ba1e:"23310","89c49d10":"23320","57b7b037":"23343","23896e06":"23435","2c9f485d":"23522",edbf4496:"23536","2457e7c2":"23545",c7599d12:"23663","332c497c":"23714","99961c3d":"23804","9a02f8a7":"23898",f1d5089f:"24058","92ce2bd2":"24066","15d86f95":"24101","0c48ef63":"24109",b7738a69:"24158",a8565f1f:"24266","395508da":"24282",fbfa5dfc:"24401","7fc9e2ed":"24467",e6de5f28:"24501",fd378320:"24946","7d30361b":"24986","10fd89ee":"25079","6c0ce6d0":"25251","8b2f7dd6":"25283","2b1e7b76":"25427",b9b67b35:"25433","59740e69":"25451",f265d6a5:"25513","06673fe1":"25547","75071a94":"25579","9427c683":"25833",bd61737f:"25898",cc7818bb:"26067","85860bdc":"26084",a958884d:"26201","3bed40a0":"26291",f35b8c8b:"26311",ec205789:"26521","08985b86":"26654","5667bf50":"26686","3be4d1c2":"26695",ceb6bd62:"26858",bb1d1845:"27109","6d03c6cb":"27167",e022cd8b:"27270",ec2b56b1:"27276",ec11103c:"27303","552bb95e":"27324","0f6a2fca":"27704","865c04d0":"27982","56405cb8":"28085","916fb87b":"28134","95771e39":"28139","50e78136":"28219","9ce40ebc":"28261","7f6814ed":"28367",fc338eb2:"28475","2f4d1edb":"28476",db082e36:"28514","8af04d56":"28516","018243f8":"28623","4a0c84c3":"28699",f6ca5dc0:"28800","2f74be58":"28880","8d83f575":"28882","48c7b3a1":"28906","3417a016":"28922",da9049b8:"29014","8f32218b":"29025","44573fa4":"2
9050",e5977951:"29066",a670ed1c:"29131","6fe0ccd0":"29191",f5da8015:"29272","1be78505":"29514","4ef1f024":"29520","949a554a":"29698",cff5e41a:"29717","16a52e74":"29782","9e530f0a":"29818","3291c538":"29831",b604d5b2:"29864","729f5dd4":"29871","2c86cbaa":"29886","14e9211b":"29899","3d8cf439":"29978",c32e37fe:"29980","26db341a":"30062","04829abe":"30216",ff9e51b7:"30295","26bc6c41":"30419","1dc72111":"30433","019a0579":"30454","73c32a6c":"30470","10b7b761":"30589","4f643cbc":"30677","8dc6ea19":"30678","534db397":"30800","845c1fa7":"30820","8900c226":"30834","6b685afe":"30837","0c9e4d11":"30865","8d5884d6":"30885","683d9354":"30979",c8b95361:"31009",d1036fb2:"31023","9b98b06f":"31044","23c664e3":"31050","928e95c7":"31068","5e56d481":"31089",abe8f5f4:"31116","8793e9e6":"31187",cc976a0e:"31293","212ceae2":"31294","347c8874":"31441","2d7d2510":"31471","8a75859c":"31512",ee2f6eec:"31516","6c6d8053":"31570",ce861b37:"31671","9f850ab3":"31824","87719f86":"32224","570c64c0":"32319","09e7c68c":"32410","0eb0d7dd":"32446","6167ec10":"32491","5d8d28d6":"32567",f8c45ac9:"32652",a5461ca4:"32689","46d1dc13":"32839",f1c17b7f:"32872",e9268009:"32892","0ef4df13":"32914","9fcb81d2":"33023",a9776c25:"33076","95f7392c":"33083",ad516382:"33131",dd0c884c:"33138",cab767d9:"33178",fa17a3e5:"33181","5af48372":"33223","3deda206":"33260","82dec33c":"33261","9ebfae5b":"33329","765a551b":"33407","586fa356":"33725","5b659de8":"33889","1943e34c":"33920","9b00304e":"34020","3db5eb91":"34077","273b8e1f":"34079","4a797306":"34153","23a156eb":"34206",ff318c38:"34293",f8338e5f:"34294",e48c3912:"34323",c4a71dd9:"34407",f0f4a691:"34458","5c8ad115":"34460","5bea2473":"34475","592e779d":"34552",de061f48:"34590",c93364c6:"34647","813ebe83":"34656","71408d45":"34748","116bb944":"34766","243071a0":"34784","5c392fa5":"34792","99a27b29":"34800","16046cb7":"34882","2c06af7c":"34943","9c12417e":"34979",a2bcabb3:"35038",b269633b:"35069","3576f003":"35214","5334bf47":"35216","907c8c6a":"35387","1cf42300":"35466","09e24d74":"35577","032d72a0":"35614",a2e876c5:"35647","90bfd346":"35768",c30c381e:"35809",df463adb:"35874","41f4b8cc":"35879",b3a22aab:"36009",ade0010f:"36312",b3fdbb6a:"36495",f3d03ec8:"36511","8d4185e0":"36673",eb87086a:"36933","63849fd3":"36935","7c43c98e":"36983","6d92a4b5":"37021",ac6b62e9:"37055","6e357be7":"37058",c3a94ed1:"37208","4f4166ed":"37257","229edc10":"37316","6dfd1bfa":"37362","3d7b9a1b":"37426","7779798d":"37894",febe4bf0:"37977","5bb043f7":"38056",b34a9ee0:"38104","80b5c97d":"38230","8e018081":"38333","11414e0b":"38368",a77f15f9:"38450","66cd2d70":"38469","0df0bc38":"38504",e80537c2:"38591","5eece5ec":"38679","38cd2ebb":"38768","2f6d8a46":"38792","0cb88ec0":"38819","179d37d3":"38873","7aabbdee":"39033","3ae213b8":"39177",f55bfda4:"39209",d179e89e:"39252",c0ba661c:"39275",a072c73d:"39325",b7e5badb:"39368","22f9ccca":"39605",e6e9a3aa:"39645",f2c01e3a:"39726","8277cea1":"39820","73c3a5ed":"39853",f8904416:"39941",fa8dc2e8:"39972","5b34f9ea":"39978",f49b74d5:"40097","2d7caf96":"40158","0260d23f":"40176","7ad00ade":"40342",dd8797f2:"40365",c2ef5f99:"40665",eaaaa138:"40830","51cdab7b":"40930",f5d7fbaf:"40936","0cbb6061":"40986",ba1c1ac8:"41100","9444e723":"41329","7945275b":"41388",e6241e03:"41537","5b7c576e":"41840","56181a0b":"41863","85db7b61":"41954",e4176d9e:"41958","81192af7":"41998",a8987ce3:"42051","10c43c6e":"42054",bfe6bb1f:"42059","672c9486":"42169","4b66f540":"42187","3ebe5c8a":"42226","909a3395":"42263","48e254a2":"42289",ff4be603:"42371","4447d079":"42436",c14e35a5:"42465",a2ff0b9e:"42551",ff7c02a9:"42609"
,b6cfa9b7:"42620","4b481283":"42690","8e0282b7":"42721","910f748a":"42728","699b0913":"42757","608d6ba6":"42930",e2e305b4:"43037",ea09532f:"43047",dc98fcfb:"43072",f6d93f4d:"43294","87186dce":"43529","39befbbe":"43554","0c94161c":"43635",d7039a99:"43645","4b718ce0":"43697","0682e49e":"43793","8b15c55c":"43849",bfada16a:"43919",b99800de:"43966","0e0b668d":"44023","4b0528ed":"44029","5d86b3d6":"44118",f193e9f7:"44152","5b1c4ba7":"44174",dfca4314:"44393","3d99ef33":"44523","0a54392a":"44765","7e328509":"44797","55a23a94":"44860","0f014490":"44907","46f76bef":"45057",dacae080:"45091","593ffe68":"45114","01f7e848":"45279","8b1145e2":"45287",a258685b:"45571","3476fe8e":"45583",d02f7bc4:"45593",f2abaee2:"45732","239111c7":"45786","83d061ac":"45809","9ee45729":"45878","5216f17c":"46023",e0eae934:"46045",fff3ab69:"46074","8d0344ba":"46218",a5b5d55c:"46284","2fc02015":"46328",e7478c24:"46447","46dcda29":"46838","33a34e3b":"46901",cbbdf9a2:"47062",ba73f26c:"47068",cc1f5ce8:"47082","8b6445a0":"47117","524b67e3":"47276",cf1567e8:"47287",e5c3dfde:"47463","3059ed75":"47568","9ee4ebe9":"47582",ee799351:"47655","3ab425d2":"47708","2f0ee63c":"47838","497aa321":"47975","9a7b56f5":"47986","38bd3ddb":"48031","6f6b3e89":"48150",c81622cc:"48218",bf0d24cf:"48320","3ddb8349":"48426",abfd17f9:"48840","70f3cfb0":"49096",d1b82434:"49169",f99bfa77:"49241","8eed67ba":"49270","98a6ff5a":"49874",ab2e7268:"50017","8ac39bbe":"50052","4d028f11":"50145",e51da90c:"50153","486e741e":"50193","7b4c719b":"50240",acb04c32:"50337","7ce5ebd9":"50362",e86c0d05:"50375","9f305eae":"50437","40a0c599":"50472",aba6a826:"50525","7bcf009a":"50773",d7e1d518:"50849","6f93a078":"50999",e91074f3:"51555","12e76d03":"51574","6f219482":"51593",ea82a261:"51605","3b5ffa57":"51625",af6e989f:"51706",e6fe050f:"51768","42a4a45b":"51830",b63fdeeb:"51840","86a7da57":"51945","22a76d89":"52094","08ba51c1":"52126","92bceb62":"52251","79bae4c5":"52286",fcb00301:"52491","3b1e54e9":"52499","2006be57":"52573",a18114c4:"52586","28599d52":"52593",c04dcf0d:"52715","0e46f7bf":"52789",f888d9d8:"52870","1df93b7f":"53237",c55f973e:"53371","6cd64148":"53442","3ca132b1":"53675",f20f879f:"53823",b684abf7:"54125","6afbfa44":"54133","1632abda":"54178","4c8d1cae":"54210",b3c952b5:"54250",d6f7d5e2:"54265",fa5bdf0c:"54363","4bae0029":"54382","3034400c":"54397","612ebb8a":"54487",cca83a59:"54513",e0668c88:"54591","130a23fd":"54756",fd67079f:"54778",ed07f994:"54786",dd8be3b2:"54794","32f0f819":"54855",a463ff81:"55043","5560d84e":"55216","661e4fa4":"55239","08472b2d":"55273","746f419e":"55335","1710d498":"55478","8e23b856":"55552","7ec28fd9":"55693","7f536709":"55726","2e18dbc8":"55745","676a3180":"55821","407fa3a0":"55925",f2d325f1:"55962","640fe435":"56290",ac4fb807:"56424",e8d36425:"56513","71f8452f":"56614",f1525ef1:"56750","151869e3":"56795",d4a6dda9:"56902","918ae6ff":"57121","7f039048":"57126","522a40f8":"57242","1b4282d0":"57258",fb218ddd:"57293","3ad7b662":"57341",f251ab77:"57489",e4b4615d:"57598","6e586ee3":"57599",d06effa9:"57699","34660ac5":"57749","163044ef":"57780",a3c98c45:"57820","84c320c1":"58009","4893a1cb":"58042",e345afee:"58096",a045168c:"58182","09e9a7df":"58197","6145eda0":"58234","551b313a":"58247","649a76e7":"58356",ea41aad0:"58564","4f9404e5":"58768",cc6053aa:"58818",cf14af90:"58822",bf2622dd:"58824","68709c70":"58914","8938295e":"59051",de11ece8:"59060","010f8398":"59181",af049e12:"59191","07a6f1c2":"59241","8962034b":"59248","902aff6f":"59336",d959d974:"59342",b43aa387:"59427","0cd38f48":"59442","081ed9af":"59496",c2ed794e:"59506","897798e8":"59533",d24
3562e:"59592",e56a1a2c:"59771","619d2e79":"59900",f929d4df:"59982","918c9b38":"59992","78f8003c":"60000","7b2e834b":"60185","34d5cc00":"60331","05a720dd":"60434",bb341369:"60518",b8677fbf:"60603","1ebc7fe2":"60682","16bb304a":"60868",a79d55be:"61007","4c13f84f":"61200",f8bc4080:"61210",f497508e:"61249",e50573ba:"61271",a529f863:"61361",e31a63b7:"61543",a882bd74:"61643","5e52bbeb":"61793","83a26c48":"62117","912fcb5a":"62235","5c77ea5f":"62307",bdd03912:"62318","8e993d66":"62319","686c1ad3":"62474","6312a106":"62493","54d1c079":"62523",bc1c33e4:"62547","8fca97e0":"62655","34d502be":"62867",c38f23a9:"62948","6dffe7c4":"62983",f29affbe:"63105","877a3c1c":"63387","7bc70741":"63604",cb582f54:"63663","92264b81":"63777","010118f9":"63801","86c8f7cd":"63808",cacfa11d:"64000",d6990b47:"64032","555f2cec":"64072",e8591f69:"64100","07b92fc6":"64122",e715560c:"64485","6694e7e9":"64525",fd0f74aa:"64533",eee9e2f1:"64596","300bd484":"64656",e74888b9:"64754","5a7e5a43":"64766",dfd588b8:"64824","42d0afac":"64871","610e19f0":"64900",b81f3fb0:"64967","637ec626":"65132","8d3be60d":"65501",ef0f3981:"65592","88dde0bb":"65594","90dccef4":"65641","21d2296a":"65965","06b7cd3c":"66026","9d336f66":"66081",cf494ba6:"66187","1fb9ab5c":"66238","5f0246ae":"66256",a5560bad:"66303","87cc8f7c":"66336","1d642165":"66462","9c53d859":"66465",e0052e0c:"66597","12b52520":"66958","49763a0b":"67010",d7124adb:"67061",c7b80b67:"67069","15f4efbb":"67132","2d8700b9":"67343","85ac525a":"67448","18caf9ef":"67583","1693c0b8":"67597",a48eac25:"67638",a23744f9:"67670",d29db0e3:"67908","8d96489a":"67954","8858d0ce":"68034",db9653b1:"68126",f301134a:"68258","8751004c":"68689","5335ef4f":"68757","9b89ba00":"68793",c3ca7a6a:"68823",ae5838f0:"68943","1a54bfd0":"69111",a5f4c814:"69125","193f200e":"69209","212137e2":"69234","140e6a69":"69254","9bfbb8bc":"69628",d5d7628b:"69629","607292ce":"69647","7facae8f":"69843","6d933e1d":"69899","41e02281":"70081","16b47049":"70130","06b3b671":"70178","7bd49e6c":"70249",d72ada40:"70277","1fc4ed50":"70367",f53e2381:"70504",ce79b72a:"70543","65306ecf":"70614","21d3c1c7":"70706","06876062":"70964","99c371aa":"71081",dff7b4e8:"71160","5ed92a05":"71169","167f5be9":"71287",c26ab7d5:"71476",dcf1d6e7:"71516",d93a0aad:"71544",ff9d88b6:"71698",fcfa677e:"71789","39afc900":"71811","59dfcfb5":"72070","7f31124b":"72189","66179fb5":"72331",cf945ce5:"72500","084a18af":"72613","1adeac4a":"72638","150a4d14":"72740","2e2a73ec":"72887",d703ca6f:"72952",d9b3adf3:"72957","2e6d047c":"72978","1eec97be":"73238","4efa0483":"73300",aa02927d:"73338",b26f6fa9:"73356","6594bd70":"73369","63b8176f":"73452",c7953305:"73537","12dcfbad":"73618","8f9c5733":"73725","154cbeb4":"73745",af29c71b:"73766","566ea6d5":"73975",ff35d8ff:"74019",a0541488:"74061","67e63bc0":"74091",ba454016:"74132",ef4e0f5d:"74136","769f97b7":"74332","4c8fc79c":"74441",e6a17fa0:"74465","4bbc58d4":"74480","80ea5ae7":"74578","016f0e16":"74794","8b87f6f5":"74839","5b34d9eb":"75070","29ff1658":"75118","8d81369e":"75123","61c61e17":"75203",ca1d44bc:"75273",c176dc4f:"75546","43a232e9":"75567",ff078e30:"75671","1ac49947":"75702","64e30bbc":"75897",b92bff04:"75911",d6f3938e:"76029","172c9869":"76180","99ae7254":"76222","02de7b5a":"76240","2363ed29":"76355","4f8fd4be":"76360",c0f7075f:"76369","359e34b0":"76527",d12dbf4d:"76793","198182f0":"76878","8d493a07":"76895",cb870251:"76924","9eb4c1b4":"77053","60e8b504":"77170","4deae4de":"77413","78dc40c2":"77427","56c932ee":"77527","274eaedf":"77655","60043c0d":"77680",be698a2c:"77887","971cbe2f":"77923","1ae50e88":"78035",c80936bd:"7806
3",ed51eb7d:"78072","516dec85":"78114","41742cda":"78233",df22f3af:"78248","118229e6":"78452","2b5e4b34":"78863","0fcbeed9":"78924","69f3d9b5":"78991",a17fb62b:"79073","95f18dd4":"79164",f83967c4:"79290","10c28d6f":"79298","73a7bd5f":"79356",df5a3016:"79478","27e1a14b":"79479",fbff3b11:"79543",c733e485:"79691","6b2b8280":"79829","4bedd8c5":"79895",b00a2879:"79958","092519d2":"79963","935f2afb":"80053","18dd253f":"80132",fc8aebe3:"80192",eac8f2ef:"80268","2d35b91c":"80380","83cd8f20":"80709",f97cc188:"80895",abf6a1f1:"80940","0c1ee94a":"81135","2dd65ece":"81477",a39041db:"81667","82a4f002":"81708","6a0b4355":"81835","35fa8025":"81934","919b108c":"82132","51da09c7":"82250",dc130668:"82342","3f6554cb":"82348",d885d629:"82360",fadcaea6:"82423","7bcf7096":"82513",aa395a59:"82614",ba4efbe0:"82621","3ebee193":"82864",bf02c3ce:"82982","39b565ff":"82989","39c8ecdc":"83069","1a42aba3":"83074",a26f7afa:"83147","779753bc":"83692","66716ec1":"83893","10f908b7":"83957","363318d5":"84063","737371dd":"84097","6e366b57":"84242","8da7304d":"84362","3b12bc8a":"84477","6eeb04e2":"84500","6cb122e3":"84513","4cec253a":"84710","42325f5c":"84745","211f58b1":"84754","7e5ee96c":"84841","5c27dd68":"84847","34b19815":"84854",e9d5739e:"84888","21bf64ca":"84941","6e5d074b":"85011","7bc3feb7":"85054","60d04b47":"85098",f8482b2c:"85217","3743f01c":"85419","7e446cc1":"85455","6d480200":"85493","82033eb7":"85780",c9b79676:"86009",ed809cac:"86018",b47406fa:"86129","1db21d86":"86150","08e3aaa9":"86333","4ad39569":"86356","45aa7127":"86476",ce66b6fd:"86518","7dd3be25":"86826","4e1da517":"86950",a6a8af40:"87064","27bd5328":"87223","6fc8d865":"87224","1fdab62e":"87240","9980f90c":"87304","96c0bb00":"87313",a48778d9:"87316","96ec050b":"87443","7668acae":"87460","8faa0fb1":"87482",a291f403:"87513",f807eec9:"87634",c07f2717:"87667","3958a146":"87799",e111f111:"87836",b1998bb1:"87866",e5842021:"88179","1e391540":"88187",c0074ddd:"88252","21ad5224":"88295",e1b9986a:"88338",ac930f6e:"88380","3c725018":"88446","6827856d":"88598","4499569c":"88621","6f59957c":"88625","41db9914":"88821","1c56d006":"88831","32ea4ecb":"88879","4455e85b":"89002",b4028749:"89210","13b69fa8":"89574","46c600d5":"89780","443045da":"89806","1f79049f":"89852",fee1f25c:"89986",d25ffd5f:"89987",d043cc46:"90046","41b3e733":"90185",f2497893:"90333","2c91f584":"90342",f60e43ec:"90392","0b78393d":"90398",dd313590:"90431","0a13c98e":"90451",b2335bc1:"90464","73dfc993":"90536","01fb8b11":"90560","22f40a40":"90601","147b0f6a":"90610","8cd0f4f5":"90615","6601f604":"90645","459a783a":"90666","87e7806e":"90865","4302562a":"90896","6eb0ce42":"90976","9c4bbfc4":"91178",ff0539a2:"91213","28b27838":"91231","76ace0dc":"91274","02ee0502":"91287","872e63de":"91304","4cd7d8af":"91406","1671b3fa":"91425",d692bb25:"91523",c839a5b0:"91628","2f535455":"91753","304ed800":"91782",b2735041:"91849","000c061a":"92085","8c828746":"92244","9c42de85":"92269",db5c8692:"92393","799b872c":"92404",d6360c39:"92456","4d4093bb":"92463","6eebf72d":"92744","7861f6df":"92775","8c31caf6":"92778",ae5bb339:"92786","85c3ba36":"92843","8bfba65b":"92851","14e00221":"92964",b984322c:"93023","61e5c5b8":"93071",e7cbe8da:"93151",f7101d4f:"93176","740eb29c":"93195",b83df1bc:"93308","5d075efb":"93340",f7735fb0:"93346",dd435828:"93377","03e8549c":"93400",dede40b0:"93590","4e6907d6":"93749","917734f8":"93832",cb341380:"93837",c9aea766:"94114","7c8407dd":"94123","91dc98f0":"94136","37aba5d3":"94197","43b891d1":"94223","63f66cb7":"94328","9fdf7324":"94337","6c10648f":"94401","878356ab":"94452","487f7f30":"94605",d3e690c
e:"94694","376d31f7":"94696",a233fb97:"94932",b8e39b95:"95020",d666ab7e:"95107","3db8c88b":"95171",bc08bf79:"95281","9936b6c8":"95296",cf282674:"95317","1e173bbe":"95327","5b23c695":"95329","41fbfe2f":"95364","7877b0eb":"95418",e9ef6b31:"95441","0e0f5dd2":"95561","8462ad7a":"95696",edf19300:"95745",e490fd18:"95801","7e254f9d":"95911","90b0cf6d":"95945","8fa500ae":"96055",d6011437:"96078",a322018d:"96082","3061ad92":"96135",f0129862:"96188",ebf2bdda:"96361","64bd79cb":"96426","38e65fdd":"96535","49ea6ca5":"96544","385bc71d":"96547",e23cd647:"96617",a612420b:"96684",b35418cf:"96772","99ba663e":"96831","09e11ac0":"96945","57973c2b":"96971","7f6f8f16":"96979","6816f4c0":"97065",f3034cf4:"97129","9d4bcb9a":"97334",d91e7ab4:"97469","02fbc840":"97523","902fdb3b":"97547","7ea214d5":"97553",ed97cef0:"97617",b094b997:"97782","7513b789":"97816","16cff1eb":"97826",dd6685df:"97850","1a4e3797":"97920","746bf890":"97955","049dc708":"98177","0e7f2915":"98200","1820eb3b":"98218",b7f629d0:"98272",ced65f67:"98623",d1475ab1:"98740","1a6f209f":"98791","6a913ab1":"98868","3ff950a4":"98939","008b0ccc":"99120","8aecb2ef":"99184",ca443c18:"99266","00125b11":"99367",c2f4aca4:"99389","64758f43":"99427",f2d5637b:"99494","49ea4a42":"99607","32db5af4":"99669","15d4dc80":"99839","5e3def70":"99871",b63b5bb9:"99997"}[e]||e,r.p+r.u(e)},(()=>{var e={51303:0,40532:0};r.f.j=(b,f)=>{var c=r.o(e,b)?e[b]:void 0;if(0!==c)if(c)f.push(c[2]);else if(/^(40532|51303)$/.test(b))e[b]=0;else{var a=new Promise(((f,a)=>c=e[b]=[f,a]));f.push(c[2]=a);var d=r.p+r.u(b),t=new Error;r.l(d,(f=>{if(r.o(e,b)&&(0!==(c=e[b])&&(e[b]=void 0),c)){var a=f&&("load"===f.type?"missing":f.type),d=f&&f.target&&f.target.src;t.message="Loading chunk "+b+" failed.\n("+a+": "+d+")",t.name="ChunkLoadError",t.type=a,t.request=d,c[1](t)}}),"chunk-"+b,b)}},r.O.j=b=>0===e[b];var b=(b,f)=>{var c,a,[d,t,o]=f,n=0;if(d.some((b=>0!==e[b]))){for(c in t)r.o(t,c)&&(r.m[c]=t[c]);if(o)var i=o(r)}for(b&&b(f);n{"use strict";var e,b,f,c,a,d={},t={};function r(e){var b=t[e];if(void 0!==b)return b.exports;var f=t[e]={id:e,loaded:!1,exports:{}};return d[e].call(f.exports,f,f.exports,r),f.loaded=!0,f.exports}r.m=d,r.c=t,e=[],r.O=(b,f,c,a)=>{if(!f){var d=1/0;for(i=0;i=a)&&Object.keys(r.O).every((e=>r.O[e](f[o])))?f.splice(o--,1):(t=!1,a0&&e[i-1][2]>a;i--)e[i]=e[i-1];e[i]=[f,c,a]},r.n=e=>{var b=e&&e.__esModule?()=>e.default:()=>e;return r.d(b,{a:b}),b},f=Object.getPrototypeOf?e=>Object.getPrototypeOf(e):e=>e.__proto__,r.t=function(e,c){if(1&c&&(e=this(e)),8&c)return e;if("object"==typeof e&&e){if(4&c&&e.__esModule)return e;if(16&c&&"function"==typeof e.then)return e}var a=Object.create(null);r.r(a);var d={};b=b||[null,f({}),f([]),f(f)];for(var t=2&c&&e;"object"==typeof t&&!~b.indexOf(t);t=f(t))Object.getOwnPropertyNames(t).forEach((b=>d[b]=()=>e[b]));return d.default=()=>e,r.d(a,d),a},r.d=(e,b)=>{for(var f in 
b)r.o(b,f)&&!r.o(e,f)&&Object.defineProperty(e,f,{enumerable:!0,get:b[f]})},r.f={},r.e=e=>Promise.all(Object.keys(r.f).reduce(((b,f)=>(r.f[f](e,b),b)),[])),r.u=e=>"assets/js/"+({19:"906e49ec",21:"f5e3827c",71:"ab971afc",99:"49c587c2",172:"21730a31",224:"a93c3367",250:"23d30d6b",291:"5da0ca7c",467:"a5bcb3f1",513:"9216ce7b",596:"b564874a",803:"c63e6bd5",899:"54d8bddc",1116:"0109100f",1365:"66ffc608",1387:"be2f7876",1523:"ef01e1dd",1647:"7981506b",1652:"902d2d1d",1664:"9ecb4d01",1671:"eee57cd1",1940:"a971b35f",2044:"0149cacd",2097:"40d51a61",2196:"e1e17943",2312:"ac4bed99",2427:"fa423b6e",2638:"935116ff",2656:"80631bfd",2905:"60b67194",2916:"7bb83d6b",2989:"9ef1e345",3044:"3bedcc76",3102:"7174660f",3145:"d4d22ad8",3191:"d0a0235c",3197:"ec28562d",3216:"92b043a3",3283:"0b092b5c",3326:"5e94ba2e",3397:"b7343c9b",3398:"1e070b7c",3650:"020a22ba",3667:"6c1d24e1",3914:"7b7fec6b",3919:"c81517c7",4125:"feba251b",4151:"5017cef7",4195:"3c93ed7e",4244:"21cfb395",4328:"38680a69",4504:"f28093e3",4513:"ffa15017",4585:"f2dc10f7",4631:"9654b394",4874:"a3db1255",4882:"4482beb5",4929:"75600d79",5061:"e54b1e77",5129:"d87811ce",5313:"aa4fa4fb",5352:"622596e9",5383:"f2a3bf8e",5512:"631dea17",5714:"d16a2606",5909:"66e9ea68",5920:"d613e1f8",5981:"85e709dc",6027:"391378fa",6151:"39579801",6386:"a8ef1ed2",6443:"5b4a63ac",6517:"31c3e3d7",6537:"0c99e969",6553:"efc338fe",6695:"111e23e1",6734:"85954f48",6799:"d1284c82",6822:"f49551b9",6824:"cc519fb4",6968:"9e6b2559",6971:"f38fa80d",6978:"7d9c461e",7078:"ff96de6e",7091:"bd1a8573",7092:"30a13577",7108:"365726b0",7120:"7e91f3e1",7155:"bb4987bb",7162:"97ce6959",7318:"b7e69c77",7451:"d8c5fc94",7485:"81f033b8",7500:"bd0e022f",7874:"32d13eb8",8023:"43de05a8",8135:"7d280bdc",8145:"93015a15",8188:"39bddd84",8210:"c565b8da",8313:"d163ea32",8328:"fa8af309",8407:"b4473d93",8482:"f983631a",8638:"2b8a5969",8671:"1b93ff3d",8809:"aa01ca6a",8882:"6fdd5bc4",8906:"6d2c1101",9028:"de8a7b18",9119:"debbc0e2",9225:"e5523a26",9235:"407bcc70",9365:"541bc80d",9444:"0ffc31bc",9542:"cf4d312e",9550:"36edbaa2",9615:"9db3bdac",9817:"14eb3368",9836:"13af1bdb",9907:"bfd6b54b",9947:"7097fbbc",10109:"70cd875c",10228:"cdebfca4",10270:"a7da438d",10436:"05fa5837",10497:"6773ef05",10650:"e8b1baf4",10918:"caf7e36c",10987:"26e1978a",11174:"ba17e21b",11203:"c98c0daa",11311:"2913cae6",11321:"a15a0d8e",11326:"65031edd",11342:"4d2bb41f",11398:"ba5e62dd",11656:"885bf670",11875:"3d446fd0",12228:"b63d08bc",12442:"1fb2401b",12549:"31eb4af1",12555:"885da4ef",12558:"58ac1d26",12560:"a6d8b730",12567:"5aabd190",13253:"b48b6b77",13280:"f65f22ef",13351:"6d905346",13460:"70808dd0",13588:"55920b47",13595:"3be6e3bd",13617:"c642f758",13718:"1ac29206",13896:"8bd7a1a3",13924:"bd0b26a5",13979:"911bbfa4",13995:"0b0df062",14061:"3105dae0",14088:"03902e07",14095:"31585cea",14143:"f71ac404",14299:"af9acd56",14369:"c099652b",14386:"83cbebfb",14396:"d3fe7aed",14549:"79db63f1",14610:"40a26966",14670:"9dd89af2",14713:"763f2b13",14840:"39ed38cd",14908:"f6310963",15196:"4338ab08",15497:"f9c66408",15658:"a97b7821",15888:"a466ebec",15970:"271906a0",15994:"26134010",16022:"21edad34",16038:"62127933",16058:"891a9b8f",16071:"6ecc8728",16153:"7d0b3c01",16161:"ff3504dd",16379:"66fe7120",16528:"fdbb9241",16635:"a9347149",16672:"e1bbb98e",16685:"46551803",16876:"df878b79",16973:"251d94d8",17275:"fe34d639",17283:"45e19d44",17457:"f77885f5",17511:"528fc62e",17726:"c003d460",17757:"00c88225",17785:"3e697946",17883:"b530e783",17887:"996d98f3",17989:"9a71d807",18025:"7f9f61f2",18050:"abc9098e",18084:"29dde6c8",18100:"d19aead5",18143:"2730c631",18156:"5bebce7
d",18186:"074a0372",18318:"4e07c49f",18559:"18ccacf6",18734:"07645771",18746:"8282a203",18883:"31793acc",18892:"4d58aa3f",18928:"6e11cc87",18998:"bc4716d5",19177:"584b298a",19204:"79617745",19212:"86c7426e",19305:"8a064b88",19408:"b7ec56b9",19427:"126e88af",19493:"6f49328c",19504:"83fe529b",19531:"974829b4",19625:"84eafbb4",19671:"3729e987",19709:"c5593510",19733:"2edcde3e",19806:"dc7ad1ac",19832:"17f9c41b",19876:"760eed0f",19939:"8781c463",19962:"6bf1075e",20040:"b73dbab9",20061:"ef0f9e32",20169:"c9664647",20303:"9b7bae35",20602:"1d014bb1",20689:"cbbe4dac",20707:"120dd2fd",20764:"6d0dfc8d",20911:"be3ddcfb",20917:"e01a2739",20983:"4d0df69e",21015:"653c19c7",21134:"ae5e6a48",21143:"6a89e0dd",21190:"95169675",21207:"0e728709",21228:"0feddf78",21379:"e22055a4",21643:"f18d5795",21688:"aa282e34",21823:"c0ef9e49",21983:"7f2bec55",22030:"43a49a39",22087:"af199e5b",22129:"06ceb223",22163:"9bc49845",22238:"1ca03b4b",22456:"0b0f030b",22523:"ef2624d8",22540:"0e8c522c",22583:"66b5d69c",22604:"a8f480dd",22777:"4995f874",22898:"6775be7c",22940:"fe423ebe",22997:"9e91305d",23064:"500c9b63",23228:"ede8882f",23231:"909cadf6",23252:"8113fd14",23310:"c1f9ba1e",23320:"89c49d10",23343:"57b7b037",23435:"23896e06",23522:"2c9f485d",23536:"edbf4496",23545:"2457e7c2",23663:"c7599d12",23714:"332c497c",23804:"99961c3d",23898:"9a02f8a7",24058:"f1d5089f",24066:"92ce2bd2",24101:"15d86f95",24109:"0c48ef63",24158:"b7738a69",24266:"a8565f1f",24282:"395508da",24401:"fbfa5dfc",24467:"7fc9e2ed",24501:"e6de5f28",24946:"fd378320",24986:"7d30361b",25079:"10fd89ee",25251:"6c0ce6d0",25283:"8b2f7dd6",25427:"2b1e7b76",25433:"b9b67b35",25451:"59740e69",25513:"f265d6a5",25547:"06673fe1",25579:"75071a94",25833:"9427c683",25898:"bd61737f",26067:"cc7818bb",26084:"85860bdc",26086:"21996883",26201:"a958884d",26291:"3bed40a0",26311:"f35b8c8b",26521:"ec205789",26654:"08985b86",26686:"5667bf50",26695:"3be4d1c2",26858:"ceb6bd62",27109:"bb1d1845",27167:"6d03c6cb",27270:"e022cd8b",27276:"ec2b56b1",27303:"ec11103c",27324:"552bb95e",27554:"92307374",27704:"0f6a2fca",27918:"17896441",27982:"865c04d0",28085:"56405cb8",28134:"916fb87b",28139:"95771e39",28219:"50e78136",28261:"9ce40ebc",28367:"7f6814ed",28475:"fc338eb2",28476:"2f4d1edb",28514:"db082e36",28516:"8af04d56",28623:"018243f8",28699:"4a0c84c3",28800:"f6ca5dc0",28880:"2f74be58",28882:"8d83f575",28906:"48c7b3a1",28922:"3417a016",29014:"da9049b8",29025:"8f32218b",29050:"44573fa4",29066:"e5977951",29131:"a670ed1c",29191:"6fe0ccd0",29272:"f5da8015",29514:"1be78505",29520:"4ef1f024",29698:"949a554a",29717:"cff5e41a",29782:"16a52e74",29818:"9e530f0a",29831:"3291c538",29864:"b604d5b2",29871:"729f5dd4",29886:"2c86cbaa",29899:"14e9211b",29978:"3d8cf439",29980:"c32e37fe",30062:"26db341a",30216:"04829abe",30295:"ff9e51b7",30419:"26bc6c41",30433:"1dc72111",30454:"019a0579",30470:"73c32a6c",30589:"10b7b761",30677:"4f643cbc",30678:"8dc6ea19",30800:"534db397",30820:"845c1fa7",30834:"8900c226",30837:"6b685afe",30865:"0c9e4d11",30885:"8d5884d6",30979:"683d9354",31009:"c8b95361",31013:"65360910",31023:"d1036fb2",31044:"9b98b06f",31050:"23c664e3",31068:"928e95c7",31089:"5e56d481",31116:"abe8f5f4",31152:"99496549",31187:"8793e9e6",31293:"cc976a0e",31294:"212ceae2",31441:"347c8874",31471:"2d7d2510",31512:"8a75859c",31516:"ee2f6eec",31570:"6c6d8053",31671:"ce861b37",31824:"9f850ab3",32224:"87719f86",32319:"570c64c0",32410:"09e7c68c",32446:"0eb0d7dd",32491:"6167ec10",32567:"5d8d28d6",32652:"f8c45ac9",32689:"a5461ca4",32839:"46d1dc13",32872:"f1c17b7f",32892:"e9268009",32914:"0ef4df13",33023:"9fcb81d2",33076:"a9776c25"
,33083:"95f7392c",33131:"ad516382",33138:"dd0c884c",33178:"cab767d9",33181:"fa17a3e5",33223:"5af48372",33260:"3deda206",33261:"82dec33c",33329:"9ebfae5b",33407:"765a551b",33725:"586fa356",33889:"5b659de8",33920:"1943e34c",34020:"9b00304e",34077:"3db5eb91",34079:"273b8e1f",34153:"4a797306",34206:"23a156eb",34293:"ff318c38",34294:"f8338e5f",34323:"e48c3912",34407:"c4a71dd9",34458:"f0f4a691",34460:"5c8ad115",34475:"5bea2473",34552:"592e779d",34590:"de061f48",34647:"c93364c6",34656:"813ebe83",34748:"71408d45",34766:"116bb944",34784:"243071a0",34792:"5c392fa5",34800:"99a27b29",34882:"16046cb7",34943:"2c06af7c",34979:"9c12417e",35038:"a2bcabb3",35069:"b269633b",35214:"3576f003",35216:"5334bf47",35387:"907c8c6a",35466:"1cf42300",35577:"09e24d74",35614:"032d72a0",35647:"a2e876c5",35768:"90bfd346",35809:"c30c381e",35874:"df463adb",35879:"41f4b8cc",36009:"b3a22aab",36312:"ade0010f",36495:"b3fdbb6a",36511:"f3d03ec8",36673:"8d4185e0",36773:"91647079",36933:"eb87086a",36935:"63849fd3",36983:"7c43c98e",37021:"6d92a4b5",37055:"ac6b62e9",37058:"6e357be7",37208:"c3a94ed1",37257:"4f4166ed",37316:"229edc10",37362:"6dfd1bfa",37426:"3d7b9a1b",37894:"7779798d",37977:"febe4bf0",38056:"5bb043f7",38104:"b34a9ee0",38230:"80b5c97d",38333:"8e018081",38368:"11414e0b",38450:"a77f15f9",38469:"66cd2d70",38504:"0df0bc38",38591:"e80537c2",38679:"5eece5ec",38768:"38cd2ebb",38792:"2f6d8a46",38819:"0cb88ec0",38873:"179d37d3",39033:"7aabbdee",39177:"3ae213b8",39209:"f55bfda4",39252:"d179e89e",39275:"c0ba661c",39325:"a072c73d",39368:"b7e5badb",39605:"22f9ccca",39645:"e6e9a3aa",39726:"f2c01e3a",39820:"8277cea1",39853:"73c3a5ed",39941:"f8904416",39972:"fa8dc2e8",39978:"5b34f9ea",40097:"f49b74d5",40158:"2d7caf96",40176:"0260d23f",40342:"7ad00ade",40365:"dd8797f2",40665:"c2ef5f99",40830:"eaaaa138",40930:"51cdab7b",40936:"f5d7fbaf",40986:"0cbb6061",41100:"ba1c1ac8",41329:"9444e723",41388:"7945275b",41537:"e6241e03",41750:"78718572",41840:"5b7c576e",41863:"56181a0b",41954:"85db7b61",41958:"e4176d9e",41998:"81192af7",42051:"a8987ce3",42054:"10c43c6e",42059:"bfe6bb1f",42169:"672c9486",42187:"4b66f540",42226:"3ebe5c8a",42263:"909a3395",42288:"24647619",42289:"48e254a2",42371:"ff4be603",42436:"4447d079",42465:"c14e35a5",42551:"a2ff0b9e",42609:"ff7c02a9",42620:"b6cfa9b7",42690:"4b481283",42721:"8e0282b7",42728:"910f748a",42757:"699b0913",42930:"608d6ba6",43037:"e2e305b4",43047:"ea09532f",43072:"dc98fcfb",43294:"f6d93f4d",43529:"87186dce",43554:"39befbbe",43635:"0c94161c",43645:"d7039a99",43697:"4b718ce0",43793:"0682e49e",43849:"8b15c55c",43919:"bfada16a",43966:"b99800de",44023:"0e0b668d",44029:"4b0528ed",44118:"5d86b3d6",44152:"f193e9f7",44174:"5b1c4ba7",44393:"dfca4314",44523:"3d99ef33",44765:"0a54392a",44797:"7e328509",44860:"55a23a94",44907:"0f014490",45057:"46f76bef",45091:"dacae080",45114:"593ffe68",45279:"01f7e848",45287:"8b1145e2",45571:"a258685b",45583:"3476fe8e",45593:"d02f7bc4",45732:"f2abaee2",45786:"239111c7",45809:"83d061ac",45878:"9ee45729",46023:"5216f17c",46045:"e0eae934",46074:"fff3ab69",46218:"8d0344ba",46284:"a5b5d55c",46328:"2fc02015",46447:"e7478c24",46838:"46dcda29",46901:"33a34e3b",47062:"cbbdf9a2",47068:"ba73f26c",47082:"cc1f5ce8",47117:"8b6445a0",47276:"524b67e3",47287:"cf1567e8",47463:"e5c3dfde",47568:"3059ed75",47582:"9ee4ebe9",47655:"ee799351",47708:"3ab425d2",47838:"2f0ee63c",47975:"497aa321",47986:"9a7b56f5",48031:"38bd3ddb",48150:"6f6b3e89",48218:"c81622cc",48320:"bf0d24cf",48426:"3ddb8349",48840:"abfd17f9",49096:"70f3cfb0",49169:"d1b82434",49241:"f99bfa77",49270:"8eed67ba",49874:"98a6ff5a",50017:"ab2e7268",5
0052:"8ac39bbe",50145:"4d028f11",50153:"e51da90c",50193:"486e741e",50240:"7b4c719b",50337:"acb04c32",50362:"7ce5ebd9",50375:"e86c0d05",50437:"9f305eae",50472:"40a0c599",50525:"aba6a826",50773:"7bcf009a",50849:"d7e1d518",50999:"6f93a078",51555:"e91074f3",51574:"12e76d03",51593:"6f219482",51605:"ea82a261",51625:"3b5ffa57",51706:"af6e989f",51768:"e6fe050f",51830:"42a4a45b",51840:"b63fdeeb",51945:"86a7da57",52094:"22a76d89",52126:"08ba51c1",52251:"92bceb62",52286:"79bae4c5",52491:"fcb00301",52499:"3b1e54e9",52573:"2006be57",52586:"a18114c4",52593:"28599d52",52715:"c04dcf0d",52789:"0e46f7bf",52870:"f888d9d8",53237:"1df93b7f",53243:"54230287",53371:"c55f973e",53442:"6cd64148",53675:"3ca132b1",53823:"f20f879f",54125:"b684abf7",54133:"6afbfa44",54178:"1632abda",54210:"4c8d1cae",54250:"b3c952b5",54265:"d6f7d5e2",54363:"fa5bdf0c",54382:"4bae0029",54397:"3034400c",54487:"612ebb8a",54513:"cca83a59",54591:"e0668c88",54756:"130a23fd",54778:"fd67079f",54786:"ed07f994",54794:"dd8be3b2",54855:"32f0f819",55043:"a463ff81",55216:"5560d84e",55239:"661e4fa4",55273:"08472b2d",55335:"746f419e",55478:"1710d498",55552:"8e23b856",55693:"7ec28fd9",55726:"7f536709",55745:"2e18dbc8",55821:"676a3180",55925:"407fa3a0",55962:"f2d325f1",56290:"640fe435",56424:"ac4fb807",56513:"e8d36425",56614:"71f8452f",56750:"f1525ef1",56795:"151869e3",56902:"d4a6dda9",57121:"918ae6ff",57126:"7f039048",57242:"522a40f8",57258:"1b4282d0",57293:"fb218ddd",57341:"3ad7b662",57489:"f251ab77",57598:"e4b4615d",57599:"6e586ee3",57699:"d06effa9",57749:"34660ac5",57780:"163044ef",57820:"a3c98c45",58009:"84c320c1",58042:"4893a1cb",58096:"e345afee",58182:"a045168c",58197:"09e9a7df",58234:"6145eda0",58247:"551b313a",58356:"649a76e7",58564:"ea41aad0",58768:"4f9404e5",58818:"cc6053aa",58822:"cf14af90",58824:"bf2622dd",58914:"68709c70",59051:"8938295e",59060:"de11ece8",59181:"010f8398",59191:"af049e12",59241:"07a6f1c2",59248:"8962034b",59336:"902aff6f",59342:"d959d974",59427:"b43aa387",59442:"0cd38f48",59496:"081ed9af",59506:"c2ed794e",59533:"897798e8",59592:"d243562e",59771:"e56a1a2c",59900:"619d2e79",59982:"f929d4df",59992:"918c9b38",6e4:"78f8003c",60185:"7b2e834b",60331:"34d5cc00",60434:"05a720dd",60518:"bb341369",60603:"b8677fbf",60682:"1ebc7fe2",60831:"84960677",60868:"16bb304a",61007:"a79d55be",61200:"4c13f84f",61210:"f8bc4080",61249:"f497508e",61271:"e50573ba",61361:"a529f863",61543:"e31a63b7",61643:"a882bd74",61793:"5e52bbeb",62117:"83a26c48",62235:"912fcb5a",62307:"5c77ea5f",62318:"bdd03912",62319:"8e993d66",62474:"686c1ad3",62493:"6312a106",62523:"54d1c079",62547:"bc1c33e4",62655:"8fca97e0",62867:"34d502be",62948:"c38f23a9",62983:"6dffe7c4",63105:"f29affbe",63387:"877a3c1c",63604:"7bc70741",63663:"cb582f54",63777:"92264b81",63801:"010118f9",63808:"86c8f7cd",64e3:"cacfa11d",64032:"d6990b47",64072:"555f2cec",64100:"e8591f69",64122:"07b92fc6",64485:"e715560c",64525:"6694e7e9",64533:"fd0f74aa",64596:"eee9e2f1",64656:"300bd484",64754:"e74888b9",64766:"5a7e5a43",64824:"dfd588b8",64871:"42d0afac",64900:"610e19f0",64967:"b81f3fb0",65132:"637ec626",65501:"8d3be60d",65592:"ef0f3981",65594:"88dde0bb",65641:"90dccef4",65758:"62041344",65965:"21d2296a",66026:"06b7cd3c",66081:"9d336f66",66187:"cf494ba6",66238:"1fb9ab5c",66256:"5f0246ae",66303:"a5560bad",66336:"87cc8f7c",66462:"1d642165",66465:"9c53d859",66597:"e0052e0c",66958:"12b52520",67010:"49763a0b",67061:"d7124adb",67069:"c7b80b67",67132:"15f4efbb",67343:"2d8700b9",67448:"85ac525a",67583:"18caf9ef",67597:"1693c0b8",67638:"a48eac25",67670:"a23744f9",67908:"d29db0e3",67954:"8d96489a",68034:"8858d0ce",68126:
"db9653b1",68258:"f301134a",68689:"8751004c",68757:"5335ef4f",68793:"9b89ba00",68823:"c3ca7a6a",68943:"ae5838f0",69111:"1a54bfd0",69125:"a5f4c814",69209:"193f200e",69234:"212137e2",69254:"140e6a69",69628:"9bfbb8bc",69629:"d5d7628b",69647:"607292ce",69843:"7facae8f",69899:"6d933e1d",70081:"41e02281",70130:"16b47049",70178:"06b3b671",70249:"7bd49e6c",70277:"d72ada40",70367:"1fc4ed50",70504:"f53e2381",70543:"ce79b72a",70614:"65306ecf",70706:"21d3c1c7",70964:"06876062",71081:"99c371aa",71160:"dff7b4e8",71169:"5ed92a05",71287:"167f5be9",71476:"c26ab7d5",71516:"dcf1d6e7",71544:"d93a0aad",71698:"ff9d88b6",71789:"fcfa677e",71811:"39afc900",72070:"59dfcfb5",72189:"7f31124b",72331:"66179fb5",72500:"cf945ce5",72613:"084a18af",72638:"1adeac4a",72740:"150a4d14",72887:"2e2a73ec",72952:"d703ca6f",72957:"d9b3adf3",72978:"2e6d047c",73238:"1eec97be",73300:"4efa0483",73338:"aa02927d",73356:"b26f6fa9",73369:"6594bd70",73452:"63b8176f",73537:"c7953305",73618:"12dcfbad",73725:"8f9c5733",73745:"154cbeb4",73766:"af29c71b",73975:"566ea6d5",74019:"ff35d8ff",74061:"a0541488",74091:"67e63bc0",74132:"ba454016",74136:"ef4e0f5d",74139:"38341509",74332:"769f97b7",74362:"71078103",74441:"4c8fc79c",74465:"e6a17fa0",74480:"4bbc58d4",74578:"80ea5ae7",74794:"016f0e16",74839:"8b87f6f5",75070:"5b34d9eb",75118:"29ff1658",75123:"8d81369e",75203:"61c61e17",75273:"ca1d44bc",75546:"c176dc4f",75567:"43a232e9",75671:"ff078e30",75702:"1ac49947",75897:"64e30bbc",75911:"b92bff04",76029:"d6f3938e",76180:"172c9869",76222:"99ae7254",76240:"02de7b5a",76355:"2363ed29",76360:"4f8fd4be",76369:"c0f7075f",76527:"359e34b0",76793:"d12dbf4d",76878:"198182f0",76895:"8d493a07",76924:"cb870251",77053:"9eb4c1b4",77170:"60e8b504",77413:"4deae4de",77427:"78dc40c2",77527:"56c932ee",77655:"274eaedf",77680:"60043c0d",77887:"be698a2c",77923:"971cbe2f",78035:"1ae50e88",78063:"c80936bd",78072:"ed51eb7d",78114:"516dec85",78233:"41742cda",78248:"df22f3af",78452:"118229e6",78863:"2b5e4b34",78924:"0fcbeed9",78991:"69f3d9b5",79073:"a17fb62b",79164:"95f18dd4",79290:"f83967c4",79298:"10c28d6f",79356:"73a7bd5f",79478:"df5a3016",79479:"27e1a14b",79543:"fbff3b11",79691:"c733e485",79829:"6b2b8280",79895:"4bedd8c5",79958:"b00a2879",79963:"092519d2",80053:"935f2afb",80132:"18dd253f",80192:"fc8aebe3",80268:"eac8f2ef",80380:"2d35b91c",80709:"83cd8f20",80895:"f97cc188",80940:"abf6a1f1",81135:"0c1ee94a",81477:"2dd65ece",81667:"a39041db",81708:"82a4f002",81835:"6a0b4355",81934:"35fa8025",82132:"919b108c",82250:"51da09c7",82342:"dc130668",82348:"3f6554cb",82360:"d885d629",82423:"fadcaea6",82513:"7bcf7096",82614:"aa395a59",82621:"ba4efbe0",82864:"3ebee193",82982:"bf02c3ce",82989:"39b565ff",83069:"39c8ecdc",83074:"1a42aba3",83147:"a26f7afa",83692:"779753bc",83893:"66716ec1",83957:"10f908b7",84063:"363318d5",84097:"737371dd",84242:"6e366b57",84362:"8da7304d",84477:"3b12bc8a",84500:"6eeb04e2",84513:"6cb122e3",84710:"4cec253a",84745:"42325f5c",84754:"211f58b1",84841:"7e5ee96c",84847:"5c27dd68",84854:"34b19815",84888:"e9d5739e",84941:"21bf64ca",85011:"6e5d074b",85054:"7bc3feb7",85098:"60d04b47",85217:"f8482b2c",85419:"3743f01c",85455:"7e446cc1",85493:"6d480200",85780:"82033eb7",86009:"c9b79676",86018:"ed809cac",86129:"b47406fa",86150:"1db21d86",86333:"08e3aaa9",86356:"4ad39569",86476:"45aa7127",86518:"ce66b6fd",86826:"7dd3be25",86950:"4e1da517",87064:"a6a8af40",87223:"27bd5328",87224:"6fc8d865",87240:"1fdab62e",87304:"9980f90c",87313:"96c0bb00",87316:"a48778d9",87443:"96ec050b",87460:"7668acae",87482:"8faa0fb1",87513:"a291f403",87634:"f807eec9",87667:"c07f2717",87799:"3958a146",87836:"e
111f111",87866:"b1998bb1",88179:"e5842021",88187:"1e391540",88252:"c0074ddd",88295:"21ad5224",88338:"e1b9986a",88380:"ac930f6e",88446:"3c725018",88598:"6827856d",88621:"4499569c",88625:"6f59957c",88821:"41db9914",88831:"1c56d006",88879:"32ea4ecb",89002:"4455e85b",89210:"b4028749",89574:"13b69fa8",89780:"46c600d5",89806:"443045da",89852:"1f79049f",89986:"fee1f25c",89987:"d25ffd5f",90046:"d043cc46",90185:"41b3e733",90333:"f2497893",90342:"2c91f584",90392:"f60e43ec",90398:"0b78393d",90431:"dd313590",90451:"0a13c98e",90464:"b2335bc1",90536:"73dfc993",90560:"01fb8b11",90601:"22f40a40",90610:"147b0f6a",90615:"8cd0f4f5",90645:"6601f604",90666:"459a783a",90865:"87e7806e",90896:"4302562a",90976:"6eb0ce42",91178:"9c4bbfc4",91213:"ff0539a2",91231:"28b27838",91274:"76ace0dc",91287:"02ee0502",91304:"872e63de",91406:"4cd7d8af",91425:"1671b3fa",91523:"d692bb25",91628:"c839a5b0",91753:"2f535455",91782:"304ed800",91849:"b2735041",92085:"000c061a",92244:"8c828746",92269:"9c42de85",92393:"db5c8692",92404:"799b872c",92456:"d6360c39",92463:"4d4093bb",92744:"6eebf72d",92775:"7861f6df",92778:"8c31caf6",92786:"ae5bb339",92843:"85c3ba36",92851:"8bfba65b",92964:"14e00221",93023:"b984322c",93071:"61e5c5b8",93151:"e7cbe8da",93176:"f7101d4f",93195:"740eb29c",93308:"b83df1bc",93340:"5d075efb",93346:"f7735fb0",93377:"dd435828",93400:"03e8549c",93590:"dede40b0",93749:"4e6907d6",93832:"917734f8",93837:"cb341380",94114:"c9aea766",94123:"7c8407dd",94136:"91dc98f0",94197:"37aba5d3",94223:"43b891d1",94328:"63f66cb7",94337:"9fdf7324",94401:"6c10648f",94452:"878356ab",94605:"487f7f30",94694:"d3e690ce",94696:"376d31f7",94932:"a233fb97",95020:"b8e39b95",95107:"d666ab7e",95171:"3db8c88b",95281:"bc08bf79",95296:"9936b6c8",95317:"cf282674",95327:"1e173bbe",95329:"5b23c695",95364:"41fbfe2f",95418:"7877b0eb",95441:"e9ef6b31",95561:"0e0f5dd2",95696:"8462ad7a",95745:"edf19300",95801:"e490fd18",95911:"7e254f9d",95945:"90b0cf6d",96055:"8fa500ae",96078:"d6011437",96082:"a322018d",96135:"3061ad92",96188:"f0129862",96361:"ebf2bdda",96426:"64bd79cb",96535:"38e65fdd",96544:"49ea6ca5",96547:"385bc71d",96617:"e23cd647",96684:"a612420b",96772:"b35418cf",96831:"99ba663e",96945:"09e11ac0",96971:"57973c2b",96979:"7f6f8f16",97065:"6816f4c0",97129:"f3034cf4",97334:"9d4bcb9a",97469:"d91e7ab4",97523:"02fbc840",97547:"902fdb3b",97553:"7ea214d5",97617:"ed97cef0",97782:"b094b997",97816:"7513b789",97826:"16cff1eb",97850:"dd6685df",97920:"1a4e3797",97955:"746bf890",98177:"049dc708",98200:"0e7f2915",98218:"1820eb3b",98272:"b7f629d0",98623:"ced65f67",98740:"d1475ab1",98791:"1a6f209f",98868:"6a913ab1",98939:"3ff950a4",99120:"008b0ccc",99184:"8aecb2ef",99266:"ca443c18",99367:"00125b11",99389:"c2f4aca4",99427:"64758f43",99494:"f2d5637b",99607:"49ea4a42",99669:"32db5af4",99839:"15d4dc80",99871:"5e3def70",99997:"b63b5bb9"}[e]||e)+"."+{19:"73207e89",21:"a3309319",71:"148bf97d",99:"3f2b222d",172:"c102a782",224:"487be67f",250:"e9efb2fd",291:"b2f7c218",467:"3701a1f3",513:"531e85fa",596:"7b2cab9f",803:"9d68a6cc",899:"26bbe5e9",1116:"56b8b25b",1365:"573cbaf6",1387:"23980d09",1523:"ec2891cc",1647:"63a2d108",1652:"94113f44",1664:"180295d2",1671:"62a8f5fd",1940:"3261f2e9",2044:"d2700165",2097:"7188f7ec",2196:"3febcd57",2312:"093e5909",2427:"f701c31d",2638:"118a41f9",2656:"37531cd2",2905:"76c11c59",2916:"2ecd567c",2989:"cf806e65",3044:"712f76ea",3102:"bf11a1b5",3145:"b6e9e373",3191:"74dd2862",3197:"9da021ce",3216:"eacb9804",3283:"cd4afe73",3326:"58beb1bf",3397:"b0ae73af",3398:"63275511",3650:"2e235066",3667:"1f8a4cb5",3914:"54e8dd0f",3919:"41dcf0b8",4125:"96f2442f",4151:"1a6a
fc47",4195:"7402cd75",4244:"8675cdba",4328:"e0ebdf09",4504:"025ef68d",4513:"558b14b1",4585:"05c4d67b",4631:"1d37eb2a",4874:"39750b03",4882:"d87f6d94",4929:"18e132f8",4972:"a7243668",5061:"509f2053",5129:"479f5110",5313:"5f8b4c43",5352:"578d6913",5383:"306e0d1d",5512:"8f6a2d54",5714:"ad1f0a21",5909:"51facc0d",5920:"8c8aae04",5981:"efaedd7a",6027:"6f5c789e",6151:"7a76de9c",6386:"67f2de5f",6443:"14902fad",6517:"34dee69c",6537:"ba99c5c9",6553:"08568a59",6695:"cfee6a07",6734:"6772bb12",6799:"67367b32",6822:"1511eca7",6824:"6e7a03e3",6968:"a44bcc6e",6971:"873ad005",6978:"6c523e96",7078:"521e19d7",7091:"8c3d2fe2",7092:"cc64c0ff",7108:"d17c6119",7120:"d18d3e66",7155:"f52fceb3",7162:"cf567055",7318:"3835ac08",7451:"334acc4c",7485:"7b41d7f5",7500:"099ef3cd",7874:"7c1cf1bb",8023:"897a9b52",8135:"672f0bde",8145:"5d99a1dd",8188:"3b884b4b",8210:"d7b2d51a",8313:"5d208e75",8328:"d30eb04f",8407:"28cf1826",8482:"cceadeb0",8638:"c9ba8e41",8671:"b4177ac9",8809:"959e39c4",8882:"0bf188b0",8906:"cfd44206",9028:"9d1978ac",9119:"fd788116",9225:"a93f8834",9235:"b22f9f6d",9365:"8b245b69",9444:"ca6f47e0",9542:"4f4a1a23",9550:"83ecb96d",9615:"54fc882e",9817:"83bf0cd5",9836:"c72917a2",9907:"676fbeba",9947:"5cfa1c77",10109:"beb060f2",10228:"aa539b8b",10270:"8ae752b8",10436:"1ec22cec",10497:"809aee5f",10650:"547a5e75",10918:"ef85344b",10987:"c12ef65c",11174:"2c33f6da",11203:"9539d62e",11311:"f40cd0a5",11321:"cd4efea5",11326:"6cbc304c",11342:"2bccdb3b",11398:"f8e68ae8",11656:"ac13cb1c",11875:"d1539320",12228:"d521a8eb",12442:"75a11235",12549:"16f88776",12555:"900d6f87",12558:"0ac14857",12560:"9bb2eb9d",12567:"7d910920",13253:"0ef1443d",13280:"29f73853",13351:"804b2952",13460:"9e91d59d",13588:"d0b6d6aa",13595:"6c1f4a63",13617:"ea5a2c00",13718:"94e8a0fc",13896:"76d63f5f",13924:"ba0bd4a2",13979:"aad133c6",13995:"243e677b",14061:"6004bdb7",14088:"6e4abf52",14095:"18927dce",14143:"70c2608c",14299:"f4efc87a",14369:"b25e3d4c",14386:"acd52792",14396:"fd9bfcdc",14549:"607a549f",14610:"a9ad2a64",14670:"63c0f96a",14713:"8c81902c",14840:"4ae7e4f2",14908:"954b802a",15196:"9dd233ed",15497:"2c78c17f",15658:"e22235f9",15888:"50a93f8c",15970:"360efba8",15994:"9794e987",16022:"c86edca7",16038:"8f228690",16058:"f0b25bfc",16071:"0df54331",16153:"4800d10d",16161:"3a69d696",16379:"38089655",16528:"f38ed133",16635:"e969de7c",16672:"38ba7418",16685:"3022befc",16876:"19436136",16973:"9fbbd5c9",17275:"b3f920a9",17283:"b641bac3",17457:"ab4ccae6",17511:"fdf99bc4",17726:"fc4837d4",17757:"10a7c58d",17785:"172166c7",17883:"3f659ac3",17887:"fbdcba1d",17989:"52e3fc0e",18025:"8e9620ce",18050:"5b8280aa",18084:"fcd7fdb2",18100:"0a6d52c3",18143:"b09cd8a9",18156:"a975996f",18186:"df8b47fc",18318:"edf202fa",18559:"5c93aa35",18734:"5dd15d0b",18746:"f9ac8609",18883:"90cc608f",18892:"6c8911a8",18894:"74b1ce85",18928:"0701e03d",18998:"d6cefe2f",19177:"f4fb3a86",19204:"25b579ad",19212:"c6205a58",19305:"bf07439c",19408:"893bf9b0",19427:"13389c1c",19493:"0990c5c4",19504:"3cbf15b2",19531:"795dc04c",19625:"acfca89a",19671:"e0c673af",19709:"eaec2d23",19733:"3acc99c9",19806:"837a7ae1",19832:"36958835",19876:"a944f10f",19939:"123b41cb",19962:"a3ecf956",20040:"1320b0ce",20061:"fdab8ea6",20169:"f30c5c13",20303:"ac64adc9",20602:"d2e8db2d",20689:"f0ff8154",20707:"9011dfb7",20764:"705b6a69",20911:"eb39e4b7",20917:"9c9a3e5c",20983:"f78047ac",21015:"1e986630",21134:"a1896a0f",21143:"16349e64",21190:"580de28d",21207:"8e9578f8",21228:"ad3ad6e5",21379:"80aa9c55",21643:"eb35f457",21688:"795caae7",21823:"35e78f99",21983:"a4e4572b",22030:"c38a2655",22087:"817ffdcc",22129:"267916df",221
63:"f5657f46",22238:"bdfbafdb",22456:"62957769",22523:"15391619",22540:"57cb9539",22583:"b8adcfe8",22604:"4c410f1b",22777:"991b45b1",22898:"c8aecb21",22940:"873908b0",22997:"f3a4a591",23064:"03b7ec0b",23228:"c0599384",23231:"295984d8",23252:"94c1e97b",23310:"c407c53a",23320:"3c9b69f0",23343:"08e5a4d6",23435:"59082b53",23522:"dcfb4085",23536:"2a58bbac",23545:"4623b3a1",23663:"33ee14f2",23714:"43716ebe",23804:"66d68fe3",23898:"ec519008",24058:"07462b4e",24066:"9d4d9ce3",24101:"d3e3013e",24109:"795d5349",24158:"a91f4481",24266:"0b540723",24282:"d3ef7720",24401:"1ae158f2",24467:"dc4c3279",24501:"178a453f",24946:"8d83115f",24986:"ba127d73",25079:"1dab7340",25251:"3cee59a7",25283:"06e3d89c",25427:"854a38e7",25433:"c48bf181",25451:"a23a897f",25513:"3f77f081",25547:"284c9b9e",25579:"9e7055ec",25833:"71a40566",25898:"94b4215a",26067:"7b3970ce",26084:"861dcdd5",26086:"12738d95",26201:"75d8825c",26291:"bd84899d",26311:"ce5c5ebb",26521:"89b58f07",26654:"4d65993e",26686:"9da74581",26695:"49306f14",26858:"1ce10981",27109:"fbcb735e",27167:"eb8d133f",27270:"02e3dd33",27276:"2fd74dbb",27303:"46258253",27324:"04a7ca69",27554:"c429cc73",27704:"849e54d9",27918:"ca462563",27982:"0533c8f0",28085:"d5cffe43",28134:"3e2ffbbe",28139:"80df3532",28219:"51e8e461",28261:"f279f4e5",28367:"dc6ae3d7",28475:"2000a841",28476:"892e8462",28514:"2d31535a",28516:"d3f4d479",28623:"760e1770",28699:"c9753e68",28800:"7ebb42b4",28880:"777c9b40",28882:"79f11e9e",28906:"63cdbd64",28922:"fa03019f",29014:"63a3f3cc",29025:"caedbded",29050:"15e17037",29066:"92b199e5",29131:"61e3e7a5",29191:"6765a974",29272:"a7da1cef",29514:"902f2c64",29520:"59707016",29698:"4ac96687",29717:"c3facf77",29782:"8ca83049",29818:"be78b6d0",29831:"c421c31a",29864:"7e0679a3",29871:"8a4a1409",29886:"da3cf2c4",29899:"8cb1ad4a",29978:"f29be154",29980:"15805725",30062:"2dbf55d1",30216:"c844cada",30295:"54944412",30419:"be694780",30433:"db6c199c",30454:"5264f7a4",30470:"0ae2450e",30589:"3a397208",30677:"b72f0627",30678:"2a52c41d",30800:"9375989b",30820:"b28f98b9",30834:"ed2abcff",30837:"8f11c961",30865:"8b9d510d",30885:"e871b509",30979:"589a7d3a",31009:"41ecc7f9",31013:"7a5f9581",31023:"970d7bca",31044:"732db84c",31050:"400af1a1",31068:"1f0b2373",31089:"ec193a0e",31116:"bc1bd6c9",31152:"a086d363",31187:"52f3a337",31293:"17ff3286",31294:"3c2a361c",31441:"c659a961",31471:"d9c87e09",31512:"69ffbcf7",31516:"6f3edbc7",31570:"a9bef6dd",31671:"4d0bd185",31824:"22f60a1f",32224:"19c9db8f",32319:"63655ae9",32410:"f2893ec5",32446:"8ce02657",32491:"bdd9498f",32567:"277cb195",32652:"72910332",32689:"08107660",32839:"5c79dc2a",32872:"855c14fa",32892:"250cb5b2",32914:"70e38801",33023:"2964b050",33076:"ef5beb95",33083:"16e34a1a",33131:"a67a7052",33138:"cbc82116",33178:"555e80f9",33181:"2310c781",33223:"e5f8838d",33260:"5ee9d0c7",33261:"a2e46c4f",33329:"94ab58ef",33407:"3ff66407",33725:"6470c099",33889:"1f400f9d",33920:"32836d68",34020:"309d55c2",34077:"652a00df",34079:"25c81e5a",34153:"f309a901",34206:"8ef010f8",34293:"a7e2d1af",34294:"6e8ac714",34323:"ef923fe8",34407:"f08cd09f",34458:"bedb0cad",34460:"51defee4",34475:"994398a4",34552:"278830b5",34590:"e79f245c",34647:"cb920ca6",34656:"53a0d9e7",34748:"c74caba2",34766:"9716c156",34784:"121bb89d",34792:"13e2220d",34800:"31234350",34882:"53c961aa",34943:"5a2f2d6e",34979:"be9c4116",35038:"29817a1c",35069:"05c8b29d",35214:"3be2021d",35216:"783d15db",35387:"d6c4b7cd",35466:"b8c66a97",35577:"eef0d34f",35614:"0550e592",35647:"8df42e8b",35768:"fa150a9f",35809:"29c1c1a6",35874:"64598624",35879:"603ed18f",36009:"ed99bcf4",36312:"f6211aac",36495
:"c4a16cab",36511:"aa66c640",36673:"9639c6ef",36773:"364a602a",36933:"6c6d7692",36935:"232400dd",36983:"87a7744c",37021:"8953fbe7",37055:"8a714c7c",37058:"583e5f2f",37208:"4babdc40",37257:"7b25eb85",37316:"136d87ad",37362:"af7565f6",37426:"a3fce28a",37894:"1d31c5b3",37977:"a605632a",38056:"c9cc2c03",38104:"70b4c07e",38230:"14de8f42",38333:"6d30319e",38368:"99e33615",38450:"971f211e",38469:"c905d16c",38504:"b575cc51",38591:"8b436f7f",38679:"2924f701",38768:"e49628aa",38792:"873b0b4e",38819:"d5786d3c",38873:"aa2dff10",39033:"72f28f6d",39177:"e72fee4a",39209:"ab100076",39252:"a8e9c58b",39275:"68258924",39325:"9e574bbc",39368:"9b3b00b6",39605:"87f4261f",39645:"363e983b",39726:"e601e6d1",39820:"efc15edf",39853:"2eceed3b",39941:"09b3269e",39972:"1af66b9e",39978:"3914825c",40097:"065399f9",40158:"c5447ab8",40176:"a0efce43",40342:"6b02e5f3",40365:"943c9bb3",40665:"43c72f99",40830:"28cfd22e",40930:"2bddabd7",40936:"6f445b64",40986:"f6358136",41100:"59aeb5f1",41329:"327337f0",41388:"15946aa8",41537:"507f5136",41750:"dfbc322c",41840:"319bb3a8",41863:"397dd98b",41954:"7942c49d",41958:"83b83b97",41998:"a08698a0",42051:"e3431432",42054:"9a68e985",42059:"dee04e82",42169:"d7053385",42187:"2cada35c",42226:"85755b59",42263:"1b5d9df4",42288:"5b62f81a",42289:"0c3f570f",42371:"d7fc9caa",42436:"1827f217",42465:"cf7195c1",42551:"e6eb7da2",42609:"0fc5596c",42620:"f4c7af3a",42690:"c8225ded",42721:"697fbb16",42728:"2d6aacf6",42757:"a4a33845",42930:"4a3d4ba3",43037:"f8316728",43047:"4998e689",43072:"2e96fcd7",43294:"0b488eb3",43529:"f1d99b35",43554:"9b15f63b",43635:"382a5fae",43645:"e2041df8",43697:"5f5f48af",43793:"b242fd59",43849:"3d340240",43919:"60913365",43966:"f807cefc",44023:"d8d2c9f3",44029:"a29133e6",44118:"1bc0c1f6",44152:"93b2a9bb",44174:"5bbd8c7c",44393:"619828bb",44523:"c96ddcbc",44765:"404d2c80",44797:"bc343266",44860:"83bf1478",44907:"b7e881e3",45057:"026ec45c",45091:"f47baa55",45114:"c5711e84",45279:"b7f604ea",45287:"7ea5080a",45571:"b3f438b3",45583:"c9734003",45593:"fa50b001",45732:"646f2c6f",45786:"02f6764b",45809:"a28b7ade",45878:"2405e6ca",46023:"02d71bae",46045:"7c8d179b",46074:"85dc8255",46218:"304359f2",46284:"35783c6a",46328:"e6ad5407",46447:"4229ab59",46838:"615dfccd",46901:"73c5e427",46945:"aca29914",47062:"ecb4b558",47068:"e5db7558",47082:"87413ddb",47117:"925c7a4a",47276:"414f3077",47287:"17c798ef",47463:"ffbf05d7",47568:"2216e773",47582:"98df77d3",47655:"741c773e",47708:"28dd4c52",47838:"1eda5c96",47975:"1d9b9deb",47986:"768f43ef",48031:"62017355",48150:"e745121e",48218:"32acfb7b",48320:"0781165c",48426:"e645d804",48840:"9394dd4f",49096:"bf964682",49169:"7d6d8f24",49241:"a8e10b10",49270:"cdfcac6d",49874:"1c08aa98",50017:"8cb0b147",50052:"32a7f158",50145:"cf12aec2",50153:"9c0d9698",50193:"cd9074e9",50240:"e5ce9a3f",50337:"e1095412",50362:"1653c490",50375:"e03acd0c",50437:"afd1bfa0",50472:"8d82b204",50525:"8ffca8d0",50773:"93035d26",50849:"afc6073d",50999:"7814621e",51555:"8c0ea918",51574:"e553142d",51593:"ccb11905",51605:"0604c821",51625:"9af5a24d",51706:"5c9852e1",51768:"4856aecb",51830:"3c636cdb",51840:"6e2f046c",51945:"084ffea8",52094:"b014c67f",52126:"5ffae957",52251:"71f8dc72",52286:"dfe5dadf",52491:"3832c13f",52499:"3e054d23",52573:"006068eb",52586:"fc99d991",52593:"93febf0e",52715:"246739de",52789:"4e836d21",52870:"58b181cd",53237:"c3a4514f",53243:"5c3c3aa5",53371:"b01e4c10",53442:"ddb338e3",53675:"bd70cb8a",53823:"6530507c",54125:"2f1c7fe0",54133:"45c8fdf2",54178:"89e6c31a",54210:"0ff7d73b",54250:"f3a66d07",54265:"4eb7ccff",54363:"f05db08e",54382:"226534bb",54397:"95ca43ad",54487:"
96d9e1eb",54513:"952be9ba",54591:"4c7ab366",54756:"354bdb25",54778:"ab81ffc8",54786:"ad58d36f",54794:"daac75f3",54855:"4f0b894e",55043:"923df72d",55216:"dff3209e",55239:"52636bf0",55273:"27e02213",55335:"40de3b68",55478:"003e195a",55552:"c5b6f07a",55693:"a959df65",55726:"8042dc4a",55745:"2cebe784",55821:"6a3913bc",55925:"5005937c",55962:"40e7c1dc",56290:"d30a97c4",56424:"1a06e672",56513:"5f155b72",56614:"f2830c22",56750:"186f7120",56795:"017a94ef",56902:"f91db1cd",57121:"e7f833f5",57126:"704067cd",57242:"c1396e1a",57258:"3aaaa2c9",57293:"3721a264",57341:"2aa6199b",57489:"3d3e42fc",57598:"7a15184a",57599:"a8e91558",57699:"77e23c32",57749:"aa83a7f5",57780:"44a93095",57820:"d9fe0245",58009:"c5f9c347",58042:"9798f0fe",58096:"637b08bd",58182:"b13442de",58197:"1d43d81a",58234:"d3624c41",58247:"47cd75a6",58356:"1aea63a9",58564:"fc408722",58768:"41323c68",58818:"9671e908",58822:"8f58955f",58824:"3ea8fad3",58914:"796c938e",59051:"ce30c81d",59060:"f4bb9845",59181:"d2bbdf03",59191:"4f95915c",59241:"126c9fc5",59248:"8c5d9aae",59336:"1face31a",59342:"2c3b6d28",59427:"9a0b349f",59442:"d5c2c74b",59496:"0c397f5a",59506:"4a81177d",59533:"915071cd",59592:"b7b4e63c",59771:"e2a99bf6",59900:"1e9304e5",59982:"c299fce0",59992:"70d0be55",6e4:"cc9f4527",60185:"babf0d59",60331:"924ba5f7",60434:"52042dd4",60518:"c0f1e5e8",60603:"5316c07a",60682:"7854ea33",60831:"08063f83",60868:"267fe23b",61007:"dcc4e5af",61200:"6c5f5e8b",61210:"d73ed0a2",61249:"09ef4526",61271:"fdc83c0b",61361:"636b72a8",61426:"be2e971d",61543:"8f27603a",61643:"f9dbfec5",61793:"01718e8f",62117:"a30c0b4b",62235:"58285753",62307:"5a3ba620",62318:"9a886142",62319:"b75eb48b",62474:"dc194af4",62493:"bbbbd1d9",62523:"6baf1734",62547:"fc411513",62655:"f09d9004",62867:"46a98c28",62948:"dbf70f88",62983:"61d5d965",63105:"096987cc",63387:"02f1203e",63604:"66965e35",63663:"c51c3a56",63777:"0219cda6",63801:"70a50fe6",63808:"379d8af0",64e3:"c520b0e8",64032:"b5c596b3",64072:"f3f20053",64100:"7d16a1fd",64122:"3f535a37",64485:"772190e7",64525:"fde13b5a",64533:"dbde94ef",64596:"7e071618",64656:"5c661dee",64754:"1962a975",64766:"9feb1ebe",64824:"68d4a423",64871:"0c878ea9",64900:"9c15d13e",64967:"8c69c566",65132:"c728a280",65501:"2405319f",65592:"a5b39398",65594:"30989c6f",65641:"3548b448",65758:"5f737d83",65965:"fca751f4",66026:"e4206534",66081:"0b35a4d3",66187:"6bc78a22",66238:"c7bfdb48",66256:"441bc284",66303:"7097325b",66336:"7ee25827",66462:"933f4a86",66465:"711a6a15",66597:"abbb3946",66958:"d3834829",67010:"186c6180",67061:"94f1459a",67069:"a14d8963",67132:"34a01d6f",67343:"0929d0ad",67448:"c3192226",67583:"7dea7c29",67597:"c925248d",67638:"fb736299",67670:"cb8c9278",67908:"7befb180",67954:"4b4df4be",68034:"673d4259",68126:"ab783f9c",68258:"fa76b63f",68689:"3a807ecc",68757:"4b7d9782",68793:"af3c15ab",68823:"5c7b8d2a",68943:"f7a596db",69111:"6f76f4e0",69125:"6052e105",69209:"9e539e38",69234:"6337130f",69254:"270b7bf2",69628:"7988be09",69629:"6d3ab36e",69647:"c5c936f9",69843:"a06150bd",69899:"576d10d9",70081:"47ddcfbd",70130:"be0e7aec",70178:"81e6c33b",70249:"b993646f",70277:"0316f6cf",70367:"1d4d5424",70504:"28a803b5",70543:"fe6b6e6e",70614:"4f9da953",70706:"73960b42",70964:"bc48267c",71081:"31a6d953",71160:"4dddf998",71169:"a0975002",71287:"885b618b",71476:"0485f0c1",71516:"b09b1ddd",71544:"3855ce91",71698:"bc2f9371",71789:"59ccffdc",71811:"264139bc",72070:"7ed11917",72189:"5bbcc325",72331:"6d29448b",72500:"f64f66be",72613:"c769eb3e",72638:"fdb543fa",72740:"556dfa23",72887:"4cdbf544",72952:"e3a8eab8",72957:"960770e3",72978:"aa7363f2",73238:"d2a2d1e9",73300:"e20e9
d0a",73338:"04155867",73356:"36d8b7e0",73369:"f8653575",73452:"ed4f95c5",73537:"3e09c354",73618:"f611989b",73725:"5a69a073",73745:"593cd9c9",73766:"8ddc1af8",73975:"a1d7f56e",74019:"eab93d36",74061:"47eacce0",74091:"b78b34ed",74132:"a5970e5e",74136:"f2226b77",74139:"0bcd285c",74332:"0f66d626",74362:"ffd5aac5",74441:"0ef242c1",74465:"3b3b7dbd",74480:"25f1369f",74578:"178c32cd",74794:"8f102688",74839:"577811de",75070:"22d722a3",75118:"bc928be3",75123:"8b8b6076",75203:"8b7d980c",75273:"b16cff36",75546:"49f94163",75567:"355318aa",75671:"5943eb1e",75702:"dfcf6ff3",75897:"b7955be2",75911:"02fe2b4d",76029:"a35fff25",76180:"059fca9c",76222:"22652783",76240:"6c6980d9",76355:"b62d2caf",76360:"8393a254",76369:"9e396c17",76527:"50bbd3b6",76793:"a673c81a",76878:"575a3510",76895:"57085844",76924:"bc5962fb",77053:"16eba6b8",77170:"e863ae3b",77413:"7e5de04f",77427:"add12e61",77527:"78b8c9e0",77655:"0003883b",77680:"595d9465",77887:"088729d7",77923:"fb735e37",78035:"4ead3dce",78063:"5324c9d3",78072:"d4105f8b",78114:"34cd7f60",78233:"2e83e610",78248:"9c0f4314",78452:"8e3ff138",78863:"c85a658c",78924:"f65e527e",78991:"52cf8db2",79073:"9b6a9356",79164:"2bf3dad1",79290:"0c519db4",79298:"4a5cabcd",79356:"37e00d95",79478:"36233afd",79479:"24435308",79543:"a1c76445",79691:"abf6490b",79829:"d0e80522",79895:"85d250b0",79958:"aeb03179",79963:"420ea460",80053:"fe5e4da9",80132:"2e07a3f4",80192:"44c46920",80268:"729162ff",80380:"8a522d91",80709:"fd31ec19",80895:"2b1b02ce",80940:"34e77834",81135:"a0f32953",81477:"81ce2eb6",81667:"b8b8cdb0",81708:"4d7d7762",81835:"efcb7b56",81934:"33bbfee1",82132:"14c8bdbb",82250:"ca3680fe",82342:"74cd685f",82348:"712fccce",82360:"143d911e",82423:"7b71feb1",82513:"4f3b2b51",82614:"2a95d08e",82621:"dfcb91e5",82864:"32d19c2f",82982:"a2616da8",82989:"6a4597e6",83069:"47b3ad61",83074:"c97db8cb",83147:"888c5d69",83692:"db7a78e4",83893:"3d2b86a2",83957:"1b9c82e0",84063:"1a3a757d",84097:"6ef8d9da",84242:"c9097d63",84362:"c6417e92",84477:"d8bfc341",84500:"431c6477",84513:"b6949bb2",84710:"de128b25",84745:"b52aeec7",84754:"dee9fa1c",84841:"93e39080",84847:"215dbaa3",84854:"84dae178",84888:"9d831185",84941:"b4c8a6cc",85011:"2f831042",85054:"adf0601d",85098:"279d70d3",85217:"f61e93e0",85419:"46bf7559",85455:"83eea1ee",85493:"119281a5",85780:"b9f5f272",86009:"70baefe3",86018:"31bdd1d7",86129:"a2e712a4",86150:"cc7f840f",86333:"ea59a642",86356:"dde2e98e",86476:"197674e4",86518:"9284ae71",86826:"df69b0fb",86950:"e65ab699",87064:"3cca1274",87223:"deedbe22",87224:"f3e47b26",87240:"58b186a9",87304:"886ecc13",87313:"c8286e0c",87316:"b4e193d1",87443:"9aa85498",87460:"e647bbd4",87482:"df943648",87513:"b2501ac9",87634:"413486e3",87667:"1173ba50",87799:"ce5bac8d",87836:"b8d9a0fe",87866:"7f1c6977",88179:"0286c49a",88187:"138206fb",88252:"6ed6f96c",88295:"58531032",88338:"44d418f3",88380:"8c2767c8",88446:"bedc1525",88598:"44737f48",88621:"66870438",88625:"fd93b3b7",88821:"a3cc23cf",88831:"4d0404d0",88879:"bc39a8d7",89002:"a4594737",89210:"bbe355e5",89574:"a872329f",89780:"cb004132",89806:"8290698b",89852:"0bf03005",89986:"25e5550e",89987:"7ef3f99e",90046:"18bf9d1a",90185:"9e4a08ea",90333:"50f0aaba",90342:"f21131d2",90392:"bfd01257",90398:"52310a46",90431:"af8fe2a5",90451:"9f655912",90464:"b5c9216c",90536:"9d746e67",90560:"490d2235",90601:"e04386b3",90610:"bf6a18e1",90615:"5eb23ee3",90645:"ec412e06",90666:"a4c267df",90865:"988a7db1",90896:"464b25cb",90976:"3c307ae3",91178:"ae626034",91213:"1668605a",91231:"20e516c9",91274:"89828228",91287:"2cac701a",91304:"51bac317",91406:"7f27e2ec",91425:"6224592c",91523:"da34a7f
e",91628:"5022e7cf",91753:"c5461936",91782:"d494ecff",91849:"c19b88cf",92085:"da4ef155",92244:"b2497d07",92269:"54c0a36f",92393:"b2bf88aa",92404:"eb13da2d",92456:"90063d24",92463:"3c0a582c",92744:"d29698c0",92775:"6cc03df9",92778:"3df5cdc2",92786:"5e68e413",92843:"ae295bd7",92851:"d6f651a2",92964:"b6045191",93023:"67e73b3f",93071:"060ab57a",93151:"ee800142",93176:"a74fd090",93195:"e70b51ce",93308:"1467e450",93340:"11edc4dc",93346:"1b4782c6",93377:"d70f55df",93400:"5c88336c",93590:"c0ae13c5",93749:"b0c0c0b9",93832:"ee4682e2",93837:"e0b7e94d",94114:"c4a618c6",94123:"1a6f14e8",94136:"acfc65cb",94197:"84ad4950",94223:"0ea8a7ad",94328:"3c453dd8",94337:"b1e4d02c",94401:"310ef608",94452:"890d4cf7",94605:"3a9648f7",94694:"efee88a2",94696:"be43fead",94932:"f4de81f7",95020:"b3c692d6",95107:"84d38a95",95171:"b59fa835",95281:"452f0858",95296:"98493597",95317:"8d86f465",95327:"e8f86b92",95329:"b279cc8c",95364:"31dcafc0",95418:"5d25501d",95441:"589b9d96",95561:"fd1ede1f",95696:"cadd461a",95745:"f8e3a2bf",95801:"0a783e08",95911:"b9556956",95945:"f79f370a",96055:"b6642cba",96078:"531d5024",96082:"b914d28e",96135:"a45b92e2",96188:"6c9f214b",96361:"900f03a7",96426:"d4071849",96535:"d7e854c6",96544:"f92e8b94",96547:"4ffe2ef3",96617:"95351d4b",96684:"5704d749",96772:"c1e0dc45",96831:"c60ddcf0",96945:"98d2ab5d",96971:"ba9e1277",96979:"9303b4cf",97065:"947df423",97129:"00e43607",97334:"f8f4fc7e",97469:"5487219f",97523:"a3b57cd6",97547:"9e232073",97553:"2ab99719",97617:"a15faf99",97782:"c09f3975",97816:"3b68fd86",97826:"d0281c69",97850:"77a305aa",97920:"c063a301",97955:"ef56c17f",98177:"9bb8538b",98200:"d224c5fb",98218:"290f20df",98272:"69ca16b2",98623:"4774311b",98740:"5e28c252",98791:"f37abbd5",98868:"7936d28a",98939:"dbffc577",99120:"6a0751c9",99184:"9282b3d6",99266:"cb2b1bcd",99367:"c363645f",99389:"b6458274",99427:"dbba9055",99494:"f16b2e8c",99607:"c5ea3f3c",99669:"ce4a8b9f",99839:"19eb08d7",99871:"2f5edb35",99997:"590480c1"}[e]+".js",r.miniCssF=e=>{},r.g=function(){if("object"==typeof globalThis)return globalThis;try{return this||new Function("return this")()}catch(e){if("object"==typeof window)return window}}(),r.o=(e,b)=>Object.prototype.hasOwnProperty.call(e,b),c={},a="@cumulus/website:",r.l=(e,b,f,d)=>{if(c[e])c[e].push(b);else{var t,o;if(void 0!==f)for(var n=document.getElementsByTagName("script"),i=0;i{t.onerror=t.onload=null,clearTimeout(s);var a=c[e];if(delete c[e],t.parentNode&&t.parentNode.removeChild(t),a&&a.forEach((e=>e(f))),b)return b(f)},s=setTimeout(l.bind(null,void 0,{type:"timeout",target:t}),12e4);t.onerror=l.bind(null,t.onerror),t.onload=l.bind(null,t.onload),o&&document.head.appendChild(t)}},r.r=e=>{"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},r.p="/cumulus/",r.gca=function(e){return 
e={17896441:"27918",21996883:"26086",24647619:"42288",26134010:"15994",38341509:"74139",39579801:"6151",46551803:"16685",54230287:"53243",62041344:"65758",62127933:"16038",65360910:"31013",71078103:"74362",78718572:"41750",79617745:"19204",84960677:"60831",91647079:"36773",92307374:"27554",95169675:"21190",99496549:"31152","906e49ec":"19",f5e3827c:"21",ab971afc:"71","49c587c2":"99","21730a31":"172",a93c3367:"224","23d30d6b":"250","5da0ca7c":"291",a5bcb3f1:"467","9216ce7b":"513",b564874a:"596",c63e6bd5:"803","54d8bddc":"899","0109100f":"1116","66ffc608":"1365",be2f7876:"1387",ef01e1dd:"1523","7981506b":"1647","902d2d1d":"1652","9ecb4d01":"1664",eee57cd1:"1671",a971b35f:"1940","0149cacd":"2044","40d51a61":"2097",e1e17943:"2196",ac4bed99:"2312",fa423b6e:"2427","935116ff":"2638","80631bfd":"2656","60b67194":"2905","7bb83d6b":"2916","9ef1e345":"2989","3bedcc76":"3044","7174660f":"3102",d4d22ad8:"3145",d0a0235c:"3191",ec28562d:"3197","92b043a3":"3216","0b092b5c":"3283","5e94ba2e":"3326",b7343c9b:"3397","1e070b7c":"3398","020a22ba":"3650","6c1d24e1":"3667","7b7fec6b":"3914",c81517c7:"3919",feba251b:"4125","5017cef7":"4151","3c93ed7e":"4195","21cfb395":"4244","38680a69":"4328",f28093e3:"4504",ffa15017:"4513",f2dc10f7:"4585","9654b394":"4631",a3db1255:"4874","4482beb5":"4882","75600d79":"4929",e54b1e77:"5061",d87811ce:"5129",aa4fa4fb:"5313","622596e9":"5352",f2a3bf8e:"5383","631dea17":"5512",d16a2606:"5714","66e9ea68":"5909",d613e1f8:"5920","85e709dc":"5981","391378fa":"6027",a8ef1ed2:"6386","5b4a63ac":"6443","31c3e3d7":"6517","0c99e969":"6537",efc338fe:"6553","111e23e1":"6695","85954f48":"6734",d1284c82:"6799",f49551b9:"6822",cc519fb4:"6824","9e6b2559":"6968",f38fa80d:"6971","7d9c461e":"6978",ff96de6e:"7078",bd1a8573:"7091","30a13577":"7092","365726b0":"7108","7e91f3e1":"7120",bb4987bb:"7155","97ce6959":"7162",b7e69c77:"7318",d8c5fc94:"7451","81f033b8":"7485",bd0e022f:"7500","32d13eb8":"7874","43de05a8":"8023","7d280bdc":"8135","93015a15":"8145","39bddd84":"8188",c565b8da:"8210",d163ea32:"8313",fa8af309:"8328",b4473d93:"8407",f983631a:"8482","2b8a5969":"8638","1b93ff3d":"8671",aa01ca6a:"8809","6fdd5bc4":"8882","6d2c1101":"8906",de8a7b18:"9028",debbc0e2:"9119",e5523a26:"9225","407bcc70":"9235","541bc80d":"9365","0ffc31bc":"9444",cf4d312e:"9542","36edbaa2":"9550","9db3bdac":"9615","14eb3368":"9817","13af1bdb":"9836",bfd6b54b:"9907","7097fbbc":"9947","70cd875c":"10109",cdebfca4:"10228",a7da438d:"10270","05fa5837":"10436","6773ef05":"10497",e8b1baf4:"10650",caf7e36c:"10918","26e1978a":"10987",ba17e21b:"11174",c98c0daa:"11203","2913cae6":"11311",a15a0d8e:"11321","65031edd":"11326","4d2bb41f":"11342",ba5e62dd:"11398","885bf670":"11656","3d446fd0":"11875",b63d08bc:"12228","1fb2401b":"12442","31eb4af1":"12549","885da4ef":"12555","58ac1d26":"12558",a6d8b730:"12560","5aabd190":"12567",b48b6b77:"13253",f65f22ef:"13280","6d905346":"13351","70808dd0":"13460","55920b47":"13588","3be6e3bd":"13595",c642f758:"13617","1ac29206":"13718","8bd7a1a3":"13896",bd0b26a5:"13924","911bbfa4":"13979","0b0df062":"13995","3105dae0":"14061","03902e07":"14088","31585cea":"14095",f71ac404:"14143",af9acd56:"14299",c099652b:"14369","83cbebfb":"14386",d3fe7aed:"14396","79db63f1":"14549","40a26966":"14610","9dd89af2":"14670","763f2b13":"14713","39ed38cd":"14840",f6310963:"14908","4338ab08":"15196",f9c66408:"15497",a97b7821:"15658",a466ebec:"15888","271906a0":"15970","21edad34":"16022","891a9b8f":"16058","6ecc8728":"16071","7d0b3c01":"16153",ff3504dd:"16161","66fe7120":"16379",fdbb9241:"16528",a9347149:"16635",e1bbb98e:"16672",df878b79:
"16876","251d94d8":"16973",fe34d639:"17275","45e19d44":"17283",f77885f5:"17457","528fc62e":"17511",c003d460:"17726","00c88225":"17757","3e697946":"17785",b530e783:"17883","996d98f3":"17887","9a71d807":"17989","7f9f61f2":"18025",abc9098e:"18050","29dde6c8":"18084",d19aead5:"18100","2730c631":"18143","5bebce7d":"18156","074a0372":"18186","4e07c49f":"18318","18ccacf6":"18559","07645771":"18734","8282a203":"18746","31793acc":"18883","4d58aa3f":"18892","6e11cc87":"18928",bc4716d5:"18998","584b298a":"19177","86c7426e":"19212","8a064b88":"19305",b7ec56b9:"19408","126e88af":"19427","6f49328c":"19493","83fe529b":"19504","974829b4":"19531","84eafbb4":"19625","3729e987":"19671",c5593510:"19709","2edcde3e":"19733",dc7ad1ac:"19806","17f9c41b":"19832","760eed0f":"19876","8781c463":"19939","6bf1075e":"19962",b73dbab9:"20040",ef0f9e32:"20061",c9664647:"20169","9b7bae35":"20303","1d014bb1":"20602",cbbe4dac:"20689","120dd2fd":"20707","6d0dfc8d":"20764",be3ddcfb:"20911",e01a2739:"20917","4d0df69e":"20983","653c19c7":"21015",ae5e6a48:"21134","6a89e0dd":"21143","0e728709":"21207","0feddf78":"21228",e22055a4:"21379",f18d5795:"21643",aa282e34:"21688",c0ef9e49:"21823","7f2bec55":"21983","43a49a39":"22030",af199e5b:"22087","06ceb223":"22129","9bc49845":"22163","1ca03b4b":"22238","0b0f030b":"22456",ef2624d8:"22523","0e8c522c":"22540","66b5d69c":"22583",a8f480dd:"22604","4995f874":"22777","6775be7c":"22898",fe423ebe:"22940","9e91305d":"22997","500c9b63":"23064",ede8882f:"23228","909cadf6":"23231","8113fd14":"23252",c1f9ba1e:"23310","89c49d10":"23320","57b7b037":"23343","23896e06":"23435","2c9f485d":"23522",edbf4496:"23536","2457e7c2":"23545",c7599d12:"23663","332c497c":"23714","99961c3d":"23804","9a02f8a7":"23898",f1d5089f:"24058","92ce2bd2":"24066","15d86f95":"24101","0c48ef63":"24109",b7738a69:"24158",a8565f1f:"24266","395508da":"24282",fbfa5dfc:"24401","7fc9e2ed":"24467",e6de5f28:"24501",fd378320:"24946","7d30361b":"24986","10fd89ee":"25079","6c0ce6d0":"25251","8b2f7dd6":"25283","2b1e7b76":"25427",b9b67b35:"25433","59740e69":"25451",f265d6a5:"25513","06673fe1":"25547","75071a94":"25579","9427c683":"25833",bd61737f:"25898",cc7818bb:"26067","85860bdc":"26084",a958884d:"26201","3bed40a0":"26291",f35b8c8b:"26311",ec205789:"26521","08985b86":"26654","5667bf50":"26686","3be4d1c2":"26695",ceb6bd62:"26858",bb1d1845:"27109","6d03c6cb":"27167",e022cd8b:"27270",ec2b56b1:"27276",ec11103c:"27303","552bb95e":"27324","0f6a2fca":"27704","865c04d0":"27982","56405cb8":"28085","916fb87b":"28134","95771e39":"28139","50e78136":"28219","9ce40ebc":"28261","7f6814ed":"28367",fc338eb2:"28475","2f4d1edb":"28476",db082e36:"28514","8af04d56":"28516","018243f8":"28623","4a0c84c3":"28699",f6ca5dc0:"28800","2f74be58":"28880","8d83f575":"28882","48c7b3a1":"28906","3417a016":"28922",da9049b8:"29014","8f32218b":"29025","44573fa4":"29050",e5977951:"29066",a670ed1c:"29131","6fe0ccd0":"29191",f5da8015:"29272","1be78505":"29514","4ef1f024":"29520","949a554a":"29698",cff5e41a:"29717","16a52e74":"29782","9e530f0a":"29818","3291c538":"29831",b604d5b2:"29864","729f5dd4":"29871","2c86cbaa":"29886","14e9211b":"29899","3d8cf439":"29978",c32e37fe:"29980","26db341a":"30062","04829abe":"30216",ff9e51b7:"30295","26bc6c41":"30419","1dc72111":"30433","019a0579":"30454","73c32a6c":"30470","10b7b761":"30589","4f643cbc":"30677","8dc6ea19":"30678","534db397":"30800","845c1fa7":"30820","8900c226":"30834","6b685afe":"30837","0c9e4d11":"30865","8d5884d6":"30885","683d9354":"30979",c8b95361:"31009",d1036fb2:"31023","9b98b06f":"31044","23c664e3":"31050","928e95c7":"31068",
"5e56d481":"31089",abe8f5f4:"31116","8793e9e6":"31187",cc976a0e:"31293","212ceae2":"31294","347c8874":"31441","2d7d2510":"31471","8a75859c":"31512",ee2f6eec:"31516","6c6d8053":"31570",ce861b37:"31671","9f850ab3":"31824","87719f86":"32224","570c64c0":"32319","09e7c68c":"32410","0eb0d7dd":"32446","6167ec10":"32491","5d8d28d6":"32567",f8c45ac9:"32652",a5461ca4:"32689","46d1dc13":"32839",f1c17b7f:"32872",e9268009:"32892","0ef4df13":"32914","9fcb81d2":"33023",a9776c25:"33076","95f7392c":"33083",ad516382:"33131",dd0c884c:"33138",cab767d9:"33178",fa17a3e5:"33181","5af48372":"33223","3deda206":"33260","82dec33c":"33261","9ebfae5b":"33329","765a551b":"33407","586fa356":"33725","5b659de8":"33889","1943e34c":"33920","9b00304e":"34020","3db5eb91":"34077","273b8e1f":"34079","4a797306":"34153","23a156eb":"34206",ff318c38:"34293",f8338e5f:"34294",e48c3912:"34323",c4a71dd9:"34407",f0f4a691:"34458","5c8ad115":"34460","5bea2473":"34475","592e779d":"34552",de061f48:"34590",c93364c6:"34647","813ebe83":"34656","71408d45":"34748","116bb944":"34766","243071a0":"34784","5c392fa5":"34792","99a27b29":"34800","16046cb7":"34882","2c06af7c":"34943","9c12417e":"34979",a2bcabb3:"35038",b269633b:"35069","3576f003":"35214","5334bf47":"35216","907c8c6a":"35387","1cf42300":"35466","09e24d74":"35577","032d72a0":"35614",a2e876c5:"35647","90bfd346":"35768",c30c381e:"35809",df463adb:"35874","41f4b8cc":"35879",b3a22aab:"36009",ade0010f:"36312",b3fdbb6a:"36495",f3d03ec8:"36511","8d4185e0":"36673",eb87086a:"36933","63849fd3":"36935","7c43c98e":"36983","6d92a4b5":"37021",ac6b62e9:"37055","6e357be7":"37058",c3a94ed1:"37208","4f4166ed":"37257","229edc10":"37316","6dfd1bfa":"37362","3d7b9a1b":"37426","7779798d":"37894",febe4bf0:"37977","5bb043f7":"38056",b34a9ee0:"38104","80b5c97d":"38230","8e018081":"38333","11414e0b":"38368",a77f15f9:"38450","66cd2d70":"38469","0df0bc38":"38504",e80537c2:"38591","5eece5ec":"38679","38cd2ebb":"38768","2f6d8a46":"38792","0cb88ec0":"38819","179d37d3":"38873","7aabbdee":"39033","3ae213b8":"39177",f55bfda4:"39209",d179e89e:"39252",c0ba661c:"39275",a072c73d:"39325",b7e5badb:"39368","22f9ccca":"39605",e6e9a3aa:"39645",f2c01e3a:"39726","8277cea1":"39820","73c3a5ed":"39853",f8904416:"39941",fa8dc2e8:"39972","5b34f9ea":"39978",f49b74d5:"40097","2d7caf96":"40158","0260d23f":"40176","7ad00ade":"40342",dd8797f2:"40365",c2ef5f99:"40665",eaaaa138:"40830","51cdab7b":"40930",f5d7fbaf:"40936","0cbb6061":"40986",ba1c1ac8:"41100","9444e723":"41329","7945275b":"41388",e6241e03:"41537","5b7c576e":"41840","56181a0b":"41863","85db7b61":"41954",e4176d9e:"41958","81192af7":"41998",a8987ce3:"42051","10c43c6e":"42054",bfe6bb1f:"42059","672c9486":"42169","4b66f540":"42187","3ebe5c8a":"42226","909a3395":"42263","48e254a2":"42289",ff4be603:"42371","4447d079":"42436",c14e35a5:"42465",a2ff0b9e:"42551",ff7c02a9:"42609",b6cfa9b7:"42620","4b481283":"42690","8e0282b7":"42721","910f748a":"42728","699b0913":"42757","608d6ba6":"42930",e2e305b4:"43037",ea09532f:"43047",dc98fcfb:"43072",f6d93f4d:"43294","87186dce":"43529","39befbbe":"43554","0c94161c":"43635",d7039a99:"43645","4b718ce0":"43697","0682e49e":"43793","8b15c55c":"43849",bfada16a:"43919",b99800de:"43966","0e0b668d":"44023","4b0528ed":"44029","5d86b3d6":"44118",f193e9f7:"44152","5b1c4ba7":"44174",dfca4314:"44393","3d99ef33":"44523","0a54392a":"44765","7e328509":"44797","55a23a94":"44860","0f014490":"44907","46f76bef":"45057",dacae080:"45091","593ffe68":"45114","01f7e848":"45279","8b1145e2":"45287",a258685b:"45571","3476fe8e":"45583",d02f7bc4:"45593",f2abaee2:"45732","239111c7":"457
86","83d061ac":"45809","9ee45729":"45878","5216f17c":"46023",e0eae934:"46045",fff3ab69:"46074","8d0344ba":"46218",a5b5d55c:"46284","2fc02015":"46328",e7478c24:"46447","46dcda29":"46838","33a34e3b":"46901",cbbdf9a2:"47062",ba73f26c:"47068",cc1f5ce8:"47082","8b6445a0":"47117","524b67e3":"47276",cf1567e8:"47287",e5c3dfde:"47463","3059ed75":"47568","9ee4ebe9":"47582",ee799351:"47655","3ab425d2":"47708","2f0ee63c":"47838","497aa321":"47975","9a7b56f5":"47986","38bd3ddb":"48031","6f6b3e89":"48150",c81622cc:"48218",bf0d24cf:"48320","3ddb8349":"48426",abfd17f9:"48840","70f3cfb0":"49096",d1b82434:"49169",f99bfa77:"49241","8eed67ba":"49270","98a6ff5a":"49874",ab2e7268:"50017","8ac39bbe":"50052","4d028f11":"50145",e51da90c:"50153","486e741e":"50193","7b4c719b":"50240",acb04c32:"50337","7ce5ebd9":"50362",e86c0d05:"50375","9f305eae":"50437","40a0c599":"50472",aba6a826:"50525","7bcf009a":"50773",d7e1d518:"50849","6f93a078":"50999",e91074f3:"51555","12e76d03":"51574","6f219482":"51593",ea82a261:"51605","3b5ffa57":"51625",af6e989f:"51706",e6fe050f:"51768","42a4a45b":"51830",b63fdeeb:"51840","86a7da57":"51945","22a76d89":"52094","08ba51c1":"52126","92bceb62":"52251","79bae4c5":"52286",fcb00301:"52491","3b1e54e9":"52499","2006be57":"52573",a18114c4:"52586","28599d52":"52593",c04dcf0d:"52715","0e46f7bf":"52789",f888d9d8:"52870","1df93b7f":"53237",c55f973e:"53371","6cd64148":"53442","3ca132b1":"53675",f20f879f:"53823",b684abf7:"54125","6afbfa44":"54133","1632abda":"54178","4c8d1cae":"54210",b3c952b5:"54250",d6f7d5e2:"54265",fa5bdf0c:"54363","4bae0029":"54382","3034400c":"54397","612ebb8a":"54487",cca83a59:"54513",e0668c88:"54591","130a23fd":"54756",fd67079f:"54778",ed07f994:"54786",dd8be3b2:"54794","32f0f819":"54855",a463ff81:"55043","5560d84e":"55216","661e4fa4":"55239","08472b2d":"55273","746f419e":"55335","1710d498":"55478","8e23b856":"55552","7ec28fd9":"55693","7f536709":"55726","2e18dbc8":"55745","676a3180":"55821","407fa3a0":"55925",f2d325f1:"55962","640fe435":"56290",ac4fb807:"56424",e8d36425:"56513","71f8452f":"56614",f1525ef1:"56750","151869e3":"56795",d4a6dda9:"56902","918ae6ff":"57121","7f039048":"57126","522a40f8":"57242","1b4282d0":"57258",fb218ddd:"57293","3ad7b662":"57341",f251ab77:"57489",e4b4615d:"57598","6e586ee3":"57599",d06effa9:"57699","34660ac5":"57749","163044ef":"57780",a3c98c45:"57820","84c320c1":"58009","4893a1cb":"58042",e345afee:"58096",a045168c:"58182","09e9a7df":"58197","6145eda0":"58234","551b313a":"58247","649a76e7":"58356",ea41aad0:"58564","4f9404e5":"58768",cc6053aa:"58818",cf14af90:"58822",bf2622dd:"58824","68709c70":"58914","8938295e":"59051",de11ece8:"59060","010f8398":"59181",af049e12:"59191","07a6f1c2":"59241","8962034b":"59248","902aff6f":"59336",d959d974:"59342",b43aa387:"59427","0cd38f48":"59442","081ed9af":"59496",c2ed794e:"59506","897798e8":"59533",d243562e:"59592",e56a1a2c:"59771","619d2e79":"59900",f929d4df:"59982","918c9b38":"59992","78f8003c":"60000","7b2e834b":"60185","34d5cc00":"60331","05a720dd":"60434",bb341369:"60518",b8677fbf:"60603","1ebc7fe2":"60682","16bb304a":"60868",a79d55be:"61007","4c13f84f":"61200",f8bc4080:"61210",f497508e:"61249",e50573ba:"61271",a529f863:"61361",e31a63b7:"61543",a882bd74:"61643","5e52bbeb":"61793","83a26c48":"62117","912fcb5a":"62235","5c77ea5f":"62307",bdd03912:"62318","8e993d66":"62319","686c1ad3":"62474","6312a106":"62493","54d1c079":"62523",bc1c33e4:"62547","8fca97e0":"62655","34d502be":"62867",c38f23a9:"62948","6dffe7c4":"62983",f29affbe:"63105","877a3c1c":"63387","7bc70741":"63604",cb582f54:"63663","92264b81":"63777","01011
8f9":"63801","86c8f7cd":"63808",cacfa11d:"64000",d6990b47:"64032","555f2cec":"64072",e8591f69:"64100","07b92fc6":"64122",e715560c:"64485","6694e7e9":"64525",fd0f74aa:"64533",eee9e2f1:"64596","300bd484":"64656",e74888b9:"64754","5a7e5a43":"64766",dfd588b8:"64824","42d0afac":"64871","610e19f0":"64900",b81f3fb0:"64967","637ec626":"65132","8d3be60d":"65501",ef0f3981:"65592","88dde0bb":"65594","90dccef4":"65641","21d2296a":"65965","06b7cd3c":"66026","9d336f66":"66081",cf494ba6:"66187","1fb9ab5c":"66238","5f0246ae":"66256",a5560bad:"66303","87cc8f7c":"66336","1d642165":"66462","9c53d859":"66465",e0052e0c:"66597","12b52520":"66958","49763a0b":"67010",d7124adb:"67061",c7b80b67:"67069","15f4efbb":"67132","2d8700b9":"67343","85ac525a":"67448","18caf9ef":"67583","1693c0b8":"67597",a48eac25:"67638",a23744f9:"67670",d29db0e3:"67908","8d96489a":"67954","8858d0ce":"68034",db9653b1:"68126",f301134a:"68258","8751004c":"68689","5335ef4f":"68757","9b89ba00":"68793",c3ca7a6a:"68823",ae5838f0:"68943","1a54bfd0":"69111",a5f4c814:"69125","193f200e":"69209","212137e2":"69234","140e6a69":"69254","9bfbb8bc":"69628",d5d7628b:"69629","607292ce":"69647","7facae8f":"69843","6d933e1d":"69899","41e02281":"70081","16b47049":"70130","06b3b671":"70178","7bd49e6c":"70249",d72ada40:"70277","1fc4ed50":"70367",f53e2381:"70504",ce79b72a:"70543","65306ecf":"70614","21d3c1c7":"70706","06876062":"70964","99c371aa":"71081",dff7b4e8:"71160","5ed92a05":"71169","167f5be9":"71287",c26ab7d5:"71476",dcf1d6e7:"71516",d93a0aad:"71544",ff9d88b6:"71698",fcfa677e:"71789","39afc900":"71811","59dfcfb5":"72070","7f31124b":"72189","66179fb5":"72331",cf945ce5:"72500","084a18af":"72613","1adeac4a":"72638","150a4d14":"72740","2e2a73ec":"72887",d703ca6f:"72952",d9b3adf3:"72957","2e6d047c":"72978","1eec97be":"73238","4efa0483":"73300",aa02927d:"73338",b26f6fa9:"73356","6594bd70":"73369","63b8176f":"73452",c7953305:"73537","12dcfbad":"73618","8f9c5733":"73725","154cbeb4":"73745",af29c71b:"73766","566ea6d5":"73975",ff35d8ff:"74019",a0541488:"74061","67e63bc0":"74091",ba454016:"74132",ef4e0f5d:"74136","769f97b7":"74332","4c8fc79c":"74441",e6a17fa0:"74465","4bbc58d4":"74480","80ea5ae7":"74578","016f0e16":"74794","8b87f6f5":"74839","5b34d9eb":"75070","29ff1658":"75118","8d81369e":"75123","61c61e17":"75203",ca1d44bc:"75273",c176dc4f:"75546","43a232e9":"75567",ff078e30:"75671","1ac49947":"75702","64e30bbc":"75897",b92bff04:"75911",d6f3938e:"76029","172c9869":"76180","99ae7254":"76222","02de7b5a":"76240","2363ed29":"76355","4f8fd4be":"76360",c0f7075f:"76369","359e34b0":"76527",d12dbf4d:"76793","198182f0":"76878","8d493a07":"76895",cb870251:"76924","9eb4c1b4":"77053","60e8b504":"77170","4deae4de":"77413","78dc40c2":"77427","56c932ee":"77527","274eaedf":"77655","60043c0d":"77680",be698a2c:"77887","971cbe2f":"77923","1ae50e88":"78035",c80936bd:"78063",ed51eb7d:"78072","516dec85":"78114","41742cda":"78233",df22f3af:"78248","118229e6":"78452","2b5e4b34":"78863","0fcbeed9":"78924","69f3d9b5":"78991",a17fb62b:"79073","95f18dd4":"79164",f83967c4:"79290","10c28d6f":"79298","73a7bd5f":"79356",df5a3016:"79478","27e1a14b":"79479",fbff3b11:"79543",c733e485:"79691","6b2b8280":"79829","4bedd8c5":"79895",b00a2879:"79958","092519d2":"79963","935f2afb":"80053","18dd253f":"80132",fc8aebe3:"80192",eac8f2ef:"80268","2d35b91c":"80380","83cd8f20":"80709",f97cc188:"80895",abf6a1f1:"80940","0c1ee94a":"81135","2dd65ece":"81477",a39041db:"81667","82a4f002":"81708","6a0b4355":"81835","35fa8025":"81934","919b108c":"82132","51da09c7":"82250",dc130668:"82342","3f6554cb":"82348",d885d629:"823
60",fadcaea6:"82423","7bcf7096":"82513",aa395a59:"82614",ba4efbe0:"82621","3ebee193":"82864",bf02c3ce:"82982","39b565ff":"82989","39c8ecdc":"83069","1a42aba3":"83074",a26f7afa:"83147","779753bc":"83692","66716ec1":"83893","10f908b7":"83957","363318d5":"84063","737371dd":"84097","6e366b57":"84242","8da7304d":"84362","3b12bc8a":"84477","6eeb04e2":"84500","6cb122e3":"84513","4cec253a":"84710","42325f5c":"84745","211f58b1":"84754","7e5ee96c":"84841","5c27dd68":"84847","34b19815":"84854",e9d5739e:"84888","21bf64ca":"84941","6e5d074b":"85011","7bc3feb7":"85054","60d04b47":"85098",f8482b2c:"85217","3743f01c":"85419","7e446cc1":"85455","6d480200":"85493","82033eb7":"85780",c9b79676:"86009",ed809cac:"86018",b47406fa:"86129","1db21d86":"86150","08e3aaa9":"86333","4ad39569":"86356","45aa7127":"86476",ce66b6fd:"86518","7dd3be25":"86826","4e1da517":"86950",a6a8af40:"87064","27bd5328":"87223","6fc8d865":"87224","1fdab62e":"87240","9980f90c":"87304","96c0bb00":"87313",a48778d9:"87316","96ec050b":"87443","7668acae":"87460","8faa0fb1":"87482",a291f403:"87513",f807eec9:"87634",c07f2717:"87667","3958a146":"87799",e111f111:"87836",b1998bb1:"87866",e5842021:"88179","1e391540":"88187",c0074ddd:"88252","21ad5224":"88295",e1b9986a:"88338",ac930f6e:"88380","3c725018":"88446","6827856d":"88598","4499569c":"88621","6f59957c":"88625","41db9914":"88821","1c56d006":"88831","32ea4ecb":"88879","4455e85b":"89002",b4028749:"89210","13b69fa8":"89574","46c600d5":"89780","443045da":"89806","1f79049f":"89852",fee1f25c:"89986",d25ffd5f:"89987",d043cc46:"90046","41b3e733":"90185",f2497893:"90333","2c91f584":"90342",f60e43ec:"90392","0b78393d":"90398",dd313590:"90431","0a13c98e":"90451",b2335bc1:"90464","73dfc993":"90536","01fb8b11":"90560","22f40a40":"90601","147b0f6a":"90610","8cd0f4f5":"90615","6601f604":"90645","459a783a":"90666","87e7806e":"90865","4302562a":"90896","6eb0ce42":"90976","9c4bbfc4":"91178",ff0539a2:"91213","28b27838":"91231","76ace0dc":"91274","02ee0502":"91287","872e63de":"91304","4cd7d8af":"91406","1671b3fa":"91425",d692bb25:"91523",c839a5b0:"91628","2f535455":"91753","304ed800":"91782",b2735041:"91849","000c061a":"92085","8c828746":"92244","9c42de85":"92269",db5c8692:"92393","799b872c":"92404",d6360c39:"92456","4d4093bb":"92463","6eebf72d":"92744","7861f6df":"92775","8c31caf6":"92778",ae5bb339:"92786","85c3ba36":"92843","8bfba65b":"92851","14e00221":"92964",b984322c:"93023","61e5c5b8":"93071",e7cbe8da:"93151",f7101d4f:"93176","740eb29c":"93195",b83df1bc:"93308","5d075efb":"93340",f7735fb0:"93346",dd435828:"93377","03e8549c":"93400",dede40b0:"93590","4e6907d6":"93749","917734f8":"93832",cb341380:"93837",c9aea766:"94114","7c8407dd":"94123","91dc98f0":"94136","37aba5d3":"94197","43b891d1":"94223","63f66cb7":"94328","9fdf7324":"94337","6c10648f":"94401","878356ab":"94452","487f7f30":"94605",d3e690ce:"94694","376d31f7":"94696",a233fb97:"94932",b8e39b95:"95020",d666ab7e:"95107","3db8c88b":"95171",bc08bf79:"95281","9936b6c8":"95296",cf282674:"95317","1e173bbe":"95327","5b23c695":"95329","41fbfe2f":"95364","7877b0eb":"95418",e9ef6b31:"95441","0e0f5dd2":"95561","8462ad7a":"95696",edf19300:"95745",e490fd18:"95801","7e254f9d":"95911","90b0cf6d":"95945","8fa500ae":"96055",d6011437:"96078",a322018d:"96082","3061ad92":"96135",f0129862:"96188",ebf2bdda:"96361","64bd79cb":"96426","38e65fdd":"96535","49ea6ca5":"96544","385bc71d":"96547",e23cd647:"96617",a612420b:"96684",b35418cf:"96772","99ba663e":"96831","09e11ac0":"96945","57973c2b":"96971","7f6f8f16":"96979","6816f4c0":"97065",f3034cf4:"97129","9d4bcb9a":"97334",d91e7ab4:"
97469","02fbc840":"97523","902fdb3b":"97547","7ea214d5":"97553",ed97cef0:"97617",b094b997:"97782","7513b789":"97816","16cff1eb":"97826",dd6685df:"97850","1a4e3797":"97920","746bf890":"97955","049dc708":"98177","0e7f2915":"98200","1820eb3b":"98218",b7f629d0:"98272",ced65f67:"98623",d1475ab1:"98740","1a6f209f":"98791","6a913ab1":"98868","3ff950a4":"98939","008b0ccc":"99120","8aecb2ef":"99184",ca443c18:"99266","00125b11":"99367",c2f4aca4:"99389","64758f43":"99427",f2d5637b:"99494","49ea4a42":"99607","32db5af4":"99669","15d4dc80":"99839","5e3def70":"99871",b63b5bb9:"99997"}[e]||e,r.p+r.u(e)},(()=>{var e={51303:0,40532:0};r.f.j=(b,f)=>{var c=r.o(e,b)?e[b]:void 0;if(0!==c)if(c)f.push(c[2]);else if(/^(40532|51303)$/.test(b))e[b]=0;else{var a=new Promise(((f,a)=>c=e[b]=[f,a]));f.push(c[2]=a);var d=r.p+r.u(b),t=new Error;r.l(d,(f=>{if(r.o(e,b)&&(0!==(c=e[b])&&(e[b]=void 0),c)){var a=f&&("load"===f.type?"missing":f.type),d=f&&f.target&&f.target.src;t.message="Loading chunk "+b+" failed.\n("+a+": "+d+")",t.name="ChunkLoadError",t.type=a,t.request=d,c[1](t)}}),"chunk-"+b,b)}},r.O.j=b=>0===e[b];var b=(b,f)=>{var c,a,[d,t,o]=f,n=0;if(d.some((b=>0!==e[b]))){for(c in t)r.o(t,c)&&(r.m[c]=t[c]);if(o)var i=o(r)}for(b&&b(f);n Contributing a Task | Cumulus Documentation - +
Version: v15.0.2

Contributing a Task

We're tracking reusable Cumulus tasks in this list and, if you've got one you'd like to share with others, you can add it!

Right now we're focused on tasks distributed via npm, but we're open to including others. For now, the script that pulls the data for each package only supports npm.

The tasks.md file is generated in the build process

The tasks list in docs/tasks.md is generated from the list of task package names in the tasks folder.

Do not edit the docs/tasks.md file directly.

- + \ No newline at end of file diff --git a/docs/api/index.html b/docs/api/index.html index 3ca8838394e..92d9beaac8a 100644 --- a/docs/api/index.html +++ b/docs/api/index.html @@ -5,13 +5,13 @@ Cumulus API | Cumulus Documentation - + - + \ No newline at end of file diff --git a/docs/architecture/index.html b/docs/architecture/index.html index a801bf438d3..a1d484450d3 100644 --- a/docs/architecture/index.html +++ b/docs/architecture/index.html @@ -5,14 +5,14 @@ Architecture | Cumulus Documentation - +
Version: v15.0.2

Architecture

Below, find a diagram with the components that comprise an instance of Cumulus.

Architecture diagram of a Cumulus deployment

This diagram details all of the major architectural components of a Cumulus deployment.

While the diagram can feel complex, it can be broken down into several major components:

Data Distribution

End Users can access data via Cumulus's distribution submodule, which includes ASF's Thin Egress App. This provides authenticated data egress, temporary S3 links, and related statistics features.

End user exposure of Cumulus's holdings is expected to be provided by an external service.

For NASA use, this is assumed to be CMR in this diagram.

Data ingest

Workflows

The core of the ingest and processing capabilities in Cumulus is built into the deployed AWS Step Functions workflows. Cumulus rules trigger workflows via CloudWatch rules, Kinesis streams, SNS topics, or SQS queues. The workflows then run with a configured Cumulus message, using built-in processes to report the status of granules, PDRs, executions, etc. to the Data Persistence components.

Workflows can optionally report granule metadata to CMR, and workflow steps can report metrics information to a shared SNS topic, which could be subscribed to for near real time granule, execution, and PDR status. This could be used for metrics reporting using an external ELK stack, for example.
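As a hedged sketch of how these report topics might be consumed, the Terraform snippet below subscribes an SQS queue to an execution-status SNS topic so external tooling (an ELK pipeline, for example) can read near-real-time messages. The output name module.cumulus.report_executions_sns_topic_arn and the var.prefix variable are assumptions and may differ in your deployment.

# Sketch only: forward Cumulus execution status reports to an SQS queue
# for external metrics tooling. The module output referenced below is an
# assumption and may be named differently in your Cumulus release.

resource "aws_sqs_queue" "execution_reports" {
  name = "${var.prefix}-execution-status-reports"
}

resource "aws_sns_topic_subscription" "execution_reports" {
  topic_arn = module.cumulus.report_executions_sns_topic_arn
  protocol  = "sqs"
  endpoint  = aws_sqs_queue.execution_reports.arn
}

# Note: the queue also needs an aws_sqs_queue_policy granting the SNS topic
# permission to send messages; omitted here for brevity.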

Data persistence

Cumulus entity state data is stored in a PostgreSQL-compatible database and is exported to an Elasticsearch instance, which provides non-authoritative query support for the API and other applications that require more complex queries. Currently the entity state data is also replicated in DynamoDB; this replication will be removed in a future release.

Data discovery

Discovering data for ingest is handled via workflow step components using Cumulus provider and collection configurations and various triggers. Data can be ingested from AWS S3, FTP, HTTPS and more.

Database

Cumulus utilizes a user-provided PostgreSQL database backend. For improved API search query efficiency, Cumulus replicates data to an Elasticsearch instance. For legacy reasons, Cumulus currently also deploys a DynamoDB datastore, and writes are replicated in parallel with the PostgreSQL database writes. The DynamoDB replicated tables and parallel writes will be removed in future releases.

PostgreSQL Database Schema Diagram

ERD of the Cumulus Database

Maintenance

System maintenance personnel have access to manage ingest and various portions of Cumulus via an AWS API gateway, as well as the operator dashboard.

Deployment Structure

Cumulus is deployed via Terraform and is organized internally into two separate top-level modules, as well as several external modules.

Cumulus

The Cumulus module, which contains multiple internal submodules, deploys all of the Cumulus components that are not part of the Data Persistence portion of this diagram.

Data persistence

The data persistence module provides the Data Persistence portion of the diagram.
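To make this layout concrete, a simplified Terraform sketch of the two top-level modules might look like the following. The source paths, the <version> placeholder, and the handful of inputs shown are illustrative assumptions; real deployments pass many more variables (see the deployment docs referenced below).

# Simplified sketch of the two top-level modules; the inputs shown are a
# small, illustrative subset of what a real deployment requires.

module "data_persistence" {
  source = "https://github.com/nasa/cumulus/releases/download/<version>/terraform-aws-cumulus.zip//tf-modules/data-persistence"

  prefix     = var.prefix
  subnet_ids = var.subnet_ids
}

module "cumulus" {
  source = "https://github.com/nasa/cumulus/releases/download/<version>/terraform-aws-cumulus.zip//tf-modules/cumulus"

  prefix = var.prefix

  # Outputs of the data persistence module feed into the cumulus module.
  dynamo_tables = module.data_persistence.dynamo_tables
}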

Other modules

Other modules are provided as artifacts on the release page for users configuring their own deployments, and contain extracted subcomponents of the cumulus module. For more on these components, see the components documentation.

For more on the specific structure, usage examples, and deployment instructions, please see the deployment docs as well as the cumulus-template-deploy repo.

- + \ No newline at end of file diff --git a/docs/configuration/cloudwatch-retention/index.html b/docs/configuration/cloudwatch-retention/index.html index 448bfdcf222..aced97aa4dd 100644 --- a/docs/configuration/cloudwatch-retention/index.html +++ b/docs/configuration/cloudwatch-retention/index.html @@ -5,7 +5,7 @@ Cloudwatch Retention | Cumulus Documentation - + @@ -14,7 +14,7 @@ the retention period (in days) of cloudwatch log groups for lambdas and tasks which the cumulus, cumulus_distribution, and cumulus_ecs_service modules supports (using the cumulus module as an example):

module "cumulus" {
  # ... other variables
  default_log_retention_days       = var.default_log_retention_days
  cloudwatch_log_retention_periods = var.cloudwatch_log_retention_periods
}

Once the variables below are set in terraform.tfvars and the deployment is applied, the CloudWatch log groups will be created or updated with the new retention value.

default_log_retention_days

The variable default_log_retention_days sets the default log retention for all CloudWatch log groups managed by Cumulus when a custom, per-group value isn't provided. If this variable is not set either, retention defaults to 30 days. For example, if you would like the Cumulus module's log groups to have a retention period of one year, deploy the respective modules with the variable set as in the example below.

Example

default_log_retention_days = 365

cloudwatch_log_retention_periods

The retention period (in days) of CloudWatch log groups for specific Lambdas and tasks can be set during deployment using the cloudwatch_log_retention_periods Terraform map variable. To configure these values for the respective CloudWatch log groups, uncomment the cloudwatch_log_retention_periods variable and add an entry for each log group whose retention you want to change. The following keys are supported, each corresponding to a Lambda/task name (e.g. the log group "/aws/lambda/prefix-DiscoverPdrs" is configured with the key "DiscoverPdrs"):

  • ApiEndpoints
  • AsyncOperationEcsLogs
  • DiscoverPdrs
  • DistributionApiEndpoints
  • EcsLogs
  • granuleFilesCacheUpdater
  • HyraxMetadataUpdates
  • ParsePdr
  • PostToCmr
  • PrivateApiLambda
  • publishExecutions
  • publishGranules
  • publishPdrs
  • QueuePdrs
  • QueueWorkflow
  • replaySqsMessages
  • SyncGranule
  • UpdateCmrAccessConstraints
note

EcsLogs is used for all cumulus_ecs_service tasks cloudwatch log groups

Example

cloudwatch_log_retention_periods = {
ParsePdr = 365
}

The retention periods are the number of days for which you'd like to retain the logs in the specified log group. A list of possible values is available in the AWS logs documentation.

Version: v15.0.2

Collection Cost Tracking and Storage Best Practices

Organizing your data is important for metrics you may want to collect. AWS S3 storage and cost metrics are calculated at the bucket level, so it is easy to get metrics by bucket. You can get storage metrics at the key prefix level, but that is done through the CLI, which can be very slow for large buckets. It is very difficult to estimate costs at the prefix level.

Calculating Storage By Collection

By bucket

Usage by bucket can be obtained in your AWS Billing Dashboard via an S3 Usage Report. You can download your usage report for a period of time and review your storage and requests at the bucket level.

Bucket metrics can also be found in the AWS CloudWatch Metrics Console (also see Using Amazon CloudWatch Metrics).

Navigate to Storage Metrics and select the BucketName for all buckets you are interested in. The available metrics are BucketSizeInBytes and NumberOfObjects.

In the Graphed metrics tab, you can select the type of statistic (e.g. average, minimum, maximum) and the period for the stats. At the top, it's useful to select from the dropdown to view the metrics as a number. You can also select the time period for which you want to see stats.

Alternatively you can query CloudWatch using the CLI.

This command will return the average number of bytes in the bucket test-bucket for 7/31/2019:

aws cloudwatch get-metric-statistics --namespace AWS/S3 --start-time 2019-07-31T00:00:00 --end-time 2019-08-01T00:00:00 --period 86400 --statistics Average --region us-east-1 --metric-name BucketSizeBytes --dimensions Name=BucketName,Value=test-bucket Name=StorageType,Value=StandardStorage

The result looks like:

{
"Datapoints": [
{
"Timestamp": "2019-07-31T00:00:00Z",
"Average": 150996467959.0,
"Unit": "Bytes"
}
],
"Label": "BucketSizeBytes"
}

By key prefix

AWS does not offer storage and usage statistics at a key prefix level. Via the AWS CLI, you can get the total storage for a bucket or folder. The following command would get the storage for folder example-folder in bucket sample-bucket:

aws s3 ls --summarize --human-readable --recursive s3://sample-bucket/example-folder | grep 'Total'

Note that this can be a long-running operation for large buckets.

Calculating Cost By Collection

NASA NGAP Environment

If using an NGAP account, the cost per bucket can be found in your CloudTamer console, in the Financials section of your account information. This is calculated on a monthly basis.

There is no easy way to get the cost by folder in the buckets. You could calculate an estimate using the storage per prefix vs. the storage of the bucket.

Outside of NGAP

You can enable S3 Cost Allocation Tags and tag your buckets. From there, you can view the cost breakdown in your AWS Billing Dashboard via the Cost Explorer. Cost Allocation Tagging is available at the bucket level.
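
As a hedged illustration of bucket tagging with Terraform (the bucket name and tag key/value below are hypothetical, and the tag must still be activated as a cost allocation tag in the Billing console before it appears in Cost Explorer):

# Illustrative sketch only: bucket name and tag key/value are hypothetical.
resource "aws_s3_bucket" "mod09gq_006_protected" {
  bucket = "my-prefix-mod09gq-006-protected"

  tags = {
    Collection = "MOD09GQ-006"
  }
}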

There is no easy way to get the cost by folder in the buckets. You could calculate an estimate using the storage per prefix vs. the storage of the bucket.

Storage Configuration

Cumulus allows for the configuration of many buckets for your files. Buckets are created and added to your deployment as part of the deployment process.

In your Cumulus collection configuration, you specify where you want the files to be stored post-processing. This is done by matching a regular expression on the file with the configured bucket.

Note that in the collection configuration, the bucket field is the key to the buckets variable in the deployment's .tfvars file.
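
For illustration, a sketch of what such a buckets map might look like in terraform.tfvars follows (the keys and bucket names are placeholders); a collection file config with "bucket": "MOD09GQ-006-protected" would refer to the "MOD09GQ-006-protected" key, not to the literal bucket name:

# Sketch only: keys and bucket names are placeholders.
buckets = {
  internal = {
    name = "my-prefix-internal"
    type = "internal"
  }
  "MOD09GQ-006-protected" = {
    name = "my-prefix-mod09gq-006-protected"
    type = "protected"
  }
  "MOD09GQ-006-public" = {
    name = "my-prefix-mod09gq-006-public"
    type = "public"
  }
}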

Organizing By Bucket

You can specify separate groups of buckets for each collection, which could look like the example below.

{
"name": "MOD09GQ",
"version": "006",
"granuleId": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$",
"sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
"files": [
{
"bucket": "MOD09GQ-006-protected",
"regex": "^.*\\.hdf$",
"sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf"
},
{
"bucket": "MOD09GQ-006-private",
"regex": "^.*\\.hdf\\.met$",
"sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.met"
},
{
"bucket": "MOD09GQ-006-protected",
"regex": "^.*\\.cmr\\.xml$",
"sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.cmr.xml"
},
{
"bucket": "MOD09GQ-006-public",
"regex": "^*\\.jpg$",
"sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_ndvi.jpg"
}
]
}

Additional collections would go to different buckets.

Organizing by Key Prefix

Different collections can be organized into different folders in the same bucket, using the key prefix, which is specified as the url_path in the collection configuration. In this simplified collection configuration example, the url_path field is set at the top level so that all files go to a path prefixed with the collection name and version.

{
"name": "MOD09GQ",
"version": "006",
"granuleId": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$",
"url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}",
"sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
"files": [
{
"bucket": "protected",
"regex": "^.*\\.hdf$",
"sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf"
},
{
"bucket": "private",
"regex": "^.*\\.hdf\\.met$",
"sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.met"
},
{
"bucket": "protected",
"regex": "^.*\\.cmr\\.xml$",
"sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.cmr.xml"
},
{
"bucket": "public",
"regex": "^*\\.jpg$",
"sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_ndvi.jpg"
}
]
}

In this case, the path to all the files would be: MOD09GQ___006/<filename> in their respective buckets.

The url_path can be overridden directly on the file configuration. The example below produces the same result.

{
"name": "MOD09GQ",
"version": "006",
"granuleId": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$",
"sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
"files": [
{
"bucket": "protected",
"regex": "^.*\\.hdf$",
"sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
"url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
},
{
"bucket": "private",
"regex": "^.*\\.hdf\\.met$",
"sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.met",
"url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
},
{
"bucket": "protected-2",
"regex": "^.*\\.cmr\\.xml$",
"sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.cmr.xml",
"url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
},
{
"bucket": "public",
"regex": "^*\\.jpg$",
"sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_ndvi.jpg",
"url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
}
]
}
Version: v15.0.2

Cumulus Data Management Types

What Are The Cumulus Data Management Types

  • Collections: Collections are logical sets of data objects of the same data type and version. They provide contextual information used by Cumulus ingest.
  • Granules: Granules are the smallest aggregation of data that can be independently managed. They are always associated with a collection, which is a grouping of granules.
  • Providers: Providers generate and distribute input data that Cumulus obtains and sends to workflows.
  • Rules: Rules tell Cumulus how to associate providers and collections and when/how to start processing a workflow.
  • Workflows: Workflows are composed of one or more AWS Lambda Functions and ECS Activities to discover, ingest, process, manage, and archive data.
  • Executions: Executions are records of a workflow.
  • Reconciliation Reports: Reports are a comparison of data sets to check whether they are in agreement and to help Cumulus users detect conflicts.

Interaction

  • Providers tell Cumulus where to get new data - e.g. S3, HTTPS
  • Collections tell Cumulus where to store the data files
  • Rules tell Cumulus when to trigger a workflow execution and tie providers and collections together

Managing Data Management Types

The following are created via the dashboard or API:

  • Providers
  • Collections
  • Rules
  • Reconciliation reports

Granules are created by workflow executions and then can be managed via the dashboard or API.

An execution record is created for each workflow execution triggered and can be viewed in the dashboard or data can be retrieved via the API.

Workflows are created and managed via the Cumulus deployment.

Configuration Fields

Schemas

Looking at our API schema definitions can provide us with some insight into collections, providers, rules, and their attributes (and whether those are required or not). The schemas for the different concepts will be referenced throughout this document.

The schemas are extremely useful for understanding which attributes are configurable and which of those are required. Cumulus uses these schemas for validation.

Providers

Please note:

  • While connection configuration is defined here, things that are specific to a particular ingest setup (e.g. 'What target directory should we be pulling from?' or 'How is duplicate handling configured?') are generally defined in a Rule or Collection, not the Provider.
  • There is some provider behavior which is controlled by task-specific configuration and not the provider definition. This configuration has to be set on a per-workflow basis. For example, see the httpListTimeout configuration on the discover-granules task.

Provider Configuration

The Provider configuration is defined by a JSON object that takes different configuration keys depending on the provider type. The following are definitions of typical configuration values relevant for the various providers:

Configuration by provider type

S3

Key | Type | Required | Description
id | string | Yes | Unique identifier for the provider
globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus-compatible ingest lambdas are expected to make to a provider. Defaults to unlimited
protocol | string | Yes | The protocol for this provider. Must be s3 for this provider type
host | string | Yes | S3 bucket to pull data from

http

Key | Type | Required | Description
id | string | Yes | Unique identifier for the provider
globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus-compatible ingest lambdas are expected to make to a provider. Defaults to unlimited
protocol | string | Yes | The protocol for this provider. Must be http for this provider type
host | string | Yes | The host to pull data from (e.g. nasa.gov)
username | string | No | Configured username for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication
password | string | Only if username is specified | Configured password for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication
port | integer | No | Port to connect to the provider on. Defaults to 80
allowedRedirects | string[] | No | Only hosts in this list will have the provider username/password forwarded for authentication. Entries should be specified as host.com or host.com:7000 if the redirect port is different than the provider port
certificateUri | string | No | SSL Certificate S3 URI for custom or self-signed SSL (TLS) certificate

https

Key | Type | Required | Description
id | string | Yes | Unique identifier for the provider
globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus-compatible ingest lambdas are expected to make to a provider. Defaults to unlimited
protocol | string | Yes | The protocol for this provider. Must be https for this provider type
host | string | Yes | The host to pull data from (e.g. nasa.gov)
username | string | No | Configured username for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication
password | string | Only if username is specified | Configured password for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication
port | integer | No | Port to connect to the provider on. Defaults to 443
allowedRedirects | string[] | No | Only hosts in this list will have the provider username/password forwarded for authentication. Entries should be specified as host.com or host.com:7000 if the redirect port is different than the provider port
certificateUri | string | No | SSL Certificate S3 URI for custom or self-signed SSL (TLS) certificate

ftp

Key | Type | Required | Description
id | string | Yes | Unique identifier for the provider
globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus-compatible ingest lambdas are expected to make to a provider. Defaults to unlimited
protocol | string | Yes | The protocol for this provider. Must be ftp for this provider type
host | string | Yes | The ftp host to pull data from (e.g. nasa.gov)
username | string | No | Username to use to connect to the ftp server. Cumulus encrypts this using KMS. Defaults to anonymous if not defined
password | string | No | Password to use to connect to the ftp server. Cumulus encrypts this using KMS. Defaults to password if not defined
port | integer | No | Port to connect to the provider on. Defaults to 21

sftp

Key | Type | Required | Description
id | string | Yes | Unique identifier for the provider
globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus-compatible ingest lambdas are expected to make to a provider. Defaults to unlimited
protocol | string | Yes | The protocol for this provider. Must be sftp for this provider type
host | string | Yes | The sftp host to pull data from (e.g. nasa.gov)
username | string | No | Username to use to connect to the sftp server
password | string | No | Password to use to connect to the sftp server
port | integer | No | Port to connect to the provider on. Defaults to 22
privateKey | string | No | Filename of a private key, assumed to be in s3://bucketInternal/stackName/crypto
cmKeyId | string | No | AWS KMS Customer Master Key arn or alias

Collections

Breakdown of s3_MOD09GQ_006.json (https://github.com/nasa/cumulus/blob/master/example/data/collections/s3_MOD09GQ_006/s3_MOD09GQ_006.json)

Key | Value | Required | Description
name | "MOD09GQ" | Yes | The name attribute designates the name of the collection. This is the name under which the collection will be displayed on the dashboard
version | "006" | Yes | A version tag for the collection
granuleId | "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$" | Yes | The regular expression used to validate the granule ID extracted from filenames according to the granuleIdExtraction
granuleIdExtraction | "(MOD09GQ\..*)(\.hdf|\.cmr|_ndvi\.jpg)" | Yes | The regular expression used to extract the granule ID from filenames. The first capturing group extracted from the filename by the regex will be used as the granule ID.
sampleFileName | "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf" | Yes | An example filename belonging to this collection
files | <JSON Object> of files defined here | Yes | Describe the individual files that will exist for each granule in this collection (size, browse, meta, etc.)
dataType | "MOD09GQ" | No | Can be specified, but this value will default to the collection name if not
duplicateHandling | "replace" | No | ("replace" / "version" / "skip") determines granule duplicate handling scheme
ignoreFilesConfigForDiscovery | false (default) | No | By default, during discovery only files that match one of the regular expressions in this collection's files attribute (see above) are ingested. Setting this to true will ignore the files attribute during discovery, meaning that all files for a granule (i.e., all files with filenames matching granuleIdExtraction) will be ingested even when they don't match a regular expression in the files attribute at discovery time. (NOTE: this attribute does not appear in the example file, but is listed here for completeness.)
process | "modis" | No | Example options for this are found in the ChooseProcess step definition in the IngestAndPublish workflow definition
meta | <JSON Object> of MetaData for the collection | No | MetaData for the collection. This metadata will be available to workflows for this collection via the Cumulus Message Adapter.
url_path | "{cmrMetadata.Granule.Collection.ShortName}/{substring(file.fileName, 0, 3)}" | No | Filename without extension

files-object

Key | Value | Required | Description
regex | "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\.hdf$" | Yes | Regular expression used to identify the file
sampleFileName | "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf" | Yes | Filename used to validate the provided regex
type | "data" | No | Value to be assigned to the Granule File Type. CNM types are used by Cumulus CMR steps, non-CNM values will be treated as 'data' type. Currently only utilized in DiscoverGranules task
bucket | "internal" | Yes | Name of the bucket where the file will be stored
url_path | "${collectionShortName}/{substring(file.fileName, 0, 3)}" | No | Folder used to save the granule in the bucket. Defaults to the collection url_path
checksumFor | "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\.hdf$" | No | If this is a checksum file, set checksumFor to the regex of the target file.

Rules

Rules are used to start processing workflows and the transformation process. Rules can be invoked manually, based on a schedule, or can be configured to be triggered by events in Kinesis, SNS messages, or SQS messages.

Rule configuration

Key | Value | Required | Description
name | "L2_HR_PIXC_kinesisRule" | Yes | Name of the rule. This is the name under which the rule will be listed on the dashboard
workflow | "CNMExampleWorkflow" | Yes | Name of the workflow to be run. A list of available workflows can be found on the Workflows page
provider | "PODAAC_SWOT" | No | Configured provider's ID. This can be found on the Providers dashboard page
collection | <JSON Object> collection object shown below | Yes | Name and version of the collection this rule will moderate. Relates to a collection configured and found in the Collections page
payload | <JSON Object or Array> | No | The payload to be passed to the workflow
meta | <JSON Object> of MetaData for the rule | No | MetaData for the rule. This metadata will be available to workflows for this rule via the Cumulus Message Adapter.
rule | <JSON Object> rule type and associated values - discussed below | Yes | Object defining the type and subsequent attributes of the rule
state | "ENABLED" | No | ("ENABLED" / "DISABLED") whether or not the rule will be active. Defaults to "ENABLED".
queueUrl | https://sqs.us-east-1.amazonaws.com/1234567890/queue-name | No | URL for SQS queue that will be used to schedule workflows for this rule
tags | ["kinesis", "podaac"] | No | An array of strings that can be used to simplify search

collection-object

Key | Value | Required | Description
name | "L2_HR_PIXC" | Yes | Name of a collection defined/configured in the Collections dashboard page
version | "000" | Yes | Version number of a collection defined/configured in the Collections dashboard page

meta-object

Key | Value | Required | Description
retries | 3 | No | Number of retries on errors, for sqs-type rule only. Defaults to 3.
visibilityTimeout | 900 | No | VisibilityTimeout in seconds for the inflight messages, for sqs-type rule only. Defaults to the visibility timeout of the SQS queue when the rule is created.

rule-object

Key | Value | Required | Description
type | "kinesis" | Yes | ("onetime" / "scheduled" / "kinesis" / "sns" / "sqs") type of scheduling/workflow kick-off desired
value | <String> Object | Depends | Discussion of valid values is below

rule-value

The rule's value entry depends on the type of rule:

  • If this is a onetime rule this can be left blank. Example
  • If this is a scheduled rule this field must hold a valid cron-type expression or rate expression.
  • If this is a kinesis rule, this must be a configured ${Kinesis_stream_ARN}. Example
  • If this is an sns rule, this must be an existing ${SNS_Topic_Arn}. Example
  • If this is an sqs rule, this must be an existing ${SQS_QueueUrl} that your account has permissions to access, and also you must configure a dead-letter queue for this SQS queue. Example

sqs-type rule features

  • When an SQS rule is triggered, the SQS message remains on the queue.
  • The SQS message is not processed multiple times in parallel when visibility timeout is properly set. You should set the visibility timeout to the maximum expected length of the workflow with padding. Longer is better to avoid parallel processing.
  • The SQS message visibility timeout can be overridden by the rule.
  • Upon successful workflow execution, the SQS message is removed from the queue.
  • Upon failed execution(s), the workflow is re-run 3 times (or the configured number of times).
  • Upon failed execution(s), the visibility timeout will be set to 5s to allow retries.
  • After the configured number of failed retries, the SQS message is moved to the dead-letter queue configured for the SQS queue.
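
Because an sqs rule requires a queue with a dead-letter queue configured, a minimal Terraform sketch of such a queue pair might look like the following (queue names, visibility timeout, and maxReceiveCount are placeholder values):

# Illustrative sketch only: names, timeout, and maxReceiveCount are placeholders.
resource "aws_sqs_queue" "rule_input_dlq" {
  name = "my-prefix-rule-input-dead-letter-queue"
}

resource "aws_sqs_queue" "rule_input" {
  name = "my-prefix-rule-input-queue"

  # Set to at least the maximum expected workflow duration, with padding
  visibility_timeout_seconds = 1800

  redrive_policy = jsonencode({
    deadLetterTargetArn = aws_sqs_queue.rule_input_dlq.arn
    maxReceiveCount     = 3
  })
}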

Configuration Via Cumulus Dashboard

Create A Provider

  • In the Cumulus dashboard, go to the Provider page.

Screenshot of Create Provider form

  • Click on Add Provider.
  • Fill in the form and then submit it.

Screenshot of Create Provider form

Create A Collection

  • Go to the Collections page.

Screenshot of the Collections page

  • Click on Add Collection.
  • Copy and paste or fill in the collection JSON object form.

Screenshot of Add Collection form

  • Once you submit the form, you should be able to verify that your new collection is in the list.

Create A Rule

  1. Go To Rules Page
  • Go to the Cumulus dashboard, click on Rules in the navigation.
  • Click Add Rule.

Screenshot of Rules page

  2. Complete Form
  • Fill out the template form.

Screenshot of a Rules template for adding a new rule

For more details regarding the field definitions and required information go to Data Cookbooks.

Note: If the state field is left blank, it defaults to false.

Rule Examples

  • A rule form with completed required fields:

Screenshot of a completed rule form

  • A successfully added Rule:

Screenshot of created rule

Version: v15.0.2

Setting S3 Lifecycle Policies

This document will outline, in brief, how to set data lifecycle policies so that you are more easily able to control data storage costs while keeping your data accessible. For more information on why you might want to do this, see the 'Additional Information' section at the end of the document.

Requirements

  • The AWS CLI installed and configured (if you wish to run the CLI example). See AWS's guide to setting up the AWS CLI for more on this. Please ensure the AWS CLI is in your shell path.
  • You will need an S3 bucket on AWS. You are strongly encouraged to use a bucket without voluminous amounts of data in it for experimenting/learning.
  • An AWS user with the appropriate roles to access the target bucket as well as modify bucket policies.

Examples

Walk-through on setting time-based S3 Infrequent Access (S3IA) bucket policy

This example will give step-by-step instructions on updating a bucket's lifecycle policy to move all objects in the bucket from the default storage to S3 Infrequent Access (S3IA) after a period of 90 days. Below are instructions for walking through configuration via the command line and the management console.

Command Line

Please ensure you have the AWS CLI installed and configured for access prior to attempting this example.

Create policy

From any directory you choose, open an editor and add the following to a file named exampleRule.json:

{
"Rules": [
{
"Status": "Enabled",
"Filter": {
"Prefix": ""
},
"Transitions": [
{
"Days": 90,
"StorageClass": "STANDARD_IA"
}
],
"NoncurrentVersionTransitions": [
{
"NoncurrentDays": 90,
"StorageClass": "STANDARD_IA"
}
],
"ID": "90DayS3IAExample"
}
]
}

Set policy

On the command line run the following command (with the bucket you're working with substituted in place of yourBucketNameHere).

aws s3api put-bucket-lifecycle-configuration --bucket yourBucketNameHere --lifecycle-configuration file://exampleRule.json

Verify policy has been set

To obtain all of the existing policies for a bucket, run the following command (again substituting the correct bucket name):

 $ aws s3api get-bucket-lifecycle-configuration --bucket yourBucketNameHere
{
"Rules": [
{
"Status": "Enabled",
"Filter": {
"Prefix": ""
},
"Transitions": [
{
"Days": 90,
"StorageClass": "STANDARD_IA"
}
],
"NoncurrentVersionTransitions": [
{
"NoncurrentDays": 90,
"StorageClass": "STANDARD_IA"
}
],
"ID": "90DayS3IAExample"
}
]
}

You have set a policy that transitions any version of an object in the bucket to S3IA after each object version has not been modified for 90 days.

Management Console

Create Policy

To create the example policy on a bucket via the management console, go to the following URL (replacing 'yourBucketHere' with the bucket you intend to update):

https://s3.console.aws.amazon.com/s3/buckets/yourBucketHere/?tab=overview

You should see a screen similar to:

Screenshot of AWS console for an S3 bucket

Click the "Management" Tab, then lifecycle button and press + Add lifecycle rule:

Screenshot of &quot;Management&quot; tab of AWS console for an S3 bucket

Give the rule a name (e.g. '90DayRule'), leaving the filter blank:

Screenshot of window for configuring the name and scope of a lifecycle rule on an S3 bucket in the AWS console

Click next, and mark Current Version and Previous Versions.

Then for each, click + Add transition and select Transition to Standard-IA after for the Object creation field, and set 90 for the Days after creation/Days after objects become noncurrent field. Your screen should look similar to:

Screenshot of window for configuring the storage class transitions of a lifecycle rule on an S3 bucket in the AWS console

Click next, then next past the Configure expiration screen (we won't be setting this), and on the fourth page, click Save:

Screenshot of window for reviewing the configuration of a lifecycle rule on an S3 bucket in the AWS console

You should now see you have a rule configured for your bucket:

Screenshot of lifecycle rule appearing in the &quot;Management&quot; tab of AWS console for an S3 bucket

You have now set a policy that transitions any version of an object in the bucket to S3IA after each object has not been modified for 90 days.

Additional Information

This section lists information you may want prior to enacting lifecycle policies. It is not required content for working through the examples.

Strategy Overview

For a discussion of overall recommended strategy, please review the Methodology for Data Lifecycle Management on the EarthData wiki.

AWS Documentation

The examples shown in this document are fairly basic cases. By using object tags, filters, and other configuration options, you can enact far more complicated policies for various scenarios. For more reading on the topics presented on this page, see the AWS documentation on S3 object lifecycle management.

Version: v15.0.2

Monitoring Best Practices

This document intends to provide a set of recommendations and best practices for monitoring the state of a deployed Cumulus and diagnosing any issues.

Cumulus-provided resources and integrations for monitoring

Cumulus provides a number of resources that are useful for monitoring the system and its operation.

Cumulus Dashboard

The primary tool for monitoring the Cumulus system is the Cumulus Dashboard. The dashboard is hosted on Github and includes instructions on how to deploy and link it into your core Cumulus deployment.

The dashboard displays workflow executions, their status, inputs, outputs, and some diagnostic information such as logs. For further information on the dashboard, its usage, and the information it provides, see the documentation.

Cumulus-provided AWS resources

Cumulus sets up CloudWatch log groups for all Core-provided tasks.

Monitoring Lambda Functions

Logging for each Lambda Function is available in Lambda-specific CloudWatch log groups.

Monitoring ECS services

Each deployed cumulus_ecs_service module also includes a CloudWatch log group for the processes running on ECS.

Monitoring workflows

For advanced debugging, we also configure dead letter queues on critical system functions. These will allow you to monitor and debug invalid inputs to the functions we use to start workflows, which can be helpful if you find that you are not seeing workflows being started as expected. More information on these can be found in the dead letter queue documentation.

AWS recommendations

AWS has a number of recommendations on system monitoring. Rather than reproduce those here and risk providing outdated guidance, we've documented the following links which will take you to available AWS docs on monitoring recommendations and best practices for the services used in Cumulus:

Example: Setting up email notifications for CloudWatch logs

Cumulus does not provide out-of-the-box support for email notifications at this time. However, setting up email notifications on AWS is fairly straightforward in that the operative components are an AWS SNS topic and a subscribed email address.

In terms of Cumulus integration, forwarding CloudWatch logs requires creating a mechanism, most likely a Lambda Function subscribed to the log group that will receive, filter and forward these messages to the SNS topic.

As a very simple example, we could create a function that filters CloudWatch logs created by the @cumulus/logger package and sends email notifications for error and fatal log levels, adapting the example linked above:

const zlib = require('zlib');
const aws = require('aws-sdk');
const { promisify } = require('util');

const gunzip = promisify(zlib.gunzip);
const sns = new aws.SNS();

exports.handler = async (event) => {
const payload = Buffer.from(event.awslogs.data, 'base64');
const decompressedData = await gunzip(payload);
const logData = JSON.parse(decompressedData.toString('ascii'));
return await Promise.all(logData.logEvents.map(async (logEvent) => {
const logMessage = JSON.parse(logEvent.message);
if (['error', 'fatal'].includes(logMessage.level)) {
return sns.publish({
TopicArn: process.env.EmailReportingTopicArn,
Message: logEvent.message
}).promise();
}
return Promise.resolve();
}));
};

After creating the SNS topic, we can deploy this code as a lambda function, following the setup steps from Amazon. Make sure to include your SNS topic ARN as an environment variable on the lambda function by using the --environment option on aws lambda create-function.

You will need to create subscription filters for each log group you want to receive emails for. We recommend automating this as much as possible, and you could very well handle this via Terraform, such as using a module to deploy filters alongside log groups, or exporting the log group names to an all-in-one email notification module.
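
As a hedged sketch of the Terraform approach (the Lambda function and log group referenced below are placeholders for your actual notification function and a log group you want forwarded):

# Illustrative sketch only: the function and log group references are placeholders.
resource "aws_lambda_permission" "allow_cloudwatch_logs" {
  statement_id  = "AllowExecutionFromCloudWatchLogs"
  action        = "lambda:InvokeFunction"
  function_name = aws_lambda_function.log_email_notifier.function_name
  principal     = "logs.amazonaws.com"
  source_arn    = "${aws_cloudwatch_log_group.example_task.arn}:*"
}

resource "aws_cloudwatch_log_subscription_filter" "email_notifier" {
  name            = "${var.prefix}-error-email-filter"
  log_group_name  = aws_cloudwatch_log_group.example_task.name
  # An empty pattern forwards all events; the Lambda itself filters by log level
  filter_pattern  = ""
  destination_arn = aws_lambda_function.log_email_notifier.arn
  depends_on      = [aws_lambda_permission.allow_cloudwatch_logs]
}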

Version: v15.0.2

S3 Server Access Logging

Via AWS Console

Enable server access logging for an S3 bucket

Via AWS Command Line Interface

  1. Create a logging.json file with these contents, replacing <stack-internal-bucket> with your stack's internal bucket name, and <stack> with the name of your cumulus stack.

    {
    "LoggingEnabled": {
    "TargetBucket": "<stack-internal-bucket>",
    "TargetPrefix": "<stack>/ems-distribution/s3-server-access-logs/"
    }
    }
  2. Add the logging policy to each of your protected and public buckets by calling this command on each bucket.

    aws s3api put-bucket-logging --bucket <protected/public-bucket-name> --bucket-logging-status file://logging.json
  3. Verify the logging policy exists on your buckets.

    aws s3api get-bucket-logging --bucket <protected/public-bucket-name>
Version: v15.0.2

Configuration of Tasks

The cumulus module exposes values for configuration for some of the provided archive and ingest tasks. Currently the following are available as configurable variables:

cmr_search_client_config

Configuration parameters for CMR search client for cumulus archive module tasks in the form:

<lambda_identifier>_report_cmr_limit = <maximum number of records that can be returned from a cmr-client search; this should be greater than cmr_page_size>
<lambda_identifier>_report_cmr_page_size = <number of records for each page returned from CMR>
type = map(string)

More information about the cmr limit and cmr page_size can be found in @cumulus/cmr-client and the CMR Search API documentation.

Currently the following values are supported:

  • create_reconciliation_report_cmr_limit
  • create_reconciliation_report_cmr_page_size

Example

cmr_search_client_config = {
create_reconciliation_report_cmr_limit = 2500
create_reconciliation_report_cmr_page_size = 250
}

elasticsearch_client_config

Configuration parameters for Elasticsearch client for cumulus archive module tasks in the form:

<lambda_identifier>_es_scroll_duration = <duration>
<lambda_identifier>_es_scroll_size = <size>
type = map(string)

Currently the following values are supported:

  • create_reconciliation_report_es_scroll_duration
  • create_reconciliation_report_es_scroll_size

Example

elasticsearch_client_config = {
create_reconciliation_report_es_scroll_duration = "15m"
create_reconciliation_report_es_scroll_size = 2000
}

lambda_timeouts

A configurable map of timeouts (in seconds) for cumulus ingest module task lambdas in the form:

<lambda_identifier>_timeout: <timeout>
type = map(string)

Currently the following values are supported:

  • add_missing_file_checksums_task_timeout
  • discover_granules_task_timeout
  • discover_pdrs_task_timeout
  • fake_processing_task_timeout
  • files_to_granules_task_timeout
  • hello_world_task_timeout
  • hyrax_metadata_update_tasks_timeout
  • lzards_backup_task_timeout
  • move_granules_task_timeout
  • parse_pdr_task_timeout
  • pdr_status_check_task_timeout
  • post_to_cmr_task_timeout
  • queue_granules_task_timeout
  • queue_pdrs_task_timeout
  • queue_workflow_task_timeout
  • sf_sqs_report_task_timeout
  • sync_granule_task_timeout
  • update_granules_cmr_metadata_file_links_task_timeout

Example

lambda_timeouts = {
discover_granules_task_timeout = 300
}

lambda_memory_sizes

A configurable map of memory sizes (in MBs) for cumulus ingest module task lambdas in the form:

<lambda_identifier>_memory_size: <memory_size>
type = map(string)

Currently the following values are supported:

  • add_missing_file_checksums_task_memory_size
  • discover_granules_task_memory_size
  • discover_pdrs_task_memory_size
  • fake_processing_task_memory_size
  • hyrax_metadata_updates_task_memory_size
  • lzards_backup_task_memory_size
  • move_granules_task_memory_size
  • parse_pdr_task_memory_size
  • pdr_status_check_task_memory_size
  • post_to_cmr_task_memory_size
  • queue_granules_task_memory_size
  • queue_pdrs_task_memory_size
  • queue_workflow_task_memory_size
  • sf_sqs_report_task_memory_size
  • sync_granule_task_memory_size
  • update_cmr_acess_constraints_task_memory_size
  • update_granules_cmr_metadata_file_links_task_memory_size

Example

lambda_memory_sizes = {
queue_granules_task_memory_size = 1036
}
Version: v15.0.2

About Cookbooks

Introduction

The following data cookbooks are documents containing examples and explanations of workflows in the Cumulus framework. Additionally, they should serve to help unify an institution/user group on a set of terms.

Setup

The data cookbooks assume you can configure providers, collections, and rules to run workflows. Visit Cumulus data management types for information on how to configure Cumulus data management types.

Adding a page

As shown in detail in the "Add a New Page and Sidebars" section in Cumulus Docs: How To's, you can add a new page to the data cookbook by creating a markdown (.md) file in the docs/data-cookbooks directory. The new page can then be linked to the sidebar by adding it to the Data-Cookbooks object in the website/sidebar.json file as data-cookbooks/${id}.

More about workflows

Workflow general information

Input & Output

Developing Workflow Tasks

Workflow Configuration How-to's

Ingest Browse Generation

… provider keys with the previously entered values.) Note that you need to set the "provider_path" to the path on your bucket (e.g. "/data") where you've staged your mock/test data:

{
"name": "TestBrowseGeneration",
"workflow": "DiscoverGranulesBrowseExample",
"provider": "{{provider_from_previous_step}}",
"collection": {
"name": "MOD09GQ",
"version": "006"
},
"meta": {
"provider_path": "{{path_to_data}}"
},
"rule": {
"type": "onetime"
},
"state": "ENABLED",
"updatedAt": 1553053438767
}

Run Workflows

Once you've configured the Collection and Provider and added a onetime rule, you're ready to trigger your rule, and watch the ingest workflows process.

Go to the Rules tab, click the rule you just created:

Screenshot of the Rules overview page with a list of rules in the Cumulus dashboard

Then click the gear in the upper right corner and click "Rerun":

Screenshot of clicking the button to rerun a workflow rule from the rule edit page in the Cumulus dashboard

Tab over to executions and you should see the DiscoverGranulesBrowseExample workflow run, succeed, and then moments later the CookbookBrowseExample should run and succeed.

Screenshot of page listing executions in the Cumulus dashboard

Results

You can verify your data has ingested by clicking the successful workflow entry:

Screenshot of individual entry from table listing executions in the Cumulus dashboard

Select "Show Output" on the next page

Screenshot of &quot;Show output&quot; button from individual execution page in the Cumulus dashboard

and you should see in the payload from the workflow something similar to:

"payload": {
"process": "modis",
"granules": [
{
"files": [
{
"fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
"key": "MOD09GQ___006/2017/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
"type": "data",
"bucket": "cumulus-test-sandbox-protected",
"path": "data",
"url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}/{substring(file.fileName, 0, 3)}",
"size": 1908635
},
{
"fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
"key": "MOD09GQ___006/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
"type": "metadata",
"bucket": "cumulus-test-sandbox-private",
"path": "data",
"url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{substring(file.fileName, 0, 3)}",
"size": 21708
},
{
"fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
"key": "MOD09GQ___006/2017/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
"type": "browse",
"bucket": "cumulus-test-sandbox-protected",
"path": "data",
"url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}/{substring(file.fileName, 0, 3)}",
"size": 1908635
},
{
"fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.cmr.xml",
"key": "MOD09GQ___006/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.cmr.xml",
"type": "metadata",
"bucket": "cumulus-test-sandbox-protected-2",
"url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{substring(file.fileName, 0, 3)}"
}
],
"cmrLink": "https://cmr.uat.earthdata.nasa.gov/search/granules.json?concept_id=G1222231611-CUMULUS",
"cmrConceptId": "G1222231611-CUMULUS",
"granuleId": "MOD09GQ.A2016358.h13v04.006.2016360104606",
"cmrMetadataFormat": "echo10",
"dataType": "MOD09GQ",
"version": "006",
"published": true
}
]
}

You can verify the granules exist within your cumulus instance (search using the Granules interface, check the S3 buckets, etc.) and validate that the above CMR entry has been published.


Build Processing Lambda

This section discusses the construction of a custom processing lambda to replace the contrived example from this entry for a real dataset processing task.

To ingest your own data using this example, you will need to construct your own lambda to replace the source in ProcessingStep that will generate browse imagery and provide or update a CMR metadata export file.

You will then need to add the lambda to your Cumulus deployment as an aws_lambda_function Terraform resource.
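
A minimal sketch of such a resource follows; the function name, packaging, handler, runtime, and role below are placeholders for your own processing lambda and IAM setup:

# Illustrative sketch only: filename, handler, runtime, and role are placeholders.
resource "aws_lambda_function" "browse_processing" {
  function_name    = "${var.prefix}-BrowseProcessing"
  filename         = "browse_processing.zip"
  source_code_hash = filebase64sha256("browse_processing.zip")
  handler          = "index.handler"
  runtime          = "nodejs16.x"
  role             = var.lambda_processing_role_arn
  timeout          = 300
  memory_size      = 512
}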

The discussion below outlines requirements for this lambda.

Inputs

The incoming message to the task defined in the ProcessingStep as configured will have the following configuration values (accessible inside event.config courtesy of the message adapter):

Configuration

  • event.config.bucket -- the name of the bucket configured in terraform.tfvars as your internal bucket.

  • event.config.collection -- The full collection object we will configure in the Configure Ingest section. You can view the expected collection schema in the docs here or in the source code on github. You need this as available input and output so you can update as needed.

event.config.additionalUrls, generateFakeBrowse and event.config.cmrMetadataFormat from the example can be ignored as they're configuration flags for the provided example script.

Payload

The 'payload' from the previous task is accessible via event.input. The expected payload output schema from SyncGranules can be viewed here.

In our example, the payload would look like the following. Note: The types are set per-file based on what we configured in our collection, and were initially added as part of the DiscoverGranules step in the DiscoverGranulesBrowseExample workflow.

 "payload": {
"process": "modis",
"granules": [
{
"granuleId": "MOD09GQ.A2016358.h13v04.006.2016360104606",
"dataType": "MOD09GQ",
"version": "006",
"files": [
{
"fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
"bucket": "cumulus-test-sandbox-internal",
"key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
"size": 1908635
},
{
"fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
"bucket": "cumulus-test-sandbox-internal",
"key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
"size": 21708
}
]
}
]
}

Generating Browse Imagery

The example script provided goes through all granules and adds a 'fake' .jpg browse file to the same staging location as the data staged by prior ingest tasks.

The processing lambda you construct will need to do the following:

  • Create a browse image file based on the input data, and stage it to a location accessible to both this task and the FilesToGranules and MoveGranules tasks in a S3 bucket.
  • Add the browse file to the input granule files, making sure to set the granule file's type to browse.
  • Update meta.input_granules with the updated granules list, as well as provide the files to be integrated by FilesToGranules as output from the task.

Generating/updating CMR metadata

If you do not already have a CMR file in the granules list, you will need to generate one for valid export. This example's processing script generates and adds it to the FilesToGranules file list via the payload but it can be present in the InputGranules from the DiscoverGranules task as well if you'd prefer to pre-generate it.

The downstream tasks MoveGranules, UpdateGranulesCmrMetadataFileLinks, and PostToCmr all expect a valid CMR file to be available if you want to export to CMR.

Expected Outputs for processing task/tasks

In the above example, the critical portion of the output to FilesToGranules is the payload and meta.input_granules.

In the example provided, the processing task is set up to return an object with the keys "files" and "granules". In the cumulus_message configuration, files is mapped to the payload and granules to meta.input_granules:

          "task_config": {
"inputGranules": "{$.meta.input_granules}",
"granuleIdExtraction": "{$.meta.collection.granuleIdExtraction}"
}

Their expected values from the example above may be useful in constructing a processing task:

payload

The payload includes a full list of files to be 'moved' into the cumulus archive. The FilesToGranules task will take this list, merge it with the information from InputGranules, then pass that list to the MoveGranules task. The MoveGranules task will then move the files to their targets. The UpdateGranulesCmrMetadataFileLinks task will update the CMR metadata file, if it exists, with the updated granule locations and the CMR file etags.

In the provided example, a payload being passed to the FilesToGranules task should be expected to look like:

  "payload": [
"s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
"s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
"s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
"s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.cmr.xml"
]

This is the list of files FilesToGranules will act upon to add/merge with the input_granules object.

The pathing is generated from sync-granules, but in principle the files can be staged wherever you like so long as the processing/MoveGranules task's roles have access and the filename matches the collection configuration.

input_granules

The FilesToGranules task utilizes the incoming payload to choose which files to move, but pulls all other metadata from meta.input_granules. As such, the input_granules output in the example would look like:

"input_granules": [
{
"granuleId": "MOD09GQ.A2016358.h13v04.006.2016360104606",
"dataType": "MOD09GQ",
"version": "006",
"files": [
{
"fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
"bucket": "cumulus-test-sandbox-internal",
"key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
"size": 1908635
},
{
"fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
"bucket": "cumulus-test-sandbox-internal",
"key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
"size": 21708
},
{
"fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
"bucket": "cumulus-test-sandbox-internal",
"key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.jpg"
}
]
}
],
Version: v15.0.2

Choice States

Cumulus supports AWS Step Function Choice states. A Choice state enables branching logic in Cumulus workflows.

Choice state definitions include a list of Choice Rules. Each Choice Rule defines a logical operation which compares an input value against a value using a comparison operator. For available comparison operators, review the AWS docs.

If the comparison evaluates to true, the Next state is followed.

Example

In examples/cumulus-tf/parse_pdr_workflow.tf the ParsePdr workflow uses a Choice state, CheckAgainChoice, to terminate the workflow once meta.isPdrFinished: true is returned by the CheckStatus state.

The CheckAgainChoice state definition requires an input object of the following structure:

{
"meta": {
"isPdrFinished": false
}
}

Given the above input to the CheckAgainChoice state, the workflow would transition to the PdrStatusReport state.

"CheckAgainChoice": {
"Type": "Choice",
"Choices": [
{
"Variable": "$.meta.isPdrFinished",
"BooleanEquals": false,
"Next": "PdrStatusReport"
},
{
"Variable": "$.meta.isPdrFinished",
"BooleanEquals": true,
"Next": "WorkflowSucceeded"
}
],
"Default": "WorkflowSucceeded"
}

Advanced: Loops in Cumulus Workflows

Understanding the complete ParsePdr workflow is not necessary to understanding how Choice states work, but ParsePdr provides an example of how Choice states can be used to create a loop in a Cumulus workflow.

In the complete ParsePdr workflow definition, the state QueueGranules is followed by CheckStatus. From CheckStatus a loop starts: as long as CheckStatus returns meta.isPdrFinished: false, CheckStatus is followed by CheckAgainChoice, then PdrStatusReport, then WaitForSomeTime, which returns to CheckStatus. Once CheckStatus returns meta.isPdrFinished: true, CheckAgainChoice proceeds to WorkflowSucceeded.

Execution graph of SIPS ParsePdr workflow in AWS Step Functions console

Further documentation

For complete details on Choice state configuration options, see the Choice state documentation.

Version: v15.0.2

CNM Workflow

This entry documents how to set up a workflow that utilizes the built-in CNM/Kinesis functionality in Cumulus.

Prior to working through this entry you should be familiar with the Cloud Notification Mechanism.

Sections


Prerequisites

Cumulus

This entry assumes you have a deployed instance of Cumulus (version >= 1.16.0). The entry assumes you are deploying Cumulus via the cumulus terraform module sourced from the release page.

AWS CLI

This entry assumes you have the AWS CLI installed and configured. If you do not, please take a moment to review the documentation - particularly the examples relevant to Kinesis - and install it now.

Kinesis

This entry assumes you already have two Kinesis data streams created for use as the CNM notification and response data streams.

If you do not have two streams set up, please take a moment to review the Kinesis documentation and set up two basic single-shard streams for this example:

Using the "Create Data Stream" button on the Kinesis Dashboard, work through the dialogue.

You should be able to quickly use the "Create Data Stream" button on the Kinesis Dashboard, and setup streams that are similar to the following example:

Screenshot of AWS console page for creating a Kinesis stream

Please bear in mind that your {{prefix}}-lambda-processing IAM role will need permissions to write to the response stream for this workflow to succeed if you create the Kinesis stream with a dashboard user. If you are using the cumulus top-level module for your deployment this should be set properly.

If not, the most straightforward approach is to attach the AmazonKinesisFullAccess policy for the stream resource to whatever role your Lambdas are using; however, your environment/security policies may require an approach specific to your deployment environment.
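
For example, a sketch of attaching the managed policy with Terraform (the role name below is a placeholder for whatever role your Lambdas actually use):

# Illustrative sketch only: replace the role with the role your Lambdas use.
resource "aws_iam_role_policy_attachment" "lambda_kinesis_full_access" {
  role       = "${var.prefix}-lambda-processing"
  policy_arn = "arn:aws:iam::aws:policy/AmazonKinesisFullAccess"
}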

In operational environments it's likely science data providers would typically be responsible for providing a Kinesis stream with the appropriate permissions.

For more information on how this process works and how to develop a process that will add records to a stream, read the Kinesis documentation and the developer guide.

Source Data

This entry will run the SyncGranule task against a single target data file. To that end it will require a single data file to be present in an S3 bucket matching the Provider configured in the next section.

Collection and Provider

Cumulus will need to be configured with a Collection and Provider entry of your choosing. The provider should match the location of the source data from the Ingest Source Data section.

This can be done via the Cumulus Dashboard if installed or the API. It is strongly recommended to use the dashboard if possible.


Configure the Workflow

Provided the prerequisites have been fulfilled, you can begin adding the needed values to your Cumulus configuration to configure the example workflow.

The following are steps that are required to set up your Cumulus instance to run the example workflow:

Example CNM Workflow

In this example, we're going to trigger a workflow by creating a Kinesis rule and sending a record to a Kinesis stream.

The following workflow definition should be added to a new .tf workflow resource (e.g. cnm_workflow.tf) in your deployment directory. For the complete CNM workflow example, see examples/cumulus-tf/cnm_workflow.tf.

Add the following to the new terraform file in your deployment directory, updating the following:

  • Set the response-endpoint key in the CnmResponse task in the workflow JSON to match the name of the Kinesis response stream you configured in the prerequisites section
  • Update the source key to the workflow module to match the Cumulus release associated with your deployment.
module "cnm_workflow" {
source = "https://github.com/nasa/cumulus/releases/download/{version}/terraform-aws-cumulus-workflow.zip"

prefix = var.prefix
name = "CNMExampleWorkflow"
workflow_config = module.cumulus.workflow_config
system_bucket = var.system_bucket

state_machine_definition = <<JSON
{
"CNMExampleWorkflow": {
"Comment": "CNMExampleWorkflow",
"StartAt": "TranslateMessage",
"States": {
"TranslateMessage": {
"Type": "Task",
"Resource": "${aws_lambda_function.cnm_to_cma_task.arn}",
"Parameters": {
"cma": {
"event.$": "$",
"task_config": {
"collection": "{$.meta.collection}",
"cumulus_message": {
"outputs": [
{
"source": "{$.cnm}",
"destination": "{$.meta.cnm}"
},
{
"source": "{$}",
"destination": "{$.payload}"
}
]
}
}
}
},
"Catch": [
{
"ErrorEquals": [
"States.ALL"
],
"ResultPath": "$.exception",
"Next": "CnmResponse"
}
],
"Next": "SyncGranule"
},
"SyncGranule": {
"Parameters": {
"cma": {
"event.$": "$",
"task_config": {
"provider": "{$.meta.provider}",
"buckets": "{$.meta.buckets}",
"collection": "{$.meta.collection}",
"downloadBucket": "{$.meta.buckets.private.name}",
"stack": "{$.meta.stack}",
"cumulus_message": {
"outputs": [
{
"source": "{$.granules}",
"destination": "{$.meta.input_granules}"
},
{
"source": "{$}",
"destination": "{$.payload}"
}
]
}
}
}
},
"Type": "Task",
"Resource": "${module.cumulus.sync_granule_task.task_arn}",
"Retry": [
{
"ErrorEquals": [
"States.ALL"
],
"IntervalSeconds": 10,
"MaxAttempts": 3
}
],
"Catch": [
{
"ErrorEquals": [
"States.ALL"
],
"ResultPath": "$.exception",
"Next": "CnmResponse"
}
],
"Next": "CnmResponse"
},
"CnmResponse": {
"Parameters": {
"cma": {
"event.$": "$",
"task_config": {
"OriginalCNM": "{$.meta.cnm}",
"distribution_endpoint": "{$.meta.distribution_endpoint}",
"response-endpoint": "ADD YOUR RESPONSE STREAM NAME HERE",
"region": "us-east-1",
"type": "kinesis",
"WorkflowException": "{$.exception}",
"cumulus_message": {
"outputs": [
{
"source": "{$.cnm}",
"destination": "{$.meta.cnmResponse}"
},
{
"source": "{$.input.input}",
"destination": "{$.payload}"
}
]
}
}
}
},
"Type": "Task",
"Resource": "${aws_lambda_function.cnm_response_task.arn}",
"Retry": [
{
"ErrorEquals": [
"States.ALL"
],
"IntervalSeconds": 5,
"MaxAttempts": 3
}
],
"End": true
}
}
}
JSON
}

Again, please make sure to modify the response-endpoint value to match the stream name (not the ARN) of your Kinesis response stream.

Lambda Configuration

To execute this workflow, you must include several Lambda resources in your deployment. To do this, add the task (Lambda) definitions described below to your deployment along with the workflow you created above.

Please note: To utilize these tasks you need to ensure you have a compatible CMA layer. See the deployment instructions for more details on how to deploy a CMA layer.

Below is a description of each of these tasks:

CNMToCMA

CNMToCMA is meant for the beginning of a workflow: it maps CNM granule information to a payload for downstream tasks. For other CNM workflows, you would need to ensure that downstream tasks in your workflow either understand the CNM message or include a translation task like this one.

You can also manipulate the data sent to downstream tasks using task_config for various states in your workflow resource configuration. Read more about how to configure data on the Workflow Input & Output page.

CnmResponse

The CnmResponse Lambda generates a CNM response message and puts it on the response-endpoint Kinesis stream.

You can read more about the expected schema of a CnmResponse record in the Cloud Notification Mechanism schema repository.

Additional Tasks

Lastly, this entry also makes use of the SyncGranule task from the cumulus module.

Redeploy

Once the above configuration changes have been made, redeploy your stack.

Please refer to Update Cumulus resources in the deployment documentation if you are unfamiliar with redeployment.

Rule Configuration

Cumulus includes a messageConsumer Lambda function (message-consumer). Cumulus kinesis-type rules create the event source mappings between Kinesis streams and the messageConsumer Lambda. The messageConsumer Lambda consumes records from one or more Kinesis streams, as defined by enabled kinesis-type rules. When new records are pushed to one of these streams, the messageConsumer triggers workflows associated with the enabled kinesis-type rules.

To add a rule via the dashboard (if you'd like to use the API, see the docs here), navigate to the Rules page and click Add a rule, then configure the new rule using the following template (substituting correct values for parameters denoted by {{ }}):

{
"collection": {
"name": "L2_HR_PIXC",
"version": "000"
},
"name": "L2_HR_PIXC_kinesisRule",
"provider": "PODAAC_SWOT",
"rule": {
"type": "kinesis",
"value": "arn:aws:kinesis:{{awsRegion}}:{{awsAccountId}}:stream/{{streamName}}"
},
"state": "ENABLED",
"workflow": "CNMExampleWorkflow"
}

Please Note:

  • The rule's value attribute must match the Amazon Resource Name (ARN) for the Kinesis data stream you've preconfigured. You should be able to obtain this ARN from the Kinesis Dashboard entry for the selected stream.
  • The collection and provider should match the collection and provider you set up in the Prerequisites section.

Once you've clicked 'Submit', a new rule should appear in the dashboard's Rule Overview.


Execute the Workflow

Once Cumulus has been redeployed and a rule has been added, we're ready to trigger the workflow and watch it execute.

How to Trigger the Workflow

To trigger matching workflows, you will need to put a record on the Kinesis stream that the message-consumer Lambda will recognize as a matching event. Most importantly, it should include a collection name that matches a valid collection.

For the purpose of this example, the easiest way to accomplish this is using the AWS CLI.

Create Record JSON

Construct a JSON file containing an object that matches the values that have been previously setup. This JSON object should be a valid Cloud Notification Mechanism message.

Please note: this example is somewhat contrived, as the downstream tasks don't care about most of these fields. A 'real' data ingest workflow would.

The following values (denoted by ${} in the sample below) should be replaced to match values we've previously configured:

  • TEST_DATA_FILE_NAME: The filename of the test data that is available in the S3 (or other) provider we created earlier.
  • TEST_DATA_URI: The full S3 path to the test data (e.g. s3://bucket-name/path/granule)
  • COLLECTION: The collection name defined in the prerequisites for this product
{
"product": {
"files": [
{
"checksumType": "md5",
"name": "${TEST_DATA_FILE_NAME}",
"checksum": "bogus_checksum_value",
"uri": "${TEST_DATA_URI}",
"type": "data",
"size": 12345678
}
],
"name": "${TEST_DATA_FILE_NAME}",
"dataVersion": "006"
},
"identifier ": "testIdentifier123456",
"collection": "${COLLECTION}",
"provider": "TestProvider",
"version": "001",
"submissionTime": "2017-09-30T03:42:29.791198"
}

Add Record to Kinesis Data Stream

Using the JSON file you created, push it to the Kinesis notification stream:

aws kinesis put-record --stream-name YOUR_KINESIS_NOTIFICATION_STREAM_NAME_HERE --partition-key 1 --data file:///path/to/file.json

Please note: The above command uses the stream name, not the ARN.

The command should return output similar to:

{
"ShardId": "shardId-000000000000",
"SequenceNumber": "42356659532578640215890215117033555573986830588739321858"
}

This command will put a record containing the JSON from the --data flag onto the Kinesis data stream. The messageConsumer Lambda will consume the record and construct a valid CMA payload to trigger workflows. For this example, the record will trigger the CNMExampleWorkflow workflow as defined by the rule previously configured.
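If you prefer to script this step (for example, to send several test records), a minimal boto3 sketch of the same put-record call might look like the following; the stream name and file path are placeholders:

# Sketch of the same operation with boto3; stream name and file path are placeholders.
import json
import boto3

kinesis = boto3.client("kinesis")

with open("/path/to/file.json") as f:
    cnm_message = json.load(f)

kinesis.put_record(
    StreamName="YOUR_KINESIS_NOTIFICATION_STREAM_NAME_HERE",  # name, not ARN
    Data=json.dumps(cnm_message).encode("utf-8"),
    PartitionKey="1",
)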

You can view the current running executions on the Executions dashboard page which presents a list of all executions, their status (running, failed, or completed), to which workflow the execution belongs, along with other information.

Verify Workflow Execution

As detailed above, once the record is added to the Kinesis data stream, the messageConsumer Lambda will trigger the CNMExampleWorkflow.

TranslateMessage

TranslateMessage (which corresponds to the CNMToCMA Lambda) will take the CNM object payload, add a granules object to the CMA payload that is consistent with other Cumulus ingest tasks, and store the original message under a meta.cnm key (as well as in the payload).

For more on the Message Adapter, please see the Message Flow documentation.

An example of what is happening in the CNMToCMA Lambda is as follows:

Example Input Payload:

"payload": {
"identifier ": "testIdentifier123456",
"product": {
"files": [
{
"checksumType": "md5",
"name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
"checksum": "bogus_checksum_value",
"uri": "s3://some_bucket/cumulus-test-data/pdrs/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
"type": "data",
"size": 12345678
}
],
"name": "TestGranuleUR",
"dataVersion": "006"
},
"version": "123456",
"collection": "MOD09GQ",
"provider": "TestProvider",
"submissionTime": "2017-09-30T03:42:29.791198"
}

Example Output Payload:

  "payload": {
"cnm": {
"identifier ": "testIdentifier123456",
"product": {
"files": [
{
"checksumType": "md5",
"name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
"checksum": "bogus_checksum_value",
"uri": "s3://some-bucket/cumulus-test-data/data/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
"type": "data",
"size": 12345678
}
],
"name": "TestGranuleUR",
"dataVersion": "006"
},
"version": "123456",
"collection": "MOD09GQ",
"provider": "TestProvider",
"submissionTime": "2017-09-30T03:42:29.791198",
"receivedTime": "2017-09-30T03:42:31.634552"
},
"output": {
"granules": [
{
"granuleId": "TestGranuleUR",
"files": [
{
"path": "some-bucket/data",
"url_path": "s3://some-bucket/cumulus-test-data/data/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
"bucket": "some-bucket",
"name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
"size": 12345678
}
]
}
]
}
}

SyncGranule

This Lambda will take the files listed in the payload and move them to s3://{deployment-private-bucket}/file-staging/{deployment-name}/{COLLECTION}/{file_name}.
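For example, assuming a deployment named my-cumulus with a private bucket my-cumulus-private, the test granule above would be staged at a key such as s3://my-cumulus-private/file-staging/my-cumulus/MOD09GQ/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf (the exact collection path segment depends on your collection configuration).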

CnmResponse

Assuming a successful execution of the workflow, this task will recover the meta.cnm key from the CMA output and add a "SUCCESS" record to the response-endpoint Kinesis stream.

If a prior step in the workflow has failed, this will add a "FAILURE" record to the stream instead.

The data written to the response-endpoint should adhere to the Response Message Fields schema.

Example CNM Success Response:

{
"provider": "PODAAC_SWOT",
"collection": "SWOT_Prod_l2:1",
"processCompleteTime": "2017-09-30T03:45:29.791198",
"submissionTime": "2017-09-30T03:42:29.791198",
"receivedTime": "2017-09-30T03:42:31.634552",
"identifier": "1234-abcd-efg0-9876",
"response": {
"status": "SUCCESS"
}
}

Example CNM Error Response:

{
"provider": "PODAAC_SWOT",
"collection": "SWOT_Prod_l2:1",
"processCompleteTime": "2017-09-30T03:45:29.791198",
"submissionTime": "2017-09-30T03:42:29.791198",
"receivedTime": "2017-09-30T03:42:31.634552",
"identifier": "1234-abcd-efg0-9876",
"response": {
"status": "FAILURE",
"errorCode": "PROCESSING_ERROR",
"errorMessage": "File [cumulus-dev-a4d38f59-5e57-590c-a2be-58640db02d91/prod_20170926T11:30:36/production_file.nc] did not match gve checksum value."
}
}

Note that the CnmResponse state defined in the .tf workflow definition above configures $.exception to be passed to the CnmResponse Lambda, keyed under config.WorkflowException. This is required for the CnmResponse code to deliver a failure response.

To test the failure scenario, send a record missing the product.name key.


Verify results

Check for successful execution on the dashboard

Following the successful execution of this workflow, you should expect to see the workflow complete successfully on the dashboard:

Screenshot of a successful CNM workflow appearing on the executions page of the Cumulus dashboard

Check the test granule has been delivered to S3 staging

The test granule identified in the Kinesis record should be moved to the deployment's private staging area.

Check for Kinesis records

A SUCCESS notification should be present on the response-endpoint Kinesis stream.

You should be able to validate that the notification and response streams have the expected records with the following steps (the AWS CLI Kinesis Basic Stream Operations documentation is useful to review before proceeding):

Get a shard iterator (substituting your stream name as appropriate):

aws kinesis get-shard-iterator \
--shard-id shardId-000000000000 \
--shard-iterator-type LATEST \
--stream-name NOTIFICATION_OR_RESPONSE_STREAM_NAME

which should return output similar to:

{
"ShardIterator": "VeryLongString=="
}
  • Re-trigger the workflow by using the put-record command from the Add Record to Kinesis Data Stream section above.
  • As the workflow completes, use the output from the get-shard-iterator command to request data from the stream:
aws kinesis get-records --shard-iterator SHARD_ITERATOR_VALUE

This should result in output similar to:

{
"Records": [
{
"SequenceNumber": "49586720336541656798369548102057798835250389930873978882",
"ApproximateArrivalTimestamp": 1532664689.128,
"Data": "eyJpZGVudGlmaWVyICI6InRlc3RJZGVudGlmaWVyMTIzNDU2IiwidmVyc2lvbiI6IjAwNiIsImNvbGxlY3Rpb24iOiJNT0QwOUdRIiwicHJvdmlkZXIiOiJUZXN0UHJvdmlkZXIiLCJwcm9kdWN0U2l6ZSI6MTkwODYzNS4wLCJyZXNwb25zZSI6eyJzdGF0dXMiOiJTVUNDRVNTIn0sInByb2Nlc3NDb21wbGV0ZVRpbWUiOiIyMDE4LTA3LTI3VDA0OjExOjI4LjkxOSJ9",
"PartitionKey": "1"
},
{
"SequenceNumber": "49586720336541656798369548102059007761070005796999266306",
"ApproximateArrivalTimestamp": 1532664707.149,
"Data": "eyJpZGVudGlmaWVyICI6InRlc3RJZGVudGlmaWVyMTIzNDU2IiwidmVyc2lvbiI6IjAwNiIsImNvbGxlY3Rpb24iOiJNT0QwOUdRIiwicHJvdmlkZXIiOiJUZXN0UHJvdmlkZXIiLCJwcm9kdWN0U2l6ZSI6MTkwODYzNS4wLCJyZXNwb25zZSI6eyJzdGF0dXMiOiJTVUNDRVNTIn0sInByb2Nlc3NDb21wbGV0ZVRpbWUiOiIyMDE4LTA3LTI3VDA0OjExOjQ2Ljk1OCJ9",
"PartitionKey": "1"
}
],
"NextShardIterator": "AAAAAAAAAAFo9SkF8RzVYIEmIsTN+1PYuyRRdlj4Gmy3dBzsLEBxLo4OU+2Xj1AFYr8DVBodtAiXbs3KD7tGkOFsilD9R5tA+5w9SkGJZ+DRRXWWCywh+yDPVE0KtzeI0andAXDh9yTvs7fLfHH6R4MN9Gutb82k3lD8ugFUCeBVo0xwJULVqFZEFh3KXWruo6KOG79cz2EF7vFApx+skanQPveIMz/80V72KQvb6XNmg6WBhdjqAA==",
"MillisBehindLatest": 0
}

Note that the data encoding is not human readable and would need to be parsed/converted to be interpretable. There are many options for building a Kinesis consumer, such as the KCL.
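For a quick look at the records returned above, the base64-encoded Data field can be decoded and parsed as JSON. A minimal sketch (assuming the get-records output is piped in on stdin):

# Minimal sketch: decode the base64-encoded Data fields from `aws kinesis get-records`
# output piped in on stdin, e.g.:
#   aws kinesis get-records --shard-iterator SHARD_ITERATOR_VALUE | python decode_records.py
import base64
import json
import sys

get_records_output = json.load(sys.stdin)
for record in get_records_output["Records"]:
    cnm_response = json.loads(base64.b64decode(record["Data"]))
    print(cnm_response["response"]["status"], cnm_response.get("collection"))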

For the purposes of validating the workflow, it may be simpler to locate the workflow execution in the Step Functions Management Console and assert that the expected output is similar to the examples below.

Successful CNM Response Object Example:

{
"cnmResponse": {
"provider": "TestProvider",
"collection": "MOD09GQ",
"version": "123456",
"processCompleteTime": "2017-09-30T03:45:29.791198",
"submissionTime": "2017-09-30T03:42:29.791198",
"receivedTime": "2017-09-30T03:42:31.634552",
"identifier ": "testIdentifier123456",
"response": {
"status": "SUCCESS"
}
}
}

Kinesis Record Error Handling

messageConsumer

The default Kinesis stream processing in the Cumulus system is configured for record error tolerance.

When the messageConsumer fails to process a record, the failure is captured and the record is published to the kinesisFallback SNS topic. The kinesisFallback SNS topic broadcasts the record, and a subscribed copy of the messageConsumer Lambda named kinesisFallback consumes these failures.

At this point, the normal Lambda asynchronous invocation retry behavior will attempt to process the record 3 more times. After this, if the record still cannot be processed successfully, it is written to a dead letter queue. Cumulus' dead letter queue is an SQS queue named kinesisFailure. Operators can use this queue to inspect failed records.

This system ensures that when the messageConsumer fails to process a record and trigger a workflow, the record is retried 3 times. This retry behavior improves system reliability in case of any external service failure outside of Cumulus' control.

The Kinesis error handling system - the kinesisFallback SNS topic, messageConsumer Lambda, and kinesisFailure SQS queue - comes with the API package and does not need to be configured by the operator.

To examine records that could not be processed at any step, look at the dead letter queue {{prefix}}-kinesisFailure in the Simple Queue Service (SQS) console. Select your queue, and under the Queue Actions tab, choose View/Delete Messages. Start polling for messages and you will see records that failed to process through the messageConsumer.
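If you prefer to poll the dead letter queue from a script rather than the console, a minimal boto3 sketch might look like the following; the queue name assumes a deployment prefix of my-cumulus:

# Sketch: read (without deleting) messages from the kinesisFailure dead letter queue.
# The queue name assumes a "my-cumulus" prefix; adjust to your own {{prefix}}.
import boto3

sqs = boto3.client("sqs")
queue_url = sqs.get_queue_url(QueueName="my-cumulus-kinesisFailure")["QueueUrl"]

response = sqs.receive_message(
    QueueUrl=queue_url,
    MaxNumberOfMessages=10,
    WaitTimeSeconds=10,
)
for message in response.get("Messages", []):
    print(message["Body"])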

Note that these are only failures that occurred while processing records from Kinesis streams. Workflow failures are handled differently.

Kinesis Stream logging

Notification Stream messages

Cumulus includes two Lambdas (KinesisInboundEventLogger and KinesisOutboundEventLogger) that utilize the same code to take a Kinesis record event as input, deserialize the data field and output the modified event to the logs.

When a kinesis rule is created, in addition to the messageConsumer event mapping, an event mapping is created to trigger KinesisInboundEventLogger to record a log of the inbound record, to allow for analysis in case of unexpected failure.

Response Stream messages

Cumulus also supports this feature for all outbound messages. To take advantage of this feature, you will need to set an event mapping on the KinesisOutboundEventLogger Lambda that targets your response-endpoint. You can do this in the Lambda management page for KinesisOutboundEventLogger. Add a Kinesis trigger, and configure it to target the cnmResponseStream for your workflow:

Screenshot of the AWS console showing configuration for Kinesis stream trigger on KinesisOutboundEventLogger Lambda

Once this is done, all records sent to the response-endpoint will also be logged in CloudWatch. For more on configuring Lambdas to trigger on Kinesis events, please see creating an event source mapping.
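If you would rather manage this trigger outside of the console, the same event source mapping can be created programmatically. A minimal boto3 sketch follows; the function name, region, account ID, and stream name are placeholders for your deployment's values:

# Sketch: create the Kinesis trigger for KinesisOutboundEventLogger programmatically.
# Function name, region, account ID, and stream name are placeholders.
import boto3

lambda_client = boto3.client("lambda")
lambda_client.create_event_source_mapping(
    EventSourceArn="arn:aws:kinesis:us-east-1:123456789012:stream/my-cumulus-cnmResponseStream",
    FunctionName="my-cumulus-KinesisOutboundEventLogger",
    StartingPosition="LATEST",
    BatchSize=10,
)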

Version: v15.0.2

Error Handling in Workflows

See the AWS documentation on configuring your workflow to handle transient Lambda errors.

Example state machine definition:

{
"Comment": "Tests Workflow from Kinesis Stream",
"StartAt": "TranslateMessage",
"States": {
"TranslateMessage": {
"Parameters": {
"cma": {
"event.$": "$",
"task_config": {
"cumulus_message": {
"outputs": [
{
"source": "{$.cnm}",
"destination": "{$.meta.cnm}"
},
{
"source": "{$}",
"destination": "{$.payload}"
}
]
}
}
}
},
"Type": "Task",
"Resource": "${aws_lambda_function.cnm_to_cma_task.arn}",
"Retry": [
{
"ErrorEquals": [
"Lambda.ServiceException",
"Lambda.AWSLambdaException",
"Lambda.SdkClientException"
],
"IntervalSeconds": 2,
"MaxAttempts": 6,
"BackoffRate": 2
}
],
"Catch": [
{
"ErrorEquals": ["States.ALL"],
"ResultPath": "$.exception",
"Next": "CnmResponseFail"
}
],
"Next": "SyncGranule"
},
"SyncGranule": {
"Parameters": {
"cma": {
"event.$": "$",
"ReplaceConfig": {
"Path": "$.payload",
"TargetPath": "$.payload"
},
"task_config": {
"provider": "{$.meta.provider}",
"buckets": "{$.meta.buckets}",
"collection": "{$.meta.collection}",
"downloadBucket": "{$.meta.buckets.private.name}",
"stack": "{$.meta.stack}",
"cumulus_message": {
"outputs": [
{
"source": "{$.granules}",
"destination": "{$.meta.input_granules}"
},
{
"source": "{$}",
"destination": "{$.payload}"
}
]
}
}
}
},
"Type": "Task",
"Resource": "${module.cumulus.sync_granule_task.task_arn}",
"Retry": [
{
"ErrorEquals": ["States.ALL"],
"IntervalSeconds": 10,
"MaxAttempts": 3
}
],
"Catch": [
{
"ErrorEquals": ["States.ALL"],
"ResultPath": "$.exception",
"Next": "CnmResponseFail"
}
],
"Next": "CnmResponse"
},
"CnmResponse": {
"Parameters": {
"cma": {
"event.$": "$",
"task_config": {
"OriginalCNM": "{$.meta.cnm}",
"CNMResponseStream": "{$.meta.cnmResponseStream}",
"region": "us-east-1",
"WorkflowException": "{$.exception}",
"cumulus_message": {
"outputs": [
{
"source": "{$}",
"destination": "{$.meta.cnmResponse}"
},
{
"source": "{$}",
"destination": "{$.payload}"
}
]
}
}
}
},
"Type": "Task",
"Resource": "${aws_lambda_function.cnm_response_task.arn}",
"Retry": [
{
"ErrorEquals": [
"Lambda.ServiceException",
"Lambda.AWSLambdaException",
"Lambda.SdkClientException"
],
"IntervalSeconds": 2,
"MaxAttempts": 6,
"BackoffRate": 2
}
],
"Catch": [
{
"ErrorEquals": ["States.ALL"],
"ResultPath": "$.exception",
"Next": "WorkflowFailed"
}
],
"Next": "WorkflowSucceeded"
},
"CnmResponseFail": {
"Parameters": {
"cma": {
"event.$": "$",
"task_config": {
"OriginalCNM": "{$.meta.cnm}",
"CNMResponseStream": "{$.meta.cnmResponseStream}",
"region": "us-east-1",
"WorkflowException": "{$.exception}",
"cumulus_message": {
"outputs": [
{
"source": "{$}",
"destination": "{$.meta.cnmResponse}"
},
{
"source": "{$}",
"destination": "{$.payload}"
}
]
}
}
}
},
"Type": "Task",
"Resource": "${aws_lambda_function.cnm_response_task.arn}",
"Retry": [
{
"ErrorEquals": [
"Lambda.ServiceException",
"Lambda.AWSLambdaException",
"Lambda.SdkClientException"
],
"IntervalSeconds": 2,
"MaxAttempts": 6,
"BackoffRate": 2
}
],
"Catch": [
{
"ErrorEquals": ["States.ALL"],
"ResultPath": "$.exception",
"Next": "WorkflowFailed"
}
],
"Next": "WorkflowFailed"
},
"WorkflowSucceeded": {
"Type": "Succeed"
},
"WorkflowFailed": {
"Type": "Fail",
"Cause": "Workflow failed"
}
}
}

The above results in a workflow which is visualized in the diagram below:

Screenshot of a visualization of an AWS Step Function workflow definition with branching logic for failures

Summary

Error handling should (mostly) be the domain of workflow configuration.

Version: v15.0.2

HelloWorld Workflow

Example task meant to be a sanity check/introduction to the Cumulus workflows.

Pre-Deployment Configuration

Workflow Configuration

A workflow definition can be found in the template repository hello_world_workflow module.

{
"Comment": "Returns Hello World",
"StartAt": "HelloWorld",
"States": {
"HelloWorld": {
"Parameters": {
"cma": {
"event.$": "$",
"task_config": {
"buckets": "{$.meta.buckets}",
"provider": "{$.meta.provider}",
"collection": "{$.meta.collection}"
}
}
},
"Type": "Task",
"Resource": "${module.cumulus.hello_world_task.task_arn}",
"Retry": [
{
"ErrorEquals": [
"Lambda.ServiceException",
"Lambda.AWSLambdaException",
"Lambda.SdkClientException"
],
"IntervalSeconds": 2,
"MaxAttempts": 6,
"BackoffRate": 2
}
],
"End": true
}
}
}

Workflow error-handling can be configured as discussed in the Error-Handling cookbook.

Task Configuration

The HelloWorld task is provided for you as part of the cumulus terraform module; no configuration is needed.

If you want to manually deploy your own version of this Lambda for testing, you can copy the Lambda resource definition located in the Cumulus source code at cumulus/tf-modules/ingest/hello-world-task.tf. The Lambda source code is located in the Cumulus source code at 'cumulus/tasks/hello-world'.

Execution

We will focus on using the Cumulus dashboard to schedule the execution of a HelloWorld workflow.

Our goal here is to create a rule through the Cumulus dashboard that will define the scheduling and execution of our HelloWorld workflow. Let's navigate to the Rules page and click Add a rule.

{
"collection": { # collection values can be configured and found on the Collections page
"name": "${collection_name}",
"version": "${collection_version}"
},
"name": "helloworld_rule",
"provider": "${provider}", # found on the Providers page
"rule": {
"type": "onetime"
},
"state": "ENABLED",
"workflow": "HelloWorldWorkflow" # This can be found on the Workflows page
}

Screenshot of AWS Step Function execution graph for the HelloWorld workflow (executed workflow as seen in the AWS Console)

Output/Results

The Executions page presents a list of all executions, their status (running, failed, or completed), to which workflow the execution belongs, along with other information. The rule defined in the previous section should start an execution of its own accord, and the status of that execution can be tracked here.

To get some deeper information on the execution, click on the value in the Name column of your execution of interest. This should bring up a visual representation of the workflow similar to that shown above, execution details, and a list of events.

Summary

Setting up the HelloWorld workflow on the Cumulus dashboard is the tip of the iceberg, so to speak. The task and step-function need to be configured before Cumulus deployment. A compatible collection and provider must be configured and applied to the rule. Finally, workflow execution status can be viewed via the workflows tab on the dashboard.

Version: v15.0.2

Ingest Notification in Workflows

On deployment, an SQS queue and three SNS topics (one each for executions, granules, and PDRs) are created and used for handling notification messages related to the workflow.

The ingest notification reporting SQS queue is populated via a Cloudwatch rule for any Step Function execution state transitions. The sfEventSqsToDbRecords Lambda consumes this queue. The queue and Lambda are included in the cumulus module and the Cloudwatch rule in the workflow module and are included by default in a Cumulus deployment.

The sfEventSqsToDbRecords Lambda function reads from the sfEventSqsToDbRecordsInputQueue queue and updates the RDS database records for granules, executions, and PDRs. When the records are updated, messages are posted to the three SNS topics. This Lambda is invoked both when the workflow starts and when it reaches a terminal state (completion or failure).

Diagram of architecture for reporting workflow ingest notifications from AWS Step Functions

Sending SQS messages to report status

Publishing granule/PDR reports directly to the SQS queue

If you have a non-Cumulus workflow or process ingesting data and would like to update the status of your granules or PDRs, you can publish directly to the reporting SQS queue. Publishing messages to this queue will result in those messages being stored as granule/PDR records in the Cumulus database and the status of those granules/PDRs being visible on the Cumulus dashboard. The queue does have certain expectations of the message format: it expects a Cumulus Message nested within a CloudWatch Step Function Event object.

Posting directly to the queue will require knowing the queue URL. Assuming that you are using the cumulus module for your deployment, you can get the queue URL and topic ARNs by adding outputs to outputs.tf for your Terraform deployment as in our example deployment:

output "stepfunction_event_reporter_queue_url" {
value = module.cumulus.stepfunction_event_reporter_queue_url
}

output "report_executions_sns_topic_arn" {
value = module.cumulus.report_executions_sns_topic_arn
}
output "report_granules_sns_topic_arn" {
value = module.cumulus.report_granules_sns_topic_arn
}
output "report_pdrs_sns_topic_arn" {
value = module.cumulus.report_pdrs_sns_topic_arn
}

Then, when you run terraform apply, you should see the queue URL and topic ARNs printed to your console:

Outputs:
...
stepfunction_event_reporter_queue_url = https://sqs.us-east-1.amazonaws.com/xxxxxxxxx/<prefix>-sfEventSqsToDbRecordsInputQueue
report_executions_sns_topic_arn = arn:aws:sns:us-east-1:xxxxxxxxx:<prefix>-report-executions-topic
report_granules_sns_topic_arn = arn:aws:sns:us-east-1:xxxxxxxxx:<prefix>-report-granules-topic
report_pdrs_sns_topic_arn = arn:aws:sns:us-east-1:xxxxxxxxx:<prefix>-report-pdrs-topic

Once you have the queue URL, you can use the AWS SDK for your language of choice to publish messages to the queue. The expected format of these messages is that of a CloudWatch Step Function event containing a Cumulus message. For SUCCEEDED events, the Cumulus message is expected to be in detail.output. For all other event statuses, a Cumulus message is expected in detail.input. The Cumulus message populating these fields MUST be a JSON string, not an object. Messages that do not conform to the schemas will fail to be created as records.
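As a rough Python sketch of what that looks like (the event below includes only the fields discussed here, the queue URL is a placeholder, and your Cumulus message must still conform to the published schemas):

# Sketch: post a status update for a completed workflow to the reporting SQS queue.
# The queue URL is a placeholder for the stepfunction_event_reporter_queue_url output,
# and cumulus_message stands in for a full, schema-conformant Cumulus message.
import json
import boto3

sqs = boto3.client("sqs")
queue_url = "https://sqs.us-east-1.amazonaws.com/123456789012/my-cumulus-sfEventSqsToDbRecordsInputQueue"

cumulus_message = {
    # ... full Cumulus message for your granule/PDR goes here ...
}

event = {
    "detail": {
        "status": "SUCCEEDED",
        # The Cumulus message MUST be a JSON string, not an object.
        "output": json.dumps(cumulus_message),
    }
}

sqs.send_message(QueueUrl=queue_url, MessageBody=json.dumps(event))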

If you are not seeing records persist to the database or show up in the Cumulus dashboard, you can investigate the Cloudwatch logs of the SQS consumer Lambda:

  • /aws/lambda/<prefix>-sfEventSqsToDbRecords

In a workflow

As described above, ingest notifications will automatically be published to the SNS topics on workflow start and completion/failure, so you should not include a workflow step to publish the initial or final status of your workflows.

However, if you want to report your ingest status at any point during a workflow execution, you can add a workflow step using the SfSqsReport Lambda. In the following example from cumulus-tf/parse_pdr_workflow.tf, the ParsePdr workflow is configured to use the SfSqsReport Lambda, primarily to update the PDR ingestion status.

Note: ${sf_sqs_report_task_arn} is an interpolated value referring to a Terraform resource. See the example deployment code for the ParsePdr workflow.

  "PdrStatusReport": {
"Parameters": {
"cma": {
"event.$": "$",
"ReplaceConfig": {
"FullMessage": true
},
"task_config": {
"cumulus_message": {
"input": "{$}"
}
}
}
},
"ResultPath": null,
"Type": "Task",
"Resource": "${sf_sqs_report_task_arn}",
"Retry": [
{
"ErrorEquals": [
"Lambda.ServiceException",
"Lambda.AWSLambdaException",
"Lambda.SdkClientException"
],
"IntervalSeconds": 2,
"MaxAttempts": 6,
"BackoffRate": 2
}
],
"Catch": [
{
"ErrorEquals": [
"States.ALL"
],
"ResultPath": "$.exception",
"Next": "WorkflowFailed"
}
],
"Next": "WaitForSomeTime"
},

Subscribing additional listeners to SNS topics

Additional listeners to SNS topics can be configured in a .tf file for your Cumulus deployment. Shown below is configuration that subscribes an additional Lambda function (test_lambda) to receive messages from the report_executions SNS topic. To subscribe to the report_granules or report_pdrs SNS topics instead, simply replace report_executions in the code block below with either of those values.

resource "aws_lambda_function" "test_lambda" {
function_name = "${var.prefix}-testLambda"
filename = "./testLambda.zip"
source_code_hash = filebase64sha256("./testLambda.zip")
handler = "index.handler"
role = module.cumulus.lambda_processing_role_arn
runtime = "nodejs10.x"
}

resource "aws_sns_topic_subscription" "test_lambda" {
topic_arn = module.cumulus.report_executions_sns_topic_arn
protocol = "lambda"
endpoint = aws_lambda_function.test_lambda.arn
}

resource "aws_lambda_permission" "test_lambda" {
action = "lambda:InvokeFunction"
function_name = aws_lambda_function.test_lambda.arn
principal = "sns.amazonaws.com"
source_arn = module.cumulus.report_executions_sns_topic_arn
}

SNS message format

Subscribers to the SNS topics can expect to find the published message in the SNS event at Records[0].Sns.Message. The message will be a JSON stringified version of the ingest notification record for an execution or a PDR. For granules, the message will be a JSON stringified object with ingest notification record in the record property and the event type as the event property.

The ingest notification record of the execution, granule, or PDR should conform to the data model schema for the given record type.
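A minimal Python sketch of a subscribed Lambda handler that pulls the record out of the SNS event, under the format described above (this is an illustration, not part of Cumulus itself):

# Sketch of a Lambda handler subscribed to one of the report SNS topics.
import json

def handler(event, context):
    # The published message is a JSON string at Records[0].Sns.Message.
    message = json.loads(event["Records"][0]["Sns"]["Message"])

    # Granule notifications wrap the record and an event type; execution and PDR
    # notifications are the record itself.
    record = message.get("record", message)
    event_type = message.get("event")

    print(f"event type: {event_type}, record status: {record.get('status')}")
    return record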

Summary

Workflows can be configured to send SQS messages at any point using the sf-sqs-report task.

Additional listeners can be easily configured to trigger when messages are sent to the SNS topics.

Version: v15.0.2

Queue PostToCmr

In this document, we walk through handling CMR errors in workflows by queueing PostToCmr. We assume that the user already has an ingest workflow set up.

Overview

The general concept is that the last task of the ingest workflow will be QueueWorkflow, which queues the publish workflow. The publish workflow contains the PostToCmr task and if a CMR error occurs during PostToCmr, the publish workflow will add itself back onto the queue so that it can be executed when CMR is back online. This is achieved by leveraging the QueueWorkflow task again in the publish workflow. The following diagram demonstrates this queueing process.

Diagram of workflow queueing

Ingest Workflow

The last step should be the QueuePublishWorkflow step. It should be configured with a queueUrl and workflow. In this case, the queueUrl points to a throttled queue. Any queueUrl can be specified here, which is useful if you would like to use a lower priority queue. The workflow is the unprefixed workflow name that you would like to queue (e.g. PublishWorkflow).

  "QueuePublishWorkflowStep": {
"Parameters": {
"cma": {
"event.$": "$",
"ReplaceConfig": {
"FullMessage": true
},
"task_config": {
"internalBucket": "{$.meta.buckets.internal.name}",
"stackName": "{$.meta.stack}",
"workflow": "{$.meta.workflow}",
"queueUrl": "${start_sf_queue_url}",
"provider": "{$.meta.provider}",
"collection": "{$.meta.collection}"
}
}
},
"Type": "Task",
"Resource": "${queue_workflow_task_arn}",
"Retry": [
{
"ErrorEquals": [
"Lambda.ServiceException",
"Lambda.AWSLambdaException",
"Lambda.SdkClientException"
],
"IntervalSeconds": 2,
"MaxAttempts": 6,
"BackoffRate": 2
}
],
"Catch": [
{
"ErrorEquals": [
"States.ALL"
],
"ResultPath": "$.exception",
"Next": "WorkflowFailed"
}
],
"End": true
},

Publish Workflow

Configure the Catch section of your PostToCmr task to proceed to QueueWorkflow if a CMRInternalError is caught. Any other error will cause the workflow to fail.

  "Catch": [
{
"ErrorEquals": [
"CMRInternalError"
],
"Next": "RequeueWorkflow"
},
{
"ErrorEquals": [
"States.ALL"
],
"Next": "WorkflowFailed",
"ResultPath": "$.exception"
}
],

Then, configure the QueueWorkflow task similarly to its configuration in the ingest workflow. This time, pass the current publish workflow to the task config. This allows for the publish workflow to be requeued when there is a CMR error.

{
"RequeueWorkflow": {
"Parameters": {
"cma": {
"event.$": "$",
"task_config": {
"buckets": "{$.meta.buckets}",
"distribution_endpoint": "{$.meta.distribution_endpoint}",
"workflow": "PublishGranuleQueue",
"queueUrl": "${start_sf_queue_url}",
"provider": "{$.meta.provider}",
"collection": "{$.meta.collection}"
}
}
},
"Type": "Task",
"Resource": "${queue_workflow_task_arn}",
"Catch": [
{
"ErrorEquals": [
"States.ALL"
],
"Next": "WorkflowFailed",
"ResultPath": "$.exception"
}
],
"Retry": [
{
"ErrorEquals": [
"Lambda.ServiceException",
"Lambda.AWSLambdaException",
"Lambda.SdkClientException"
],
"IntervalSeconds": 2,
"MaxAttempts": 6,
"BackoffRate": 2
}
],
"End": true
}
}
Version: v15.0.2

Run Step Function Tasks in AWS Lambda or Docker

Overview

AWS Step Function Tasks can run tasks on AWS Lambda or on AWS Elastic Container Service (ECS) as a Docker container.

Lambda provides a serverless architecture and is the best option for minimizing cost and server management. ECS provides the fullest extent of AWS EC2 resources via the flexibility to execute arbitrary code on any AWS EC2 instance type.

When to use Lambda

You should use AWS Lambda whenever all of the following are true:

  • The task runs on one of the supported Lambda runtimes. At the time of this writing, supported runtimes include versions of Python, Java, Ruby, Node.js, Go, and .NET.
  • The lambda package is less than 50 MB in size, zipped.
  • The task consumes less than each of the following resources:
    • 3008 MB memory allocation
    • 512 MB disk storage (must be written to /tmp)
    • 15 minutes of execution time

See this page for a complete and up-to-date list of AWS Lambda limits.

If your task requires more than any of these resources or an unsupported runtime, creating a Docker image which can be run on ECS is the way to go. Cumulus supports running any lambda package (and its configured layers) as a Docker container with cumulus-ecs-task.

Step Function Activities and cumulus-ecs-task

Step Function Activities enable a state machine task to "publish" an activity task which can be picked up by any activity worker. Activity workers can run pretty much anywhere, but Cumulus workflows support the cumulus-ecs-task activity worker. The cumulus-ecs-task worker runs as a Docker container on the Cumulus ECS cluster.

The cumulus-ecs-task container takes an AWS Lambda Amazon Resource Name (ARN) as an argument (see --lambdaArn in the example below). This ARN argument is defined at deployment time. The cumulus-ecs-task worker polls for new Step Function Activity Tasks. When a Step Function executes, the worker (container) picks up the activity task and runs the code contained in the lambda package defined on deployment.

Example: Replacing AWS Lambda with a Docker container run on ECS

This example will use an already-defined workflow from the cumulus module that includes the QueueGranules task in its configuration.

The following example is an excerpt from the Discover Granules workflow containing the step definition for the QueueGranules step:

Note: ${ingest_granule_workflow_name} and ${queue_granules_task_arn} are interpolated values that refer to Terraform resources. See the example deployment code for the Discover Granules workflow.

  "QueueGranules": {
"Parameters": {
"cma": {
"event.$": "$",
"ReplaceConfig": {
"FullMessage": true
},
"task_config": {
"provider": "{$.meta.provider}",
"internalBucket": "{$.meta.buckets.internal.name}",
"stackName": "{$.meta.stack}",
"granuleIngestWorkflow": "${ingest_granule_workflow_name}",
"queueUrl": "{$.meta.queues.startSF}"
}
}
},
"Type": "Task",
"Resource": "${queue_granules_task_arn}",
"Retry": [
{
"ErrorEquals": [
"Lambda.ServiceException",
"Lambda.AWSLambdaException",
"Lambda.SdkClientException"
],
"IntervalSeconds": 2,
"MaxAttempts": 6,
"BackoffRate": 2
}
],
"Catch": [
{
"ErrorEquals": [
"States.ALL"
],
"ResultPath": "$.exception",
"Next": "WorkflowFailed"
}
],
"End": true
},

If you discover that this task can no longer run in AWS Lambda, you can instead run it on the Cumulus ECS cluster by adding the following resources to your Terraform deployment (by either adding a new .tf file or updating an existing one):

  • A aws_sfn_activity resource:
resource "aws_sfn_activity" "queue_granules" {
name = "${var.prefix}-QueueGranules"
}
  • An instance of the cumulus_ecs_service module (found on the Cumulus releases page) configured to provide the QueueGranules task:

module "queue_granules_service" {
source = "https://github.com/nasa/cumulus/releases/download/{version}/terraform-aws-cumulus-ecs-service.zip"

prefix = var.prefix
name = "QueueGranules"

cluster_arn = module.cumulus.ecs_cluster_arn
desired_count = 1
image = "cumuluss/cumulus-ecs-task:1.9.0"

cpu = 400
memory_reservation = 700

environment = {
AWS_DEFAULT_REGION = data.aws_region.current.name
}
command = [
"cumulus-ecs-task",
"--activityArn",
aws_sfn_activity.queue_granules.id,
"--lambdaArn",
module.cumulus.queue_granules_task.task_arn,
"--lastModified",
module.cumulus.queue_granules_task.last_modified_date
]
alarms = {
MemoryUtilizationHigh = {
comparison_operator = "GreaterThanThreshold"
evaluation_periods = 1
metric_name = "MemoryUtilization"
statistic = "SampleCount"
threshold = 75
}
}
}

Please note: If you have updated the code for the Lambda specified by --lambdaArn, you will have to manually restart the tasks in your ECS service before invocation of the Step Function activity will use the updated Lambda code (one way to do this is shown after this list).

  • An updated Discover Granules workflow to utilize the new resource (the Resource key in the QueueGranules step has been updated to the following):

"Resource": "${aws_sfn_activity.queue_granules.id}")`

If you then run this workflow in place of the DiscoverGranules workflow, the QueueGranules step will run as an ECS task instead of a Lambda function.
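As noted above, after updating the Lambda code referenced by --lambdaArn you must restart the ECS tasks so they pick up the new code. One way to do this is to force a new deployment of the service; a minimal boto3 sketch (cluster and service names are placeholders for your deployment's values):

# Sketch: force the ECS service to start fresh tasks so they pick up updated Lambda code.
# Cluster and service names are placeholders; use your deployment's actual values.
import boto3

ecs = boto3.client("ecs")
ecs.update_service(
    cluster="my-cumulus-CumulusECSCluster",
    service="my-cumulus-QueueGranules",
    forceNewDeployment=True,
)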

Final note

Step Function Activities and AWS Lambda are not the only ways to run tasks in an AWS Step Function. Learn more about other service integrations, including direct ECS integration via the AWS Service Integrations page.

Version: v15.0.2

Science Investigator-led Processing Systems (SIPS)

We're just going to create a onetime throw-away rule that will be easy to test with. This rule will kick off the DiscoverAndQueuePdrs workflow, which is the beginning of a Cumulus SIPS workflow:

Screenshot of a Cumulus rule configuration

Note: A list of configured workflows exists under the "Workflows" in the navigation bar on the Cumulus dashboard. Additionally, one can find a list of executions and their respective status in the "Executions" tab in the navigation bar.

DiscoverAndQueuePdrs Workflow

This workflow will discover PDRs and queue them to be processed. Duplicate PDRs will be dealt with according to the configured duplicate handling setting in the collection. The lambdas below are included in the cumulus terraform module for use in your workflows:

  1. DiscoverPdrs - source
  2. QueuePdrs - source

Screenshot of execution graph for discover and queue PDRs workflow in the AWS Step Functions console

An example workflow module configuration can be viewed in the Cumulus source for the discover_and_queue_pdrs_workflow.

Please note: To use this example workflow module as a template for a new workflow in your deployment, the source key for the workflow module would need to point to a release of the cumulus-workflow (terraform-aws-cumulus-workflow.zip) module on our release page, as all of the provided Cumulus workflows are internally self-referential.

ParsePdr Workflow

The ParsePdr workflow will parse a PDR, queue the specified granules (duplicates are handled according to the duplicate handling setting) and periodically check the status of those queued granules. This workflow will not succeed until all the granules included in the PDR are successfully ingested. If one of those fails, the ParsePdr workflow will fail. NOTE that ParsePdr may spin up multiple IngestGranule workflows in parallel, depending on the granules included in the PDR.

The lambdas below are included in the cumulus terraform module for use in your workflows:

  1. ParsePdr - source
  2. QueueGranules - source
  3. CheckStatus - source

Screenshot of execution graph for SIPS Parse PDR workflow in AWS Step Functions console

An example workflow module configuration can be viewed in the Cumulus source for the parse_pdr_workflow.

Please note: To use this example workflow module as a template for a new workflow in your deployment, the source key for the workflow module would need to point to a release of the cumulus-workflow (terraform-aws-cumulus-workflow.zip) module on our release page, as all of the provided Cumulus workflows are internally self-referential.

IngestGranule Workflow

The IngestGranule workflow processes and ingests a granule and posts the granule metadata to CMR.

The lambdas below are included in the cumulus terraform module for use in your workflows:

  1. SyncGranule - source.
  2. CmrStep - source

Additionally this workflow requires a processing step you must provide. The ProcessingStep step in the workflow picture below is an example of a custom processing step.

Note: Using the CmrStep is not required and can be left out of the processing trajectory if desired (for example, in testing situations).

Screenshot of execution graph for SIPS IngestGranule workflow in AWS Step Functions console

An example workflow module configuration can be viewed in the Cumulus source for the ingest_and_publish_granule_workflow.

Please note: To use this example workflow module as a template for a new workflow in your deployment, the source key for the workflow module would need to point to a release of the cumulus-workflow (terraform-aws-cumulus-workflow.zip) module on our release page, as all of the provided Cumulus workflows are internally self-referential.

Summary

In this cookbook we went over setting up a collection, rule, and provider for a SIPS workflow. Once we had the setup completed, we looked over the Cumulus workflows that participate in parsing PDRs, ingesting and processing granules, and updating CMR.

Version: v15.0.2

Throttling queued executions

In this entry, we will walk through how to create an SQS queue for scheduling executions, use it to limit those executions to a maximum concurrency, and configure our Cumulus workflows/rules to use this queue.

We will also review the architecture of this feature and highlight some implementation notes.

Limiting the number of executions that can be running from a given queue is useful for controlling the cloud resource usage of workflows that may be lower priority, such as granule reingestion or reprocessing campaigns. It could also be useful for preventing workflows from exceeding known resource limits, such as a maximum number of open connections to a data provider.

Implementing the queue

Create and deploy the queue

Add a new queue

In a .tf file for your Cumulus deployment, add a new SQS queue:

resource "aws_sqs_queue" "background_job_queue" {
name = "${var.prefix}-backgroundJobQueue"
receive_wait_time_seconds = 20
visibility_timeout_seconds = 60
}

Set maximum executions for the queue

Define the throttled_queues variable for the cumulus module in your Cumulus deployment to specify the maximum concurrent executions for the queue.

module "cumulus" {
# ... other variables

throttled_queues = [{
url = aws_sqs_queue.background_job_queue.id,
execution_limit = 5
}]
}

Setup consumer for the queue

Add the sqs2sfThrottle Lambda as the consumer for the queue and add a Cloudwatch event rule/target to read from the queue on a scheduled basis.

Please note: You must use the sqs2sfThrottle Lambda as the consumer for any queue with a queue execution limit or else the execution throttling will not work correctly. Additionally, please allow at least 60 seconds after creation before using the queue while associated infrastructure and triggers are set up and made ready.

aws_sqs_queue.background_job_queue.id refers to the queue resource defined above.

resource "aws_cloudwatch_event_rule" "background_job_queue_watcher" {
schedule_expression = "rate(1 minute)"
}

resource "aws_cloudwatch_event_target" "background_job_queue_watcher" {
rule = aws_cloudwatch_event_rule.background_job_queue_watcher.name
arn = module.cumulus.sqs2sfThrottle_lambda_function_arn
input = jsonencode({
messageLimit = 500
queueUrl = aws_sqs_queue.background_job_queue.id
timeLimit = 60
})
}

resource "aws_lambda_permission" "background_job_queue_watcher" {
action = "lambda:InvokeFunction"
function_name = module.cumulus.sqs2sfThrottle_lambda_function_arn
principal = "events.amazonaws.com"
source_arn = aws_cloudwatch_event_rule.background_job_queue_watcher.arn
}

Re-deploy your Cumulus application

Follow the instructions to re-deploy your Cumulus application. After you have re-deployed, your workflow template will be updated to include information about the queue (the output below is partial output from an expected workflow template):

{
"cumulus_meta": {
"queueExecutionLimits": {
"<backgroundJobQueue_SQS_URL>": 5
}
}
}

Integrate your queue with workflows and/or rules

Integrate queue with queuing steps in workflows

For any workflows using QueueGranules or QueuePdrs that you want to use your new queue, update the Cumulus configuration of those steps in your workflows.

As seen in this partial configuration for a QueueGranules step, update the queueUrl to reference the new throttled queue:

Note: ${ingest_granule_workflow_name} is an interpolated value referring to a Terraform resource. See the example deployment code for the DiscoverGranules workflow.

{
"QueueGranules": {
"Parameters": {
"cma": {
"event.$": "$",
"ReplaceConfig": {
"FullMessage": true
},
"task_config": {
"queueUrl": "${aws_sqs_queue.background_job_queue.id}",
"provider": "{$.meta.provider}",
"internalBucket": "{$.meta.buckets.internal.name}",
"stackName": "{$.meta.stack}",
"granuleIngestWorkflow": "${ingest_granule_workflow_name}"
}
}
}
}
}

Similarly, for a QueuePdrs step:

Note: ${parse_pdr_workflow_name} is an interpolated value referring to a Terraform resource. See the example deployment code for the DiscoverPdrs workflow.

{
"QueuePdrs": {
"Parameters": {
"cma": {
"event.$": "$",
"ReplaceConfig": {
"FullMessage": true
},
"task_config": {
"queueUrl": "${aws_sqs_queue.background_job_queue.id}",
"provider": "{$.meta.provider}",
"collection": "{$.meta.collection}",
"internalBucket": "{$.meta.buckets.internal.name}",
"stackName": "{$.meta.stack}",
"parsePdrWorkflow": "${parse_pdr_workflow_name}"
}
}
}
}
}

After making these changes, re-deploy your Cumulus application for the execution throttling to take effect on workflow executions queued by these workflows.

Create/update a rule to use your new queue

Create or update a rule definition to include a queueUrl property that refers to your new queue:

{
"name": "s3_provider_rule",
"workflow": "DiscoverAndQueuePdrs",
"provider": "s3_provider",
"collection": {
"name": "MOD09GQ",
"version": "006"
},
"rule": {
"type": "onetime"
},
"state": "ENABLED",
"queueUrl": "<backgroundJobQueue_SQS_URL>" // configure rule to use your queue URL
}

After creating/updating the rule, any subsequent invocations of the rule should respect the maximum number of executions when starting workflows from the queue.

Architecture

Architecture diagram showing how executions started from a queue are throttled to a maximum concurrent limit

Execution throttling based on the queue works by manually keeping a count (semaphore) of how many executions are running for the queue at a time. The key operation that prevents the number of executions from exceeding the maximum for the queue is that before starting new executions, the sqs2sfThrottle Lambda attempts to increment the semaphore and responds as follows:

  • If the increment operation is successful, then the count was not at the maximum and an execution is started
  • If the increment operation fails, then the count was already at the maximum so no execution is started
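Conceptually, the increment operation described above behaves like a conditional counter update. The sketch below is illustrative only; it is not the actual Cumulus implementation, and the table and attribute names are made up. It shows one way such a semaphore could be expressed with a DynamoDB conditional update:

# Illustrative semaphore sketch only; not the actual Cumulus implementation.
# Table and attribute names are hypothetical.
import boto3
from botocore.exceptions import ClientError

dynamodb = boto3.client("dynamodb")

def try_claim_execution_slot(table_name, queue_url, maximum):
    """Return True if an execution slot was claimed, False if the queue is at its limit."""
    try:
        dynamodb.update_item(
            TableName=table_name,
            Key={"key": {"S": queue_url}},
            UpdateExpression="ADD #count :one",
            ConditionExpression="attribute_not_exists(#count) OR #count < :max",
            ExpressionAttributeNames={"#count": "semvalue"},
            ExpressionAttributeValues={":one": {"N": "1"}, ":max": {"N": str(maximum)}},
        )
        return True
    except ClientError as error:
        if error.response["Error"]["Code"] == "ConditionalCheckFailedException":
            return False  # queue already at its execution limit
        raise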

Final notes

Limiting the number of concurrent executions for work scheduled via a queue has several consequences worth noting:

  • The number of executions that are running for a given queue will be limited to the maximum for that queue regardless of which workflow(s) are started.
  • If you use the same queue to schedule executions across multiple workflows/rules, then the limit on the total number of executions running concurrently will be applied to all of the executions scheduled across all of those workflows/rules.
  • If you are scheduling the same workflow both via a queue with a maxExecutions value and a queue without a maxExecutions value, only the executions scheduled via the queue with the maxExecutions value will be limited to the maximum.
Version: v15.0.2

Tracking Ancillary Files

The UMM-G column reflects the RelatedURL's Type derived from the CNM type, whereas the ECHO10 column shows how the CNM type affects the destination element.

CNM Type  | UMM-G RelatedUrl.Type                                           | ECHO10 Location
ancillary | 'VIEW RELATED INFORMATION'                                      | OnlineResource
data      | 'GET DATA' (HTTPS URL) or 'GET DATA VIA DIRECT ACCESS' (S3 URI) | OnlineAccessURL
browse    | 'GET RELATED VISUALIZATION'                                     | AssociatedBrowseImage
linkage   | 'EXTENDED METADATA'                                             | OnlineResource
metadata  | 'EXTENDED METADATA'                                             | OnlineResource
qa        | 'EXTENDED METADATA'                                             | OnlineResource

Common Use Cases

This section briefly documents some common use cases and the recommended configuration for the file. The examples shown here are for the DiscoverGranules use case, which allows configuration at the Cumulus dashboard level. The other two cases covered in the ancillary metadata documentation require configuration at the provider notification level (either CNM message or PDR) and are not covered here.

Configuring browse imagery:

{
"bucket": "public",
"regex": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\_[\\d]{1}.jpg$",
"sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_1.jpg",
"type": "browse"
}

Configuring a documentation entry:

{
"bucket": "protected",
"regex": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\_README.pdf$",
"sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_README.pdf",
"type": "metadata"
}

Configuring other associated files (use types metadata or qa as appropriate):

{
"bucket": "protected",
"regex": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\_QA.txt$",
"sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_QA.txt",
"type": "qa"
}
Version: v15.0.2

API Gateway Logging

Enabling API Gateway Logging

In order to enable distribution API Access and execution logging, configure the TEA deployment by setting log_api_gateway_to_cloudwatch on the thin_egress_app module:

log_api_gateway_to_cloudwatch = true

This enables the distribution API to send its logs to the default CloudWatch location: API-Gateway-Execution-Logs_<RESTAPI_ID>/<STAGE>
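For a quick spot check of those logs, you can pull recent events from that log group. A minimal boto3 sketch follows; the REST API ID ("abcde12345") and stage ("DEV") are placeholders:

# Sketch: fetch recent events from the API Gateway execution log group.
# The REST API ID ("abcde12345") and stage ("DEV") are placeholders.
import boto3

logs = boto3.client("logs")
response = logs.filter_log_events(
    logGroupName="API-Gateway-Execution-Logs_abcde12345/DEV",
    limit=20,
)
for event in response["events"]:
    print(event["timestamp"], event["message"])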

Configure Permissions for API Gateway Logging to CloudWatch

Instructions: Enabling Account Level Logging from API Gateway to CloudWatch

This is a one time operation that must be performed on each AWS account to allow API Gateway to push logs to CloudWatch.

  1. Create a policy document

    The AmazonAPIGatewayPushToCloudWatchLogs managed policy, with an ARN of arn:aws:iam::aws:policy/service-role/AmazonAPIGatewayPushToCloudWatchLogs, has all the required permissions to enable API Gateway logging to CloudWatch. To grant these permissions to your account, first create an IAM role with apigateway.amazonaws.com as its trusted entity.

    Save this snippet as apigateway-policy.json.

    {
    "Version": "2012-10-17",
    "Statement": [
    {
    "Sid": "",
    "Effect": "Allow",
    "Principal": {
    "Service": "apigateway.amazonaws.com"
    },
    "Action": "sts:AssumeRole"
    }
    ]
    }
  2. Create an account role to act as ApiGateway and write to CloudWatchLogs

    NASA users in NGAP: be sure to use your account's permission boundary.

    aws iam create-role \
    --role-name ApiGatewayToCloudWatchLogs \
    [--permissions-boundary <permissionBoundaryArn>] \
    --assume-role-policy-document file://apigateway-policy.json

    Note the ARN of the returned role for the last step.

  3. Attach correct permissions to role

    Next attach the AmazonAPIGatewayPushToCloudWatchLogs policy to the IAM role.

    aws iam attach-role-policy \
    --role-name ApiGatewayToCloudWatchLogs \
    --policy-arn "arn:aws:iam::aws:policy/service-role/AmazonAPIGatewayPushToCloudWatchLogs"
  4. Update Account API Gateway settings with correct permissions

    Finally, set the IAM role ARN on the cloudWatchRoleArn property on your API Gateway Account settings.

    aws apigateway update-account \
    --patch-operations op='replace',path='/cloudwatchRoleArn',value='<ApiGatewayToCloudWatchLogs ARN>'

Configure API Gateway CloudWatch Logs Delivery

For details about configuring the API Gateway CloudWatch Logs delivery, see Configure Cloudwatch Logs Delivery.

- + \ No newline at end of file diff --git a/docs/deployment/choosing_configuring_rds/index.html b/docs/deployment/choosing_configuring_rds/index.html index 7c4a37d8209..918726dad27 100644 --- a/docs/deployment/choosing_configuring_rds/index.html +++ b/docs/deployment/choosing_configuring_rds/index.html @@ -5,7 +5,7 @@ Choosing and Configuration Your RDS Database | Cumulus Documentation - + @@ -36,7 +36,7 @@ using this module to create your RDS cluster, you can configure the autoscaling timeout action, the cluster minimum and maximum capacity, and more as seen in the supported variables for the module.

Unfortunately, Terraform currently doesn't allow specifying the autoscaling timeout itself, so that value will have to be manually configured in the AWS console or CLI.

- + \ No newline at end of file diff --git a/docs/deployment/cloudwatch-logs-delivery/index.html b/docs/deployment/cloudwatch-logs-delivery/index.html index a76adc1cf74..ec677e473a6 100644 --- a/docs/deployment/cloudwatch-logs-delivery/index.html +++ b/docs/deployment/cloudwatch-logs-delivery/index.html @@ -5,13 +5,13 @@ Configure Cloudwatch Logs Delivery | Cumulus Documentation - +
Version: v15.0.2

Configure Cloudwatch Logs Delivery

As an optional configuration step, it is possible to deliver CloudWatch logs to a cross-account shared AWS::Logs::Destination. An operator does this by configuring the cumulus module for your deployment as shown below. The value of the log_destination_arn variable is the ARN of a writeable log destination.

The value can be either an AWS::Logs::Destination or a Kinesis Stream ARN to which your account can write.

log_destination_arn           = arn:aws:[kinesis|logs]:us-east-1:123456789012:[streamName|destination:logDestinationName]

Logs Sent

By default, the following logs will be sent to the destination when one is given.

  • Ingest logs
  • Async Operation logs
  • Thin Egress App API Gateway logs (if configured)

Additional Logs

If additional logs are needed, you can configure additional_log_groups_to_elk with the CloudWatch log groups you want to send to the destination. additional_log_groups_to_elk is a map whose keys are descriptive names and whose values are the CloudWatch log group names.

additional_log_groups_to_elk = {
  "HelloWorldTask" = "/aws/lambda/cumulus-example-HelloWorld"
  "MyCustomTask"   = "my-custom-task-log-group"
}
- + \ No newline at end of file diff --git a/docs/deployment/components/index.html b/docs/deployment/components/index.html index ddd6b6a62dc..0107f9e3380 100644 --- a/docs/deployment/components/index.html +++ b/docs/deployment/components/index.html @@ -5,7 +5,7 @@ Component-based Cumulus Deployment | Cumulus Documentation - + @@ -39,7 +39,7 @@ Terraform at the same time.

With remote state, Terraform writes the state data to a remote data store, which can then be shared between all members of a team.

The recommended approach for handling remote state with Cumulus is to use the S3 backend. This backend stores state in S3 and uses a DynamoDB table for locking.

See the deployment documentation for a walk-through of creating resources for your remote state using an S3 backend.

- + \ No newline at end of file diff --git a/docs/deployment/create_bucket/index.html b/docs/deployment/create_bucket/index.html index 2500c4869c6..88713b56904 100644 --- a/docs/deployment/create_bucket/index.html +++ b/docs/deployment/create_bucket/index.html @@ -5,13 +5,13 @@ Creating an S3 Bucket | Cumulus Documentation - +
Version: v15.0.2

Creating an S3 Bucket

Buckets can be created on the command line with AWS CLI or via the web interface on the AWS console.

When creating a protected bucket (a bucket containing data which will be served through the distribution API), make sure to enable S3 server access logging. See S3 Server Access Logging for more details.
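If you are working from the command line, one way to enable server access logging on a bucket is the s3api put-bucket-logging subcommand. This is only a sketch: the bucket names below are placeholders, and the target (logging) bucket must already exist and permit S3 log delivery.

aws s3api put-bucket-logging \
--bucket <protected-bucket> \
--bucket-logging-status '{"LoggingEnabled": {"TargetBucket": "<logging-bucket>", "TargetPrefix": "<protected-bucket>/"}}'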

Command Line

Using the AWS CLI s3api create-bucket subcommand:

$ aws s3api create-bucket \
--bucket foobar-internal \
--region us-west-2 \
--create-bucket-configuration LocationConstraint=us-west-2
{
"Location": "/foobar-internal"
}

⚠️ Note: The region and create-bucket-configuration arguments are only necessary if you are creating a bucket outside of the us-east-1 region.

Please note security settings and other bucket options can be set via the options listed in the s3api documentation.

Repeat the above step for each bucket to be created.

Web Interface

If you prefer to use the AWS web interface instead of the command line, see AWS "Creating a Bucket" documentation.

- + \ No newline at end of file diff --git a/docs/deployment/cumulus_distribution/index.html b/docs/deployment/cumulus_distribution/index.html index 69ace1bcd3a..6d382883771 100644 --- a/docs/deployment/cumulus_distribution/index.html +++ b/docs/deployment/cumulus_distribution/index.html @@ -5,14 +5,14 @@ Using the Cumulus Distribution API | Cumulus Documentation - +
Version: v15.0.2

Using the Cumulus Distribution API

The Cumulus Distribution API is a set of endpoints that can be used to enable AWS Cognito authentication when downloading data from S3.

Configuring a Cumulus Distribution Deployment

The Cumulus Distribution API is included in the main Cumulus repo. It is available as part of the terraform-aws-cumulus.zip archive in the latest release.

These steps assume you're using the Cumulus Deployment Template but they can also be used for custom deployments.

To configure a deployment to use Cumulus Distribution:

  1. Remove or comment the "Thin Egress App Settings" in the Cumulus Template Deploy and enable the "Cumulus Distribution Settings".
  2. Delete or comment the contents of thin_egress_app.tf and the corresponding Thin Egress App outputs in outputs.tf. These are not necessary for a Cumulus Distribution deployment.
  3. Uncomment the Cumulus Distribution outputs in outputs.tf.
  4. Rename cumulus-template-deploy/cumulus-tf/cumulus_distribution.tf.example to cumulus-template-deploy/cumulus-tf/cumulus_distribution.tf.

Cognito Application and User Credentials

The major prerequisite for using the Cumulus Distribution API is to set up Cognito. If operating within NGAP, this should already be done for you. If operating outside of NGAP, you must set up Cognito yourself, which is beyond the scope of this documentation.

Once Cognito is set up, you must obtain Cognito user credentials in order to download granule files via the Cumulus Distribution API, because any attempt to download such files (that will be, or have been, published to the CMR via your Cumulus deployment) will result in a prompt for Cognito user credentials. To obtain your own credentials, talk to your product owner or scrum master; they should either know how to create the credentials, know who can create them for the team, or be the liaison to the Cognito team.

Further, whoever helps to obtain your Cognito user credentials should also be able to supply you with the values for the following new variables that you must add to your cumulus-tf/terraform.tfvars file:

  • csdap_host_url: The URL of the Cognito service to which your Cumulus deployment will make Cognito API calls during a distribution (download) event
  • csdap_client_id: The client ID for the Cumulus application registered within the Cognito service
  • csdap_client_password: The client password for the Cumulus application registered within the Cognito service

Although you might have to wait a bit for your Cognito user credentials, the remaining instructions do not depend upon having them, so you may continue with these instructions while waiting for your credentials.

Cumulus Distribution URL

Your Cumulus Distribution URL is used by Cumulus to generate download URLs as part of the granule metadata generated and published to the CMR. For example, a granule download URL will be of the form <distribution url>/<protected bucket>/<key> (or <distribution url>/path/to/file, if using a custom bucket map, as explained further below).

By default, the value of your distribution URL is the URL of your private Cumulus Distribution API Gateway (the API Gateway named <prefix>-distribution, once you deploy the Cumulus Distribution module). Therefore, by default, the generated download URLs are private, and thus inaccessible directly, but there are 2 ways to address this issue (both of which are detailed below): (a) use tunneling (typically in development) or (b) put a CloudFront URL in front of your API Gateway (typically in production, and perhaps UAT and/or SIT).

In either case, you must first know the default URL (i.e., the URL for the private Cumulus Distribution API Gateway). In order to obtain this default URL, you must first deploy your cumulus-tf module with the new Cumulus Distribution module, and once your initial deployment is complete, one of the Terraform outputs will be cumulus_distribution_api_uri, which is the URL for the private API Gateway.
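If you need to look this URL up again later, one option is to read it back from your Terraform outputs (run from your cumulus-tf directory):

terraform output cumulus_distribution_api_uri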

You may override this default URL by adding a cumulus_distribution_url variable to your cumulus-tf/terraform.tfvars file and setting it to one of the following values (both are explained below):

  1. The default URL, but with a port added to it, in order to allow you to configure tunneling (typically only in development)
  2. A CloudFront URL placed in front of your Cumulus Distribution API Gateway (typically only for Production, but perhaps also for a UAT or SIT environment)

The following subsections explain these approaches in turn.

Using Your Cumulus Distribution API Gateway URL as Your Distribution URL

Since your Cumulus Distribution API Gateway URL is private, the only way you can use it to confirm that your integration with Cognito is working is by using tunneling (again, generally for development). Here is an outline of the required steps with details provided further below:

  1. Create/import a key pair into your AWS EC2 service (if you haven't already done so)
  2. Add a reference to the name of the key pair to your Terraform variables (we'll set the key_name Terraform variable)
  3. Choose an open local port on your machine (we'll use 9000 in the following example)
  4. Add a reference to the value of your cumulus_distribution_api_uri (mentioned earlier), including your chosen port (we'll set the cumulus_distribution_url Terraform variable)
  5. Redeploy Cumulus
  6. Add an entry to your /etc/hosts file
  7. Add a redirect URI to Cognito via the Cognito API
  8. Install the Session Manager Plugin for the AWS CLI (if you haven't already done so; assuming you have already installed the AWS CLI)
  9. Add a sample file to S3 to test downloading via Cognito

To create a new key pair or import an existing one, you can use the AWS CLI (see AWS ec2 import-key-pair) or the AWS Console (see Amazon EC2 key pairs and Linux instances).

Once your key pair is added to AWS, add the following to your cumulus-tf/terraform.tfvars file:

key_name = "<name>"
cumulus_distribution_url = "https://<id>.execute-api.<region>.amazonaws.com:<port>/dev/"

where:

  • <name> is the name of the key pair you just added to AWS
  • <id> and <region> are the corresponding parts from your cumulus_distribution_api_uri output variable
  • <port> is your open local port of choice (9000 is typically a good choice)

Once you save your variable changes, redeploy your cumulus-tf module.

While your deployment runs, add the following entry to your /etc/hosts file, replacing <hostname> with the host name of the cumulus_distribution_url Terraform variable you just added above:

127.0.0.1 <hostname>

Next, you'll need to use the Cognito API to add the value of your cumulus_distribution_url Terraform variable as a Cognito redirect URI. To do so, use your favorite tool (e.g., curl, wget, Postman, etc.) to make a BasicAuth request to the Cognito API, using the following details:

  • method: POST
  • base URL: the value of your csdap_host_url Terraform variable
  • path: /authclient/updateRedirectUri
  • username: the value of your csdap_client_id Terraform variable
  • password: the value of your csdap_client_password Terraform variable
  • headers: Content-Type='application/x-www-form-urlencoded'
  • body: redirect_uri=<cumulus_distribution_url>/login

where <cumulus_distribution_url> is the value of your cumulus_distribution_url Terraform variable. Note the /login path at the end of the redirect_uri value.

For reference, see the Cognito Authentication Service API.
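As an illustration, the request described above could be made with curl as follows; the angle-bracket placeholders are the Terraform variable values discussed earlier:

curl -X POST "<csdap_host_url>/authclient/updateRedirectUri" \
--user "<csdap_client_id>:<csdap_client_password>" \
--header "Content-Type: application/x-www-form-urlencoded" \
--data-urlencode "redirect_uri=<cumulus_distribution_url>/login"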

Next, install the Session Manager Plugin for the AWS CLI. If running on macOS, and you use Homebrew, you can install it simply as follows:

brew install --cask session-manager-plugin --no-quarantine

As your final setup step, add a sample file to one of the protected buckets listed in your buckets Terraform variable in your cumulus-tf/terraform.tfvars file. The key for the S3 object doesn't matter, nor does it matter what file you use. All that matters is that the file is an S3 object in one of your protected buckets, because Cognito is triggered when attempting to download from one of those buckets.
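For example, uploading an arbitrary local file with the AWS CLI might look like the following, where the bucket name and key are placeholders:

aws s3 cp ./sample.txt s3://<protected-bucket>/samples/sample.txt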

At this point, you should be ready to open a tunnel and attempt to download your sample file via your browser, summarized as follows:

  1. Determine your EC2 instance ID
  2. Connect to the NASA VPN
  3. Start an AWS SSM session
  4. Open an SSH tunnel
  5. Use a browser to navigate to your file

To determine your EC2 instance ID for your Cumulus deployment, run the following command, where <profile> is the name of the appropriate AWS profile to use and <prefix> is the value of your prefix Terraform variable:

aws --profile <profile> ec2 describe-instances --filters Name=tag:Deployment,Values=<prefix> Name=instance-state-name,Values=running --query "Reservations[0].Instances[].InstanceId" --output text

⚠️ IMPORTANT: Before proceeding with the remaining steps, make sure you're connected to the NASA VPN.

Use the value output from the command above in place of <id> in the following command, which will start an SSM session:

aws ssm start-session --target <id> --document-name AWS-StartPortForwardingSession --parameters portNumber=22,localPortNumber=6000

If successful, you should see output similar to the following:

Starting session with SessionId: NGAPShApplicationDeveloper-***
Port 6000 opened for sessionId NGAPShApplicationDeveloper-***.
Waiting for connections...

In another terminal window, open a tunnel with port forwarding using your chosen port from above (e.g., 9000):

ssh -4 -p 6000 -N -L <port>:<api-gateway-host>:443 ec2-user@127.0.0.1

where:

  • <port> is the open local port you chose earlier (e.g., 9000)
  • <api-gateway-host> is the hostname of your private API Gateway (i.e., the host portion of the URL you used as the value of your cumulus_distribution_url Terraform variable above)

Finally, use your chosen browser to navigate to <cumulus_distribution_url>/<bucket>/<key>, where <bucket> and <key> reference the sample file you added to S3 above.

If all goes well, you should be prompted for your Cognito username and password. If you have obtained your Cognito user credentials, enter them, and then enter a code generated by the authenticator application you registered when you completed your Cognito registration process. Once your credentials and auth code are correctly supplied, the download process will begin after a few moments.

Once you're finished testing, clean up as follows:

  1. Stop your SSH tunnel (enter Ctrl-C)
  2. Stop your AWS SSM session (enter Ctrl-C)
  3. If you like, disconnect from the NASA VPN

While this is a relatively lengthy process, things are much easier when using CloudFront, such as in Production (OPS), SIT, or UAT, as explained next.

Using a CloudFront URL as Your Distribution URL

In Production (OPS), and perhaps in other environments, such as UAT and SIT, you'll need to provide a publicly accessible URL for users to use for downloading (distributing) granule files.

This is generally done by placing a CloudFront URL in front of your private Cumulus Distribution API Gateway. In order to create such a CloudFront URL, contact the person who helped you obtain your Cognito credentials, and request a CloudFront URL with the following details:

  • The private, backing URL, which is the value of your cumulus_distribution_api_uri Terraform output value
  • A request to add the AWS account's VPC to the whitelist

Once this request is completed, and you obtain the new CloudFront URL, override your default distribution URL with the CloudFront URL by adding the following to your cumulus-tf/terraform.tfvars file:

cumulus_distribution_url = <cloudfront_url>

In addition, add a Cognito redirect URI, as detailed in the previous section. Note that in this case, the value you'll use for redirect_uri is <cloudfront_url>/login since the value of your cumulus_distribution_url is now your CloudFront URL.

At this point, it is assumed that you have added the appropriate values for this environment for the variables described at the top (csdap_host_url, csdap_client_id, and csdap_client_password).

Redeploy Cumulus with your new/updated Terraform variables.

As your final setup step, add a sample file to one of the protected buckets listed in your buckets Terraform variable in your cumulus-tf/terraform.tfvars file. The key for the S3 object doesn't matter, nor does it matter what file you use. All that matters is that the file is an S3 object in one of your protected buckets, because Cognito is triggered when attempting to download from one of those buckets.

Finally, use your chosen browser to navigate to <cumulus_distribution_url>/<bucket>/<key>, where <bucket> and <key> reference the sample file you added to S3.

If all goes well, you should be prompted for your Cognito username and password. If you have obtained your Cognito user credentials, enter them, and then enter a code generated by the authenticator application you registered when you completed your Cognito registration process. Once your credentials and auth code are correctly supplied, the download process will begin after a few moments.

S3 Bucket Mapping

An S3 Bucket map allows users to abstract bucket names. If the bucket names change at any point, only the bucket map would need to be updated instead of every S3 link.

The Cumulus Distribution API uses a bucket_map.yaml or bucket_map.yaml.tmpl file to determine which buckets to serve. See the examples.

The default Cumulus module generates a file at s3://${system_bucket}/distribution_bucket_map.json.

The configuration file is a simple JSON mapping of the form:

{
"daac-public-data-bucket": "/path/to/this/kind/of/data"
}

⚠️ Note: Cumulus only supports a one-to-one mapping of bucket -> Cumulus Distribution path for 'distribution' buckets. Also, the bucket map must include mappings for all of the protected and public buckets specified in the buckets variable in cumulus-tf/terraform.tfvars, otherwise Cumulus may not be able to determine the correct distribution URL for ingested files and you may encounter errors.

Switching from the Thin Egress App to Cumulus Distribution

If you have previously deployed the Thin Egress App (TEA) as your distribution app, you can switch to Cumulus Distribution by following the steps above.

Note, however, that the cumulus_distribution module will generate a bucket map cache and overwrite any existing bucket map caches created by TEA.

There will also be downtime while your API gateway is updated.

- + \ No newline at end of file diff --git a/docs/deployment/index.html b/docs/deployment/index.html index e7be499ddd3..c4cac3f9918 100644 --- a/docs/deployment/index.html +++ b/docs/deployment/index.html @@ -5,7 +5,7 @@ How to Deploy Cumulus | Cumulus Documentation - + @@ -19,7 +19,7 @@ for deployment's EC2 instances and allows you to connect to them via SSH/SSM.

Consider the sizing of your Cumulus instance when configuring your variables.

Choose a Distribution API

Cumulus can be configured to use either the Thin Egress App (TEA) or the Cumulus Distribution API. The default selection is the Thin Egress App if you're using the Deployment Template.

⚠️ IMPORTANT: If you already have a deployment using the TEA distribution and want to switch to Cumulus Distribution, there will be an API Gateway change. This means that there will be downtime while you update your CloudFront endpoint to use the new API gateway.

Configure the Thin Egress App

TEA can be used for Cumulus distribution and is the default selection. It allows authentication using Earthdata Login. Follow the steps in the TEA documentation to configure distribution in your cumulus-tf deployment.

Configure the Cumulus Distribution API (Optional)

If you would prefer to use the Cumulus Distribution API, which supports AWS Cognito authentication, follow these steps to configure distribution in your cumulus-tf deployment.

Initialize Terraform

Follow the above instructions to initialize Terraform using terraform init [3].

Deploy

Run terraform apply to deploy the resources. Type yes when prompted to confirm that you want to create the resources. Assuming the operation is successful, you should see output like this:

Apply complete! Resources: 292 added, 0 changed, 0 destroyed.

Outputs:

archive_api_redirect_uri = https://abc123.execute-api.us-east-1.amazonaws.com/dev/token
archive_api_uri = https://abc123.execute-api.us-east-1.amazonaws.com/dev/
distribution_redirect_uri = https://abc123.execute-api.us-east-1.amazonaws.com/DEV/login
distribution_url = https://abc123.execute-api.us-east-1.amazonaws.com/DEV/

⚠️ Note: Be sure to copy the redirect URLs because you will need them to update your Earthdata application.

Update Earthdata Application

Add the two redirect URLs to your EarthData login application by doing the following:

  1. Login to URS
  2. Under My Applications -> Application Administration -> use the edit icon of your application
  3. Under Manage -> redirect URIs, add the Archive API url returned from the stack deployment
    • e.g. archive_api_redirect_uri = https://<czbbkscuy6>.execute-api.us-east-1.amazonaws.com/dev/token
  4. Also add the Distribution url
    • e.g. distribution_redirect_uri = https://<kido2r7kji>.execute-api.us-east-1.amazonaws.com/dev/login [1]
  5. You may delete the placeholder url you used to create the application

If you've lost track of the needed redirect URIs, they can be located in API Gateway. Once there, select <prefix>-archive and/or <prefix>-thin-egress-app-EgressGateway, then Dashboard, and use the base URL at the top of the page accompanied by the text Invoke this API at:. Make sure to append /token for the archive URL and /login for the thin egress app URL.


Deploy Cumulus Dashboard

Dashboard Requirements

Please note that the requirements are similar to the Cumulus stack deployment requirements. The installation instructions below include a step that will install/use the required node version referenced in the .nvmrc file in the Dashboard repository.

Prepare AWS

Create S3 Bucket for Dashboard:

  • Create it, e.g. <prefix>-dashboard. Use the command line or console as you did when preparing AWS configuration.
  • Configure the bucket to host a website:
    • AWS S3 console: Select <prefix>-dashboard bucket then, "Properties" -> "Static Website Hosting", point to index.html
    • CLI: aws s3 website s3://<prefix>-dashboard --index-document index.html
  • The bucket's url will be http://<prefix>-dashboard.s3-website-<region>.amazonaws.com or you can find it on the AWS console via "Properties" -> "Static website hosting" -> "Endpoint"
  • Ensure the bucket's access permissions allow your deployment user access to write to the bucket

Install Dashboard

To install the Cumulus Dashboard, clone the repository into the root deploy directory and install dependencies with npm install:

git clone https://github.com/nasa/cumulus-dashboard
cd cumulus-dashboard
nvm use
npm install

If you do not have the correct version of node installed, replace nvm use with nvm install $(cat .nvmrc) in the above example.

Dashboard Versioning

By default, the master branch will be used for Dashboard deployments. The master branch of the repository contains the most recent stable release of the Cumulus Dashboard.

If you want to test unreleased changes to the Dashboard, use the develop branch.

Each release/version of the Dashboard will have a tag in the Dashboard repo. Release/version numbers will use semantic versioning (major/minor/patch).

To checkout and install a specific version of the Dashboard:

git fetch --tags
git checkout <version-number> # e.g. v1.2.0
nvm use
npm install

If you do not have the correct version of node installed, replace nvm use with nvm install $(cat .nvmrc) in the above example.

Building the Dashboard

⚠️ Note: These environment variables are available during the build: APIROOT, DAAC_NAME, STAGE, HIDE_PDR. Any of these can be set on the command line to override the values contained in config.js when running the build below.

To configure your dashboard for deployment, set the APIROOT environment variable to your app's API root. [2]

Build your dashboard from the Cumulus Dashboard repository root directory, cumulus-dashboard:

  APIROOT=<your_api_root> npm run build

Dashboard Deployment

Deploy your dashboard to S3 bucket from the cumulus-dashboard directory:

Using AWS CLI:

  aws s3 sync dist s3://<prefix>-dashboard --acl public-read

From the S3 Console:

  • Open the <prefix>-dashboard bucket, click 'upload'. Add the contents of the 'dist' subdirectory to the upload. Then select 'Next'. On the permissions window allow the public to view. Select 'Upload'.

You should be able to visit the Dashboard website at http://<prefix>-dashboard.s3-website-<region>.amazonaws.com, or find the URL via <prefix>-dashboard -> "Properties" -> "Static website hosting" -> "Endpoint", and log in with a user that you had previously configured for access.


Cumulus Instance Sizing

The default sizing in the Cumulus deployment for Elasticsearch instances, EC2 instances, and Autoscaling Groups is small and designed for testing and cost savings. The default settings are likely not suitable for production workloads. Sizing is highly individual and dependent on expected load and archive size.

Please be cognizant of costs as any change in size will affect your AWS bill. AWS provides a pricing calculator for estimating costs.

Elasticsearch

The mappings file contains all of the data types that will be indexed into Elasticsearch. Elasticsearch sizing is tied to your archive size, including your collections, granules, and workflow executions that will be stored.

AWS provides documentation on calculating and configuring for sizing.

In addition to size, you'll want to consider the number of nodes, which determines how the system reacts in the event of a failure.

Configuration can be done in the data persistence module in elasticsearch_config and the cumulus module in es_index_shards.

If you make changes to your Elasticsearch configuration you will need to reindex for those changes to take effect.

EC2 Instances and Autoscaling Groups

EC2 instances are used for long-running operations (e.g. generating a reconciliation report) and long-running workflow tasks. Configuration for your ECS cluster is achieved via Cumulus deployment variables.

When configuring your ECS cluster, consider the following (an illustrative terraform.tfvars sketch follows this list):

  • The EC2 instance type and EBS volume size needed to accommodate your workloads. Configured as ecs_cluster_instance_type and ecs_cluster_instance_docker_volume_size.
  • The minimum and desired number of instances on hand to accommodate your workloads. Configured as ecs_cluster_min_size and ecs_cluster_desired_size.
  • The maximum number of instances you will need and are willing to pay for to accommodate your heaviest workloads. Configured as ecs_cluster_max_size.
  • Your autoscaling parameters: ecs_cluster_scale_in_adjustment_percent, ecs_cluster_scale_out_adjustment_percent, ecs_cluster_scale_in_threshold_percent, and ecs_cluster_scale_out_threshold_percent.
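As a rough illustration only, a terraform.tfvars sketch using the variables above might look like the following; every value shown is a placeholder to be tuned for your own workloads and costs, not a recommendation:

ecs_cluster_instance_type = "t3.medium"
ecs_cluster_instance_docker_volume_size = 100
ecs_cluster_min_size = 1
ecs_cluster_desired_size = 2
ecs_cluster_max_size = 4
ecs_cluster_scale_in_adjustment_percent = -5
ecs_cluster_scale_in_threshold_percent = 25
ecs_cluster_scale_out_adjustment_percent = 10
ecs_cluster_scale_out_threshold_percent = 75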

Footnotes


  1. To add another redirect URI to your application: on the Earthdata home page, select "My Applications", scroll down to "Application Administration", use the edit icon for your application, then Manage -> Redirect URIs.

  2. The API root can be found a number of ways. The easiest is to note it in the output of the app deployment step, but you can also find it from the AWS console -> Amazon API Gateway -> APIs -> <prefix>-archive -> Dashboard, and read the URL at the top after "Invoke this API at".

  3. Run terraform init if:

    • This is the first time deploying the module
    • You have added any additional child modules, including Cumulus components
    • You have updated the source for any of the child modules

- + \ No newline at end of file diff --git a/docs/deployment/postgres_database_deployment/index.html b/docs/deployment/postgres_database_deployment/index.html index 7e37d36054d..cc3f3993869 100644 --- a/docs/deployment/postgres_database_deployment/index.html +++ b/docs/deployment/postgres_database_deployment/index.html @@ -5,7 +5,7 @@ PostgreSQL Database Deployment | Cumulus Documentation - + @@ -16,7 +16,7 @@ cumulus-rds-tf that will deploy an AWS RDS Aurora Serverless PostgreSQL 11 compatible database cluster, and optionally provision a single deployment database with credentialed secrets for use with Cumulus.

We have provided an example Terraform deployment using this module in the Cumulus template-deploy repository on GitHub.

Use of this example involves:

  • Creating/configuring a Terraform module directory
  • Using Terraform to deploy resources to AWS

Requirements

Configuration/installation of this module requires the following:

  • Terraform
  • git
  • A VPC configured for use with Cumulus Core. This should match the subnets you provide when Deploying Cumulus to allow Core's lambdas to properly access the database.
  • At least two subnets across multiple AZs. These should match the subnets you provide as configuration when Deploying Cumulus, and should be within the same VPC.

Needed Git Repositories

Assumptions

OS/Environment

The instructions in this module require Linux/macOS. While deployment via Windows is possible, it is unsupported.

Terraform

This document assumes knowledge of Terraform. If you are not comfortable working with Terraform, the following links should bring you up to speed:

For Cumulus specific instructions on installation of Terraform, refer to the main Cumulus Installation Documentation

Aurora/RDS

This document also assumes some basic familiarity with PostgreSQL databases and Amazon Aurora/RDS. If you're unfamiliar, consider perusing the AWS docs and the Aurora Serverless V1 docs.

Prepare Deployment Repository

If you already are working with an existing repository that has a configured rds-cluster-tf deployment for the version of Cumulus you intend to deploy or update, or just need to configure this module for your repository, skip to Prepare AWS Configuration.

Clone the cumulus-template-deploy repo and name appropriately for your organization:

  git clone https://github.com/nasa/cumulus-template-deploy <repository-name>

We will return to configuring this repo and using it for deployment below.

Optional: Create a New Repository

Create a new repository on GitHub so that you can add your workflows and other modules to source control:

git remote set-url origin https://github.com/<org>/<repository-name>
git push origin master

You can then add/commit changes as needed.

⚠️ Note: If you are pushing your deployment code to a git repo, make sure to add terraform.tf and terraform.tfvars to .gitignore, as these files will contain sensitive data related to your AWS account.


Prepare AWS Configuration

To deploy this module, make sure that you have completed the following steps from the Cumulus deployment instructions, adapted for this module:

--

Configure and Deploy the Module

When configuring this module, please keep in mind that, unlike the Cumulus deployment, this module should be deployed once to create the database cluster, and re-deployed thereafter only to make changes to that configuration, upgrade the cluster, etc.

Tip: This module does not need to be re-deployed for each Core update.

These steps should be executed in the rds-cluster-tf directory of the template deploy repo that you previously cloned. Run the following to copy the example files:

cd rds-cluster-tf/
cp terraform.tf.example terraform.tf
cp terraform.tfvars.example terraform.tfvars

In terraform.tf, configure the remote state settings by substituting the appropriate values for:

  • bucket
  • dynamodb_table
  • PREFIX (whatever prefix you've chosen for your deployment)

Fill in the appropriate values in terraform.tfvars. See the rds-cluster-tf module variable definitions for more detail on all of the configuration options. A few notable configuration options are documented in the next section.

Configuration Options

  • deletion_protection -- defaults to true. Set it to false if you want to be able to delete your cluster with a terraform destroy without manually updating the cluster.
  • db_admin_username -- cluster database administration username. Defaults to postgres.
  • db_admin_password -- required variable that specifies the admin user password for the cluster. To randomize this on each deployment, consider using a random_string resource as input.
  • region -- defaults to us-east-1.
  • subnets -- requires at least 2 across different AZs. For use with Cumulus, these AZs should match the values you configure for your lambda_subnet_ids.
  • max_capacity -- the max ACUs the cluster is allowed to use. Carefully consider cost/performance concerns when setting this value.
  • min_capacity -- the minimum ACUs the cluster will scale to
  • provision_user_database -- Optional flag to allow module to provision a user database in addition to creating the cluster. Described in the next section.
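Purely as an illustration of the options above, a partial terraform.tfvars sketch might look like the following; every value is a placeholder and should be replaced with values appropriate to your deployment:

region = "us-east-1"
subnets = ["subnet-xxxxxxxxx", "subnet-yyyyyyyyy"]
db_admin_username = "postgres"
db_admin_password = "<secure-admin-password>"
deletion_protection = true
min_capacity = 2
max_capacity = 8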

Provision User and User Database

If you wish for the module to provision a PostgreSQL database on your new cluster and provide a secret for access in the module output, in addition to managing the cluster itself, the following configuration keys are required:

  • provision_user_database -- must be set to true. This configures the module to deploy a lambda that will create the user database, and update the provided configuration on deploy.
  • permissions_boundary_arn -- the permissions boundary to use when creating the roles the provisioning lambda will need for access. In most cases this should be the same one used for the Cumulus Core deployment.
  • rds_user_password -- the value to set the user password to.
  • prefix -- this value will be used to set a unique identifier for the ProvisionDatabase lambda, as well as name the provisioned user/database.

Once configured, the module will deploy the lambda and run it on each deployment, creating the configured database (if it does not exist), updating the user password (if that value has changed), and updating the output user database secret.

Setting provision_user_database to false after provisioning will not result in removal of the configured database, as the lambda is non-destructive as configured in this module.

⚠️ Note: This functionality is limited in that it will only provision a single database/user and configure a basic database, and should not be used in scenarios where more complex configuration is required.

Initialize Terraform

Run terraform init

You should see a similar output:

* provider.aws: version = "~> 2.32"

Terraform has been successfully initialized!

Deploy

Run terraform apply to deploy the resources.

⚠️ Caution: If re-applying this module, variables (e.g. engine_version, snapshot_identifier ) that force a recreation of the database cluster may result in data loss if deletion protection is disabled. Examine the changeset carefully for resources that will be re-created/destroyed before applying.

Review the changeset, and assuming it looks correct, type yes when prompted to confirm that you want to create all of the resources.

Assuming the operation is successful, you should see output similar to the following (this example omits the creation of a user database/lambdas/security groups):

terraform apply

An execution plan has been generated and is shown below.
Resource actions are indicated with the following symbols:
+ create

Terraform will perform the following actions:

# module.rds_cluster.aws_db_subnet_group.default will be created
+ resource "aws_db_subnet_group" "default" {
+ arn = (known after apply)
+ description = "Managed by Terraform"
+ id = (known after apply)
+ name = (known after apply)
+ name_prefix = "xxxxxxxxx"
+ subnet_ids = [
+ "subnet-xxxxxxxxx",
+ "subnet-xxxxxxxxx",
]
+ tags = {
+ "Deployment" = "xxxxxxxxx"
}
}

# module.rds_cluster.aws_rds_cluster.cumulus will be created
+ resource "aws_rds_cluster" "cumulus" {
+ apply_immediately = true
+ arn = (known after apply)
+ availability_zones = (known after apply)
+ backup_retention_period = 1
+ cluster_identifier = "xxxxxxxxx"
+ cluster_identifier_prefix = (known after apply)
+ cluster_members = (known after apply)
+ cluster_resource_id = (known after apply)
+ copy_tags_to_snapshot = false
+ database_name = "xxxxxxxxx"
+ db_cluster_parameter_group_name = (known after apply)
+ db_subnet_group_name = (known after apply)
+ deletion_protection = true
+ enable_http_endpoint = true
+ endpoint = (known after apply)
+ engine = "aurora-postgresql"
+ engine_mode = "serverless"
+ engine_version = "10.12"
+ final_snapshot_identifier = "xxxxxxxxx"
+ hosted_zone_id = (known after apply)
+ id = (known after apply)
+ kms_key_id = (known after apply)
+ master_password = (sensitive value)
+ master_username = "xxxxxxxxx"
+ port = (known after apply)
+ preferred_backup_window = "07:00-09:00"
+ preferred_maintenance_window = (known after apply)
+ reader_endpoint = (known after apply)
+ skip_final_snapshot = false
+ storage_encrypted = (known after apply)
+ tags = {
+ "Deployment" = "xxxxxxxxx"
}
+ vpc_security_group_ids = (known after apply)

+ scaling_configuration {
+ auto_pause = true
+ max_capacity = 4
+ min_capacity = 2
+ seconds_until_auto_pause = 300
+ timeout_action = "RollbackCapacityChange"
}
}

# module.rds_cluster.aws_secretsmanager_secret.rds_login will be created
+ resource "aws_secretsmanager_secret" "rds_login" {
+ arn = (known after apply)
+ id = (known after apply)
+ name = (known after apply)
+ name_prefix = "xxxxxxxxx"
+ policy = (known after apply)
+ recovery_window_in_days = 30
+ rotation_enabled = (known after apply)
+ rotation_lambda_arn = (known after apply)
+ tags = {
+ "Deployment" = "xxxxxxxxx"
}

+ rotation_rules {
+ automatically_after_days = (known after apply)
}
}

# module.rds_cluster.aws_secretsmanager_secret_version.rds_login will be created
+ resource "aws_secretsmanager_secret_version" "rds_login" {
+ arn = (known after apply)
+ id = (known after apply)
+ secret_id = (known after apply)
+ secret_string = (sensitive value)
+ version_id = (known after apply)
+ version_stages = (known after apply)
}

# module.rds_cluster.aws_security_group.rds_cluster_access will be created
+ resource "aws_security_group" "rds_cluster_access" {
+ arn = (known after apply)
+ description = "Managed by Terraform"
+ egress = (known after apply)
+ id = (known after apply)
+ ingress = (known after apply)
+ name = (known after apply)
+ name_prefix = "cumulus_rds_cluster_access_ingress"
+ owner_id = (known after apply)
+ revoke_rules_on_delete = false
+ tags = {
+ "Deployment" = "xxxxxxxxx"
}
+ vpc_id = "vpc-xxxxxxxxx"
}

# module.rds_cluster.aws_security_group_rule.rds_security_group_allow_PostgreSQL will be created
+ resource "aws_security_group_rule" "rds_security_group_allow_postgres" {
+ from_port = 5432
+ id = (known after apply)
+ protocol = "tcp"
+ security_group_id = (known after apply)
+ self = true
+ source_security_group_id = (known after apply)
+ to_port = 5432
+ type = "ingress"
}

Plan: 6 to add, 0 to change, 0 to destroy.

Do you want to perform these actions?
Terraform will perform the actions described above.
Only 'yes' will be accepted to approve.

Enter a value: yes

module.rds_cluster.aws_db_subnet_group.default: Creating...
module.rds_cluster.aws_security_group.rds_cluster_access: Creating...
module.rds_cluster.aws_secretsmanager_secret.rds_login: Creating...

Then, after the resources are created:

Apply complete! Resources: X added, 0 changed, 0 destroyed.
Releasing state lock. This may take a few moments...

Outputs:

admin_db_login_secret_arn = arn:aws:secretsmanager:us-east-1:xxxxxxxxx:secret:xxxxxxxxxx20210407182709367700000002-dpmdR
admin_db_login_secret_version = xxxxxxxxx
rds_endpoint = xxxxxxxxx.us-east-1.rds.amazonaws.com
security_group_id = xxxxxxxxx
user_credentials_secret_arn = arn:aws:secretsmanager:us-east-1:xxxxx:secret:xxxxxxxxxx20210407182709367700000002-dpmpXA

Note the output values for admin_db_login_secret_arn (and optionally user_credentials_secret_arn) as these provide the AWS Secrets Manager secrets required to access the database as the administrative user and, optionally, the user database credentials Cumulus requires as well.

The content of each of these secrets is of the form:

{
  "database": "postgres",
  "dbClusterIdentifier": "clusterName",
  "engine": "postgres",
  "host": "xxx",
  "password": "defaultPassword",
  "port": 5432,
  "username": "xxx"
}
  • database -- the PostgreSQL database used by the configured user
  • dbClusterIdentifier -- the value set by the cluster_identifier variable in the terraform module
  • engine -- the Aurora/RDS database engine
  • host -- the RDS service host for the database in the form (dbClusterIdentifier)-(AWS ID string).(region).rds.amazonaws.com
  • password -- the database password
  • username -- the account username
  • port -- the database connection port; this should always be 5432
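To inspect one of these secrets (for example, to retrieve the host and credentials for a manual connection), you can use the AWS CLI; the secret ID below is a placeholder for one of the ARNs noted above:

aws secretsmanager get-secret-value \
--secret-id <admin_db_login_secret_arn> \
--query SecretString \
--output text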

Next Steps

The database cluster has been created/updated! From here you can continue to add additional user accounts, databases, and other database configuration.

- + \ No newline at end of file diff --git a/docs/deployment/share-s3-access-logs/index.html b/docs/deployment/share-s3-access-logs/index.html index 4aecac02549..69841dbb310 100644 --- a/docs/deployment/share-s3-access-logs/index.html +++ b/docs/deployment/share-s3-access-logs/index.html @@ -5,13 +5,13 @@ Share S3 Access Logs | Cumulus Documentation - +
Version: v15.0.2

Share S3 Access Logs

It is possible through Cumulus to share S3 access logs across S3 buckets and with external systems, such as ESDIS Metrics, by using the S3 replicator package.

S3 Replicator

The S3 Replicator is a Node.js package that contains a simple Lambda function, associated permissions, and the Terraform instructions to replicate create-object events from one S3 bucket to another.

First ensure that you have enabled S3 Server Access Logging.

Next configure your config.tfvars as described in the s3-replicator/README.md to correspond to your deployment. The source_bucket and source_prefix are determined by how you enabled the S3 Server Access Logging.

In order to deploy the s3-replicator with Cumulus, you will need to add the module to your Terraform main.tf definition, as in the example below:

module "s3-replicator" {
source = "<path to s3-replicator.zip>"
prefix = var.prefix
vpc_id = var.vpc_id
subnet_ids = var.subnet_ids
permissions_boundary = var.permissions_boundary_arn
source_bucket = var.s3_replicator_config.source_bucket
source_prefix = var.s3_replicator_config.source_prefix
target_bucket = var.s3_replicator_config.target_bucket
target_prefix = var.s3_replicator_config.target_prefix
}

The Terraform source package can be found on the Cumulus GitHub Release page under the asset tab terraform-aws-cumulus-s3-replicator.zip.

ESDIS Metrics

In the NGAP environment, the ESDIS Metrics team has set up an ELK stack to process logs from Cumulus instances. To use this system, you must deliver any S3 Server Access logs that Cumulus creates to the Metrics team's target bucket.

Configure the S3 Replicator as described above using the target_bucket and target_prefix provided by the Metrics team.

The Metrics team has taken care of setting up Logstash to ingest the files that get delivered to their bucket into their Elasticsearch instance.

- + \ No newline at end of file diff --git a/docs/deployment/terraform-best-practices/index.html b/docs/deployment/terraform-best-practices/index.html index eef35546c51..e4366c30198 100644 --- a/docs/deployment/terraform-best-practices/index.html +++ b/docs/deployment/terraform-best-practices/index.html @@ -5,7 +5,7 @@ Terraform Best Practices | Cumulus Documentation - + @@ -88,7 +88,7 @@ AWS CLI command, replacing PREFIX with your deployment prefix name:

aws resourcegroupstaggingapi get-resources \
--query "ResourceTagMappingList[].ResourceARN" \
--tag-filters Key=Deployment,Values=PREFIX

Ideally, the output should be an empty list, but if it is not, then you may need to manually delete the listed resources.

  • Configuring the Cumulus deployment: link
  • Restoring a previous version: link

- + \ No newline at end of file diff --git a/docs/deployment/thin_egress_app/index.html b/docs/deployment/thin_egress_app/index.html index 1d9208ed524..83793e2efea 100644 --- a/docs/deployment/thin_egress_app/index.html +++ b/docs/deployment/thin_egress_app/index.html @@ -5,7 +5,7 @@ Using the Thin Egress App for Cumulus Distribution | Cumulus Documentation - + @@ -13,7 +13,7 @@
Version: v15.0.2

Using the Thin Egress App for Cumulus Distribution

The Thin Egress App (TEA) is an app running in Lambda that allows retrieving data from S3 using temporary links and provides URS integration.

Configuring a TEA Deployment

TEA is deployed using Terraform modules. Refer to these instructions for guidance on how to integrate new components with your deployment.

The cumulus-template-deploy repository's cumulus-tf/main.tf contains a thin_egress_app module for distribution.

The TEA module provides these instructions showing how to add it to your deployment; the following are instructions for configuring the thin_egress_app module in your Cumulus deployment.

Create a Secret for Signing Thin Egress App JWTs

The Thin Egress App uses JSON Web Tokens (JWTs) internally to authenticate requests and requires a secret stored in AWS Secrets Manager containing SSH keys that are used to sign the JWTs.

See the Thin Egress App documentation on how to create this secret with the correct values. It will be used later to set the thin_egress_jwt_secret_name variable when deploying the Cumulus module.

Bucket_map.yaml

The Thin Egress App uses a bucket_map.yaml file to determine which buckets to serve. Documentation of the file format is available here.

The default Cumulus module generates a file at s3://${system_bucket}/distribution_bucket_map.json.

The configuration file is a simple JSON mapping of the form:

{
"daac-public-data-bucket": "/path/to/this/kind/of/data"
}

⚠️ Note: Cumulus only supports a one-to-one mapping of bucket->TEA path for 'distribution' buckets.

Optionally Configure a Custom Bucket Map

A simple config would look something like this:

bucket_map.yaml

MAP:
  my-protected: my-protected
  my-public: my-public

PUBLIC_BUCKETS:
  - my-public

⚠️ Note: Your custom bucket map must include mappings for all of the protected and public buckets specified in the buckets variable in cumulus-tf/terraform.tfvars, otherwise Cumulus may not be able to determine the correct distribution URL for ingested files and you may encounter errors.

Optionally Configure Shared Variables

The cumulus module deploys certain components that interact with TEA. As a result, the cumulus module requires that if you are specifying a value for the stage_name variable to the TEA module, you must use the same value for the tea_api_gateway_stage variable to the cumulus module.

One way to keep these variable values in sync across the modules is to use Terraform local values to define values to use for the variables for both modules. This approach is shown in the Cumulus Core example deployment code.
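A minimal sketch of that approach is shown below; the module blocks are abbreviated, and the stage name value is only a placeholder:

locals {
  tea_stage_name = "DEV"
}

module "thin_egress_app" {
  # ... other required TEA configuration ...
  stage_name = local.tea_stage_name
}

module "cumulus" {
  # ... other required Cumulus configuration ...
  tea_api_gateway_stage = local.tea_stage_name
}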

- + \ No newline at end of file diff --git a/docs/deployment/upgrade-readme/index.html b/docs/deployment/upgrade-readme/index.html index a072941fde5..b78b5b554cd 100644 --- a/docs/deployment/upgrade-readme/index.html +++ b/docs/deployment/upgrade-readme/index.html @@ -5,7 +5,7 @@ Upgrading Cumulus | Cumulus Documentation - + @@ -15,7 +15,7 @@ deployment functions correctly. Please refer to some recommended smoke tests given above, and consider additional tests appropriate for your particular deployment and environment.

Update Cumulus Dashboard

If there are breaking (or otherwise significant) changes to the Cumulus API, you should also upgrade your Cumulus Dashboard deployment to use the version of the Cumulus API matching the version of Cumulus to which you are migrating.

- + \ No newline at end of file diff --git a/docs/development/forked-pr/index.html b/docs/development/forked-pr/index.html index d1073d5c954..871f7b7f922 100644 --- a/docs/development/forked-pr/index.html +++ b/docs/development/forked-pr/index.html @@ -5,13 +5,13 @@ Issuing PR From Forked Repos | Cumulus Documentation - +
Version: v15.0.2

Issuing PR From Forked Repos

Fork the Repo

  • Fork the Cumulus repo
  • Create a new branch from the branch you'd like to contribute to
  • If an issue doesn't already exist, submit one (see above)

Create a Pull Request

Reviewing PRs from Forked Repos

Upon submission of a pull request, the Cumulus development team will review the code.

Once the code passes an initial review, the team will run the CI tests against the proposed update.

The request will then either be merged, declined, or an adjustment to the code will be requested via the issue opened with the original PR request.

PRs from forked repos cannot be directly merged to master. Cumulus reviewers must follow these steps before completing the review process (steps 1 and 2 are shown as shell commands after the list):

  1. Create a new branch:

      git checkout -b from-<name-of-the-branch> master
  2. Push the new branch to GitHub

  3. Change the destination of the forked PR to the new branch that was just pushed

    Screenshot of Github interface showing how to change the base branch of a pull request

  4. After code review and approval, merge the forked PR to the new branch.

  5. Create a PR for the new branch to master.

  6. If the CI tests pass, merge the new branch to master and close the issue. If the CI tests do not pass, request an amended PR from the original author or resolve failures as appropriate.
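For reference, steps 1 and 2 can be run as the following shell commands, where the branch name is a placeholder:

git checkout -b from-<name-of-the-branch> master
git push origin from-<name-of-the-branch>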

- + \ No newline at end of file diff --git a/docs/development/integration-tests/index.html b/docs/development/integration-tests/index.html index 06688af016e..94e64c0ec2d 100644 --- a/docs/development/integration-tests/index.html +++ b/docs/development/integration-tests/index.html @@ -5,7 +5,7 @@ Integration Tests | Cumulus Documentation - + @@ -19,7 +19,7 @@ in the commit message.

If you create a new stack and want to be able to run integration tests against it in CI, you will need to add it to bamboo/select-stack.js.

- + \ No newline at end of file diff --git a/docs/development/quality-and-coverage/index.html b/docs/development/quality-and-coverage/index.html index 6fdf089f436..373252fc29b 100644 --- a/docs/development/quality-and-coverage/index.html +++ b/docs/development/quality-and-coverage/index.html @@ -5,7 +5,7 @@ Code Coverage and Quality | Cumulus Documentation - + @@ -23,7 +23,7 @@ here.

To run linting on the markdown files, run npm run lint-md.

Audit

This project uses audit-ci to run a security audit on the package dependency tree. This must pass prior to merge. The configured rules for audit-ci can be found here.

To execute an audit, run npm run audit.

- + \ No newline at end of file diff --git a/docs/development/release/index.html b/docs/development/release/index.html index 6f857b2f1ba..c7b2d18ef57 100644 --- a/docs/development/release/index.html +++ b/docs/development/release/index.html @@ -5,7 +5,7 @@ Versioning and Releases | Cumulus Documentation - + @@ -24,7 +24,7 @@ this is a backport and patch release on the 13.3.x series of releases. Updates that are included in the future will have a corresponding CHANGELOG entry in future releases..

Troubleshooting

Delete and regenerate the tag

To delete a published tag so you can re-tag, follow these steps:

git tag -d vMAJOR.MINOR.PATCH
git push -d origin vMAJOR.MINOR.PATCH

e.g.:
git tag -d v9.1.0
git push -d origin v9.1.0
- + \ No newline at end of file diff --git a/docs/docs-how-to/index.html b/docs/docs-how-to/index.html index e72ac70a419..fc5139fe5df 100644 --- a/docs/docs-how-to/index.html +++ b/docs/docs-how-to/index.html @@ -5,7 +5,7 @@ Cumulus Documentation: How To's | Cumulus Documentation - + @@ -13,7 +13,7 @@
Version: v15.0.2

Cumulus Documentation: How To's

Cumulus Docs Installation

Run a Local Server

Environment variables DOCSEARCH_APP_ID, DOCSEARCH_API_KEY and DOCSEARCH_INDEX_NAME must be set for search to work. At the moment, search is only truly functional on prod because that is the only website we have registered to be indexed with DocSearch (see below on search).

git clone git@github.com:nasa/cumulus
cd cumulus
npm run docs-install
npm run docs-serve
⚠️ Note: docs-build will build the documents into website/build. docs-clear will clear the documents.

⚠️ Caution: Fix any broken links reported by Docusaurus if you see the following messages during build.

[INFO] Docusaurus found broken links!

Exhaustive list of all broken links found:

Cumulus Documentation

Our project documentation is hosted on GitHub Pages. The resources published to this website are housed in the docs/ directory at the top of the Cumulus repository. Those resources primarily consist of markdown files and images.

We use the open-source static website generator Docusaurus to build html files from our markdown documentation, add some organization and navigation, and provide some other niceties in the final website (search, easy templating, etc.).

Add a New Page and Sidebars

Adding a new page should be as simple as writing some documentation in markdown, placing it under the correct directory in the docs/ folder and adding some configuration values wrapped by --- at the top of the file. There are many files that already have this header which can be used as reference.

---
id: doc-unique-id # unique id for this document. This must be unique across ALL documentation under docs/
title: Title Of Doc # Whatever title you feel like adding. This will show up as the index to this page on the sidebar.
hide_title: false
---

Note: To have the new page show up in a sidebar the designated id must be added to a sidebar in the website/sidebars.js file. Docusaurus has an in depth explanation of sidebars here.

Versioning Docs

We lean heavily on Docusaurus for versioning. Their suggestions and walk-through can be found here. Docusaurus v2 uses a snapshot approach to documentation versioning: each versioned copy of the docs does not depend on any other version. We would like the documentation versions to match up directly with release versions; however, since a new set of versioned docs can take up a lot of repo space and requires maintenance, we suggest updating the existing versioned docs for minor releases when there are no significant functionality changes. Cumulus versioning is explained in the Versioning Docs.

Search on our documentation site is taken care of by DocSearch. We have been provided with an apiId, apiKey and an indexName by DocSearch that we include in our website/docusaurus.config.js file. The rest, indexing and actual searching, we leave to DocSearch. Our builds expect environment variables for these values to exist: DOCSEARCH_APP_ID, DOCSEARCH_API_KEY and DOCSEARCH_INDEX_NAME.

Add a new task

The tasks list in docs/tasks.md is generated from the list of task packages in the tasks folder. Do not edit the docs/tasks.md file directly.

Read more about adding a new task.

Editing the tasks.md header or template

Look at the bin/build-tasks-doc.js and bin/tasks-header.md files to edit the output of the tasks build script.

Editing diagrams

For some diagrams included in the documentation, the raw source is included in the docs/assets/raw directory to allow for easy updating in the future:

  • assets/interfaces.svg -> assets/raw/interfaces.drawio (generated using draw.io)

Deployment

The master branch is automatically built and deployed to the gh-pages branch. The gh-pages branch is served by GitHub Pages. Do not make edits to the gh-pages branch.

- + \ No newline at end of file diff --git a/docs/external-contributions/index.html b/docs/external-contributions/index.html index bb56f11ff45..987925d3f4b 100644 --- a/docs/external-contributions/index.html +++ b/docs/external-contributions/index.html @@ -5,13 +5,13 @@ External Contributions | Cumulus Documentation - +
Version: v15.0.2

External Contributions

Contributions to Cumulus may be made in the form of PRs to the repositories directly or through externally developed tasks and components. Cumulus is designed as an ecosystem that leverages Terraform deployments and AWS Step Functions to easily integrate external components.

This list may not be exhaustive and represents components that are open source, owned externally, and that have been tested with the Cumulus system. For more information and contributing guidelines, visit the respective GitHub repositories.

Distribution

The ASF Thin Egress App is used by Cumulus for distribution. TEA can be deployed with Cumulus or as part of other applications to distribute data.

Operational Cloud Recovery Archive (ORCA)

ORCA can be deployed with Cumulus to provide a customizable baseline for creating and managing operational backups.

Workflow Tasks

CNM

PO.DAAC provides two workflow tasks to be used with the Cloud Notification Mechanism (CNM) Schema: CNM to Granule and CNM Response.

See the CNM workflow data cookbook for an example of how these can be used in a Cumulus ingest workflow.

DMR++ Generation

GHRC has provided a DMR++ Generation workflow task. This task is meant to be used in conjunction with Cumulus' Hyrax Metadata Updates workflow task.

- + \ No newline at end of file diff --git a/docs/faqs/index.html b/docs/faqs/index.html index 31b36486058..f42a0636bdc 100644 --- a/docs/faqs/index.html +++ b/docs/faqs/index.html @@ -5,13 +5,13 @@ Frequently Asked Questions | Cumulus Documentation - +
Version: v15.0.2

Frequently Asked Questions

Below are answers to some commonly asked questions that can assist you when working with Cumulus.

General | Workflows | Integrators & Developers | Operators


General

What prerequisites are needed to setup Cumulus?
Answer: Here is a list of the tools and access that you will need in order to get started. For the up-to-date versions that we are using, please visit our Cumulus main README (https://github.com/nasa/cumulus) for details.
  • NVM for node versioning
  • AWS CLI
  • Bash
  • Docker (only required for testing)
  • docker-compose (only required for testing; install with pip install docker-compose)
  • Python
  • pipenv

Keep in mind you will need access to the AWS console and an Earthdata account before you can deploy Cumulus.

What is the preferred web browser for the Cumulus environment?

Answer: Our preferred web browser is the latest version of Google Chrome.

How do I deploy a new instance in Cumulus?

Answer: For steps on the Cumulus deployment process go to How to Deploy Cumulus.

Where can I find Cumulus release notes?

Answer: To get the latest information about updates to Cumulus go to Cumulus Versions.

How do I quickly troubleshoot an issue in Cumulus?

Answer: To troubleshoot and fix issues in Cumulus reference our recommended solutions in Troubleshooting Cumulus.

Where can I get support help?

Answer: The following options are available for assistance:

  • Cumulus: Users outside NASA should file a GitHub issue and users inside NASA should file a Cumulus JIRA ticket.
  • AWS: You can create a case in the AWS Support Center, accessible via your AWS Console.

For more information on how to submit an issue or contribute to Cumulus, follow our guidelines at Contributing.


Workflows

What is a Cumulus workflow?

Answer: A workflow is a provider-configured set of steps that describe the process to ingest data. Workflows are defined using AWS Step Functions. For more details, we suggest visiting the Workflows section.

How do I set up a Cumulus workflow?

Answer: You will need to create a provider, have an associated collection (add a new one), and generate a new rule first. Then you can set up a Cumulus workflow by following these steps here.

Where can I find a list of workflow tasks?

Answer: You can access a list of reusable tasks for Cumulus development at Cumulus Tasks.

Are there any third-party workflows or applications that I can use with Cumulus?

Answer: The Cumulus team works with various partners to help build a robust framework. You can visit our External Contributions section to see what other options are available to help you customize Cumulus for your needs.


Integrators & Developers

What is a Cumulus integrator?

Answer: Integrators are those who work within Cumulus and AWS for deployments and to manage workflows. They may perform the following functions:

  • Configure and deploy Cumulus to the AWS environment
  • Configure Cumulus workflows
  • Write custom workflow tasks
What are the steps if I run into an issue during deployment?

Answer: If you encounter an issue with your deployment go to the Troubleshooting Deployment guide.

Is Cumulus customizable and flexible?

Answer: Yes. Cumulus has a modular architecture that allows you to decide which components you want/need to deploy. These components are maintained as Terraform modules.

What are Terraform modules?

Answer: They are modules that are composed to create a Cumulus deployment, which gives integrators the flexibility to choose the components of Cumulus that they want/need. To view Cumulus-maintained modules or steps on how to create a module, go to Terraform modules.

Where do I find Terraform module variables?

Answer: Go here for a list of Cumulus maintained variables.

What are the common use cases that a Cumulus integrator encounters?

Answer: The following are some examples of possible use cases you may see:


Operators

What is a Cumulus operator?

Answer: Operators are those who ingest, archive, and troubleshoot datasets (called collections in Cumulus). Your daily activities might include, but are not limited to, the following:

  • Ingesting datasets
  • Maintaining historical data ingest
  • Starting and stopping data handlers
  • Managing collections
  • Managing provider definitions
  • Creating, enabling, and disabling rules
  • Investigating errors for granules and deleting or re-ingesting granules
  • Investigating errors in executions and isolating failed workflow step(s)
What are the common use cases that a Cumulus operator encounters?

Answer: The following are some examples of possible use cases you may see:

Explore more Cumulus operator best practices and how-tos in the dedicated Operator Docs.

Can you re-run a workflow execution in AWS?

Answer: Yes. For steps on how to re-run a workflow execution go to Re-running workflow executions in the Cumulus Operator Docs.

- + \ No newline at end of file diff --git a/docs/features/ancillary_metadata/index.html b/docs/features/ancillary_metadata/index.html index f2a6a38b869..6c039356e4a 100644 --- a/docs/features/ancillary_metadata/index.html +++ b/docs/features/ancillary_metadata/index.html @@ -5,7 +5,7 @@ Ancillary Metadata Export | Cumulus Documentation - + @@ -13,7 +13,7 @@
Version: v15.0.2

Ancillary Metadata Export

This feature utilizes the type key on a files object in a Cumulus granule. It uses the key to provide a mechanism where granule discovery, processing and other tasks can set and use this value to facilitate metadata export to CMR.
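For illustration, a files entry on a Cumulus granule carrying the type key might look like the following sketch (the bucket, key, and fileName values are placeholders):

{
  "bucket": "my-protected-bucket",
  "key": "path/to/granule-file.hdf",
  "fileName": "granule-file.hdf",
  "type": "data"
}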

Tasks setting type

Discover Granules

Uses the Collection type key to set the value for files on discovered granules in its output.

Parse PDR

Uses a task-specific mapping to map PDR 'FILE_TYPE' to a CNM type to set type on granules from the PDR.

CNMToCMALambdaFunction

Natively supports types that are included in incoming messages to a CNM Workflow.

Tasks using type

Move Granules

Uses the granule file type key to update UMM/ECHO 10 CMR files passed in as candidates to the task. This task adds the external facing URLs to the CMR metadata file based on the type. See the file tracking data cookbook for a detailed mapping. If a non-CNM type is specified, the task assumes it is a 'data' file.

- + \ No newline at end of file diff --git a/docs/features/backup_and_restore/index.html b/docs/features/backup_and_restore/index.html index 66cc8539c70..8cd756930e3 100644 --- a/docs/features/backup_and_restore/index.html +++ b/docs/features/backup_and_restore/index.html @@ -5,7 +5,7 @@ Cumulus Backup and Restore | Cumulus Documentation - + @@ -52,7 +52,7 @@ writing to the old cluster.

  • Set the snapshot_identifier variable to the snapshot you wish to restore from, and configure the module like a new deployment, with a unique cluster_identifier (see the sketch after these steps)

  • Deploy the module using terraform apply

  • Once deployed, verify the cluster has the expected data

  • Redeploy the data persistence and Cumulus deployments - You should not need to reconfigure either, as the secret ARN and the security group should not change, however double-check the configured values are as expected
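As a rough sketch only (the module source and the full set of required variables depend on your Cumulus release and deployment), the restore configuration might look like:

module "rds_cluster" {
  source = "<path or URL to the Cumulus RDS cluster module for your release>"

  # restore the chosen snapshot into a new, uniquely named cluster
  snapshot_identifier = "my-cluster-snapshot-identifier"
  cluster_identifier  = "my-restored-cumulus-cluster"

  # ...remaining variables configured as for a new deployment...
}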

  • - + \ No newline at end of file diff --git a/docs/features/dead_letter_archive/index.html b/docs/features/dead_letter_archive/index.html index 0510f981b8f..e89a1dcdc9e 100644 --- a/docs/features/dead_letter_archive/index.html +++ b/docs/features/dead_letter_archive/index.html @@ -5,13 +5,13 @@ Cumulus Dead Letter Archive | Cumulus Documentation - +
    Version: v15.0.2

    Cumulus Dead Letter Archive

    This documentation explains the Cumulus dead letter archive and associated functionality.

    DB Records DLQ Archive

    The Cumulus system contains a number of dead letter queues. Perhaps the most important system lambda function supported by a DLQ is the sfEventSqsToDbRecords lambda function which parses Cumulus messages from workflow executions to generate and write database records to the Cumulus database.

    As of Cumulus v9+, the dead letter queue for this lambda (named sfEventSqsToDbRecordsDeadLetterQueue) has been updated with a consumer lambda that will automatically write any incoming records to the S3 system bucket, under the path <stackName>/dead-letter-archive/sqs/. This will allow integrators and operators engaged in debugging missing records to inspect any Cumulus messages which failed to process and did not result in the successful creation of database records.

    Dead Letter Archive recovery

    In addition to the above, as of Cumulus v9+, the Cumulus API also contains a new endpoint at /deadLetterArchive/recoverCumulusMessages.

    Sending a POST request to this endpoint will trigger a Cumulus AsyncOperation that will attempt to reprocess (and if successful delete) all Cumulus messages in the dead letter archive, using the same underlying logic as the existing sfEventSqsToDbRecords. Otherwise, all Cumulus messages that fail to be reprocessed will be moved to a new archive location under the path <stackName>/dead-letter-archive/failed-sqs/<YYYY-MM-DD>.
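    A hedged example of invoking the endpoint with curl, assuming CUMULUS_API is your deployed Cumulus archive API root and TOKEN is a valid access token:

    curl -X POST \
      -H "Authorization: Bearer $TOKEN" \
      "$CUMULUS_API/deadLetterArchive/recoverCumulusMessages"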

    This endpoint may prove particularly useful when recovering from an extended or unexpected database outage, where messages failed to process due to the external outage and there is no essential malformation of each Cumulus message.

    - + \ No newline at end of file diff --git a/docs/features/dead_letter_queues/index.html b/docs/features/dead_letter_queues/index.html index 2d59a3b79fe..14e8a084a1f 100644 --- a/docs/features/dead_letter_queues/index.html +++ b/docs/features/dead_letter_queues/index.html @@ -5,13 +5,13 @@ Dead Letter Queues | Cumulus Documentation - +
    Version: v15.0.2

    Dead Letter Queues

    startSF SQS queue

    The workflow-trigger for the startSF queue has a Redrive Policy set up that directs any failed attempts to pull from the workflow start queue to an SQS Dead Letter Queue.

    This queue can then be monitored for failures to initiate a workflow. Please note that workflow failures will not show up in this queue, only repeated failure to trigger a workflow.

    Named Lambda Dead Letter Queues

    Cumulus provides configured Dead Letter Queues (DLQ) for non-workflow Lambdas (such as ScheduleSF) to capture Lambda failures for further processing.

    These DLQs are setup with the following configuration:

    receive_wait_time_seconds  = 20
    message_retention_seconds  = 1209600
    visibility_timeout_seconds = 60

    Default Lambda Configuration

    The following built-in Cumulus Lambdas are set up with DLQs to allow handling of process failures:

    • dbIndexer (Updates Elasticsearch)
    • JobsLambda (writes log outputs to Elasticsearch)
    • ScheduleSF (the SF Scheduler Lambda that places messages on the queue that is used to start workflows, see Workflow Triggers)
    • publishReports (Lambda that publishes messages to the SNS topics for execution, granule and PDR reporting)
    • reportGranules, reportExecutions, reportPdrs (Lambdas responsible for updating records based on messages in the queues published by publishReports)

    Troubleshooting/Utilizing messages in a Dead Letter Queue

    Ideally, an automated process should be configured to poll a dead letter queue and process its messages.

    To aid in manual troubleshooting, you can use the SQS Management Console to view messages available in the queues set up for a particular stack. The dead letter queues will have a Message Body containing the Lambda payload, as well as Message Attributes that reference both the error returned and a RequestID, which can be cross-referenced with the associated Lambda's CloudWatch logs for more information:
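    For example, messages and their attributes can be pulled from a dead letter queue with the AWS CLI (the queue URL below is a placeholder):

    aws sqs receive-message \
      --queue-url https://sqs.us-east-1.amazonaws.com/123456789012/prefix-ScheduleSFDeadLetterQueue \
      --max-number-of-messages 10 \
      --attribute-names All \
      --message-attribute-names All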

    Screenshot of the AWS SQS console showing how to view SQS message attributes

    - + \ No newline at end of file diff --git a/docs/features/distribution-metrics/index.html b/docs/features/distribution-metrics/index.html index 39336fec37d..f383a44148b 100644 --- a/docs/features/distribution-metrics/index.html +++ b/docs/features/distribution-metrics/index.html @@ -5,13 +5,13 @@ Cumulus Distribution Metrics | Cumulus Documentation - +
    Version: v15.0.2

    Cumulus Distribution Metrics

    It is possible to configure Cumulus and the Cumulus Dashboard to display information about the successes and failures of requests for data. This requires the Cumulus instance to deliver Cloudwatch Logs and S3 Server Access logs to an ELK stack.

    ESDIS Metrics in NGAP

    Work with the ESDIS metrics team to set up permissions and access to forward Cloudwatch Logs to a shared AWS:Logs:Destination as well as transferring your S3 Server Access logs to a metrics team bucket.

    The metrics team has taken care of setting up logstash to ingest the files that get delivered to their bucket into their Elasticsearch instance.

    Once Cumulus has been configured to deliver Cloudwatch logs to the ESDIS Metrics team, you can use the Elasticsearch indexes to create the necessary target patterns on the dashboard. These are often <daac>-cloudwatch-cumulus-<env>-* and <daac>-distribution-<env>-*, but they will depend on your specific Elasticsearch setup.

    Cumulus / ESDIS Metrics distribution system

    Architecture diagram showing how logs are replicated from a Cumulus instance to the ESDIS Metrics account and accessed by the Cumulus dashboard

    - + \ No newline at end of file diff --git a/docs/features/execution_payload_retention/index.html b/docs/features/execution_payload_retention/index.html index dcfe0f06f35..edf353fe04f 100644 --- a/docs/features/execution_payload_retention/index.html +++ b/docs/features/execution_payload_retention/index.html @@ -5,13 +5,13 @@ Execution Payload Retention | Cumulus Documentation - +
    Version: v15.0.2

    Execution Payload Retention

    In addition to CloudWatch logs and AWS Step Function API records, Cumulus automatically stores the initial and 'final' (the last update to the execution record) payload values as part of the Execution record in your RDS database and Elasticsearch.

    This allows access via the API (or optionally direct DB/Elasticsearch querying) for debugging/reporting purposes. The data is stored in the "originalPayload" and "finalPayload" fields.

    Payload record cleanup

    To reduce storage requirements, a CloudWatch rule ({stack-name}-dailyExecutionPayloadCleanupRule) triggering a daily run of the provided cleanExecutions lambda has been added. This lambda will remove all 'completed' and 'non-completed' payload records in the database that are older than the specified configuration.

    Configuration

    The following configuration flags have been made available in the cumulus module. They may be overridden in your deployment's instance of the cumulus module by adding the following configuration options:

    daily_execution_payload_cleanup_schedule_expression (string)

    This configuration option sets the execution times for this Lambda to run, using a Cloudwatch cron expression.

    Default value is "cron(0 4 * * ? *)".

    complete_execution_payload_timeout_disable (bool)

    This configuration option, when set to true, will disable all cleanup of completed execution payloads.

    Default value is false.

    complete_execution_payload_timeout (number)

    This flag defines the cleanup threshold for executions with a 'completed' status in days. Records with updatedAt values older than this with payload information will have that information removed.

    Default value is 10.

    non_complete_execution_payload_timeout_disable (bool)

    This configuration option, when set to true, will disable all cleanup of "non-complete" (any status other than completed) execution payloads.

    Default value is false.

    non_complete_execution_payload_timeout (number)

    This flag defines the cleanup threshold for executions with a status other than 'complete' in days. Records with updatedAt values older than this with payload information will have that information removed.

    Default value is 30 days.

    • complete_execution_payload_disable/non_complete_execution_payload_disable

    These flags (true/false) determine if the cleanup script's logic for 'complete' and 'non-complete' executions will run. Default value is false for both.
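    A sketch of overriding these options in a deployment's instance of the cumulus module, assuming the variable names reconstructed above (the values shown are the documented defaults; include only the options you want to change):

    module "cumulus" {
      # ... other variables ...

      daily_execution_payload_cleanup_schedule_expression = "cron(0 4 * * ? *)"
      complete_execution_payload_timeout_disable          = false
      complete_execution_payload_timeout                  = 10
      non_complete_execution_payload_timeout_disable      = false
      non_complete_execution_payload_timeout              = 30
    }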

    - + \ No newline at end of file diff --git a/docs/features/logging-esdis-metrics/index.html b/docs/features/logging-esdis-metrics/index.html index a006e3f8e88..538cb2dc085 100644 --- a/docs/features/logging-esdis-metrics/index.html +++ b/docs/features/logging-esdis-metrics/index.html @@ -5,13 +5,13 @@ Writing logs for ESDIS Metrics | Cumulus Documentation - +
    Version: v15.0.2

    Writing logs for ESDIS Metrics

    Note: This feature is only available for Cumulus deployments in NGAP environments.

    Prerequisite: You must configure your Cumulus deployment to deliver your logs to the correct shared logs destination for ESDIS metrics.

    Log messages delivered to the ESDIS metrics logs destination conforming to an expected format will be automatically ingested and parsed to enable helpful searching/filtering of your logs via the ESDIS metrics Kibana dashboard.

    Expected log format

    The ESDIS metrics pipeline expects a log message to be a JSON string representation of an object (dict in Python or map in Java). An example log message might look like:

    {
      "level": "info",
      "executions": "arn:aws:states:us-east-1:000000000000:execution:MySfn:abcd1234",
      "granules": "[\"granule-1\",\"granule-2\"]",
      "message": "hello world",
      "sender": "greetingFunction",
      "stackName": "myCumulus",
      "timestamp": "2018-10-19T19:12:47.501Z"
    }

    A log message can contain the following properties:

    • executions: The AWS Step Function execution name in which this task is executing, if any
    • granules: A JSON string of the array of granule IDs being processed by this code, if any
    • level: A string identifier for the type of message being logged. Possible values:
      • debug
      • error
      • fatal
      • info
      • warn
      • trace
    • message: String containing your actual log message
    • parentArn: The parent AWS Step Function execution ARN that triggered the current execution, if any
    • sender: The name of the resource generating the log message (e.g. a library name, a Lambda function name, an ECS activity name)
    • stackName: The unique prefix for your Cumulus deployment
    • timestamp: An ISO-8601 formatted timestamp
    • version: The version of the resource generating the log message, if any

    None of these properties are explicitly required for ESDIS metrics to parse your log correctly. However, a log without a message has no informational content. And having level, sender, and timestamp properties is very useful for filtering your logs. Including a stackName in your logs is helpful as it allows you to distinguish between logs generated by different deployments.

    Using Cumulus Message Adapter libraries

    If you are writing a custom task that is integrated with the Cumulus Message Adapter, then some of the language-specific client libraries can be used to write logs compatible with ESDIS metrics.

    The usage of each library differs slightly, but in general a logger is initialized with a Cumulus workflow message to determine the contextual information for the task (e.g. granules, executions). Then, after the logger is initialized, writing logs only requires specifying a message, but the logged output will include the contextual information as well.

    Writing logs using custom code

    Any code that produces logs matching the expected log format can be processed by ESDIS metrics.

    Node.js

    Cumulus core provides a @cumulus/logger library that writes logs in the expected format for ESDIS metrics.
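    A minimal Node.js sketch using @cumulus/logger (the sender value is an arbitrary example):

    const Logger = require('@cumulus/logger');

    // fields passed to the constructor, such as sender, are included in every log line
    const log = new Logger({ sender: '@cumulus/my-task' });

    log.info('hello world');
    log.error('something went wrong');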

    - + \ No newline at end of file diff --git a/docs/features/replay-archived-sqs-messages/index.html b/docs/features/replay-archived-sqs-messages/index.html index 74e6edfbcaa..c929625e15d 100644 --- a/docs/features/replay-archived-sqs-messages/index.html +++ b/docs/features/replay-archived-sqs-messages/index.html @@ -5,14 +5,14 @@ How to replay SQS messages archived in S3 | Cumulus Documentation - +
    Version: v15.0.2

    How to replay SQS messages archived in S3

    Context

    Cumulus archives all incoming SQS messages to S3 and removes messages once they have been processed. Unprocessed messages are archived at the path: ${stackName}/archived-incoming-messages/${queueName}/${messageId}

    Replay SQS messages endpoint

    The Cumulus API has added a new endpoint, /replays/sqs. This endpoint allows you to start a replay operation to requeue all archived SQS messages by queueName and returns an AsyncOperationId for operation status tracking.

    Start replaying archived SQS messages

    In order to start a replay, you must perform a POST request to the replays/sqs endpoint.

    The required and optional fields that should be part of the body of this request are documented below.

    • queueName (string): Any valid SQS queue name (not ARN)
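    For example, a replay of archived messages for a queue could be started with a request like the following (the API root, token, and queue name are placeholders):

    curl -X POST \
      -H "Authorization: Bearer $TOKEN" \
      -H "Content-Type: application/json" \
      -d '{"queueName": "prefix-startSF"}' \
      "$CUMULUS_API/replays/sqs"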

    Status tracking

    A successful response from the /replays/sqs endpoint will contain an asyncOperationId field. Use this ID with the /asyncOperations endpoint to track the status.

    - + \ No newline at end of file diff --git a/docs/features/replay-kinesis-messages/index.html b/docs/features/replay-kinesis-messages/index.html index cd416ac68ad..d4ec15bd6a8 100644 --- a/docs/features/replay-kinesis-messages/index.html +++ b/docs/features/replay-kinesis-messages/index.html @@ -5,7 +5,7 @@ How to replay Kinesis messages after an outage | Cumulus Documentation - + @@ -13,7 +13,7 @@
    Version: v15.0.2

    How to replay Kinesis messages after an outage

    After a period of outage, it may be necessary for a Cumulus operator to reprocess or 'replay' messages that arrived on an AWS Kinesis Data Stream but did not trigger an ingest. This document serves as an outline on how to start a replay operation, and how to perform status tracking. Cumulus supports replay of all Kinesis messages on a stream (subject to the normal RetentionPeriod constraints), or all messages within a given time slice delimited by start and end timestamps.

    As Kinesis has no comparable field to e.g. the SQS ReceiveCount on its records, Cumulus cannot tell which messages within a given time slice have never been processed, and cannot guarantee only missed messages will be processed. Users will have to rely on duplicate handling or some other method of identifying messages that should not be processed within the time slice.

    NOTE: This operation flow effectively changes only the trigger mechanism for Kinesis ingest notifications. The existence of valid Kinesis-type rules and all other normal requirements for the triggering of ingest via Kinesis still apply.

    Replays endpoint

    Cumulus has added a new endpoint to its API, /replays. This endpoint allows you to start replay operations and returns an AsyncOperationId for operation status tracking.

    Start a replay

    In order to start a replay, you must perform a POST request to the replays endpoint.

    The required and optional fields that should be part of the body of this request are documented below.

    NOTE: As the endTimestamp relies on a comparison with the Kinesis server-side ApproximateArrivalTimestamp, and given that there is no documented level of accuracy for the approximation, it is recommended that the endTimestamp include some amount of buffer to allow for slight discrepancies. If tolerable, the same is recommended for the startTimestamp, although it is used differently and is less vulnerable to discrepancies, since a server-side arrival timestamp should never be earlier than the client-side request timestamp.

    • type (string, required): Currently only accepts kinesis.
    • kinesisStream (string, required for type kinesis): Any valid Kinesis stream name (not ARN).
    • kinesisStreamCreationTimestamp (*, optional): Any input valid for a JS Date constructor. For reasons to use this field see AWS documentation on StreamCreationTimestamp.
    • endTimestamp (*, optional): Any input valid for a JS Date constructor. Messages newer than this timestamp will be skipped.
    • startTimestamp (*, optional): Any input valid for a JS Date constructor. Messages will be fetched from the Kinesis stream starting at this timestamp. Ignored if it is further in the past than the stream's retention period.
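    As a hedged example, a replay of a time slice of a Kinesis stream might be started with a request like the following (all values are placeholders):

    curl -X POST \
      -H "Authorization: Bearer $TOKEN" \
      -H "Content-Type: application/json" \
      -d '{
            "type": "kinesis",
            "kinesisStream": "prefix-KinesisStream",
            "startTimestamp": "2023-01-01T00:00:00Z",
            "endTimestamp": "2023-01-02T00:00:00Z"
          }' \
      "$CUMULUS_API/replays"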

    Status tracking

    A successful response from the /replays endpoint will contain an asyncOperationId field. Use this ID with the /asyncOperations endpoint to track the status.

    - + \ No newline at end of file diff --git a/docs/features/reports/index.html b/docs/features/reports/index.html index d8f6b6f1f98..b16b25cef32 100644 --- a/docs/features/reports/index.html +++ b/docs/features/reports/index.html @@ -5,7 +5,7 @@ Reconciliation Reports | Cumulus Documentation - + @@ -19,7 +19,7 @@ report generation. The data buckets will include any buckets in your Cumulus buckets configuration that have type public, protected or private.
    - + \ No newline at end of file diff --git a/docs/getting-started/index.html b/docs/getting-started/index.html index 38f007b52e0..b96c7a530ef 100644 --- a/docs/getting-started/index.html +++ b/docs/getting-started/index.html @@ -5,13 +5,13 @@ Getting Started | Cumulus Documentation - +
    Version: v15.0.2

    Getting Started

    Overview | Quick Tutorials | Helpful Tips

    Overview

    This serves as a guide for new Cumulus users to deploy and learn how to use Cumulus. Here you will learn what you need in order to complete any prerequisites, what Cumulus is and how it works, and how to successfully navigate and deploy a Cumulus environment.

    What is Cumulus

    Cumulus is an open source set of components for creating cloud-based data ingest, archive, distribution, and management systems designed for NASA's future Earth Science data streams.

    Who uses Cumulus

    Data integrators/developers and operators across projects not limited to NASA use Cumulus for their daily work functions.

    Cumulus Roles

    Integrator/Developer

    Cumulus integrators/developers are those who work within Cumulus and AWS for deployments and to manage workflows.

    Operator

    Cumulus operators are those who work within Cumulus to ingest/archive data and manage collections.

    Role Guides

    As a developer, integrator, or operator, you will need to set up your environments to work in Cumulus. The following docs can get you started in your role specific activities.

    What is a Cumulus Data Type

    In Cumulus, we have the following types of data that you can create and manage:

    • Collections
    • Granules
    • Providers
    • Rules
    • Workflows
    • Executions
    • Reports

    For details on how to create or manage data types go to Data Management Types.


    Quick Tutorials

    Deployment & Configuration

    Cumulus is deployed to an AWS account, so you must have access to deploy resources to an AWS account to get started.

    1. Set up Git Secrets

    To ensure your AWS access keys and passwords are protected as you submit commits we recommend setting up Git Secrets.

    2. Deploy Cumulus Core and Cumulus Dashboard to AWS

    Follow the deployment instructions to deploy Cumulus to your AWS account.

    3. Configure and Run the HelloWorld Workflow

    If you have deployed using the cumulus-template-deploy repository, you have a HelloWorld workflow deployed to your Cumulus backend.

    You can see your deployed workflows on the Workflows page of your Cumulus dashboard.

    Configure a collection and provider using the setup guidance on the Cumulus dashboard.

    Then create a rule to trigger your HelloWorld workflow. You can select a rule type of one time.

    Navigate to the Executions page of the dashboard to check the status of your workflow execution.

    4. Configure a Custom Workflow

    See Developing a custom workflow documentation for adding a new workflow to your deployment.

    There are plenty of workflow examples using Cumulus tasks here. The Data Cookbooks provide a more in-depth look at some of these more advanced workflows and their configurations.

    There is a list of Cumulus tasks already included in your deployment here.

    After configuring your workflow and redeploying, you can configure and run your workflow using the same steps as in step 3.


    Helpful Tips

    Here are some useful tips to keep in mind when deploying or working in Cumulus.

    Integrator/Developer

    • Versioning and Releases: This documentation gives information on our global versioning approach. We suggest upgrading to the supported version for Cumulus, Cumulus dashboard, and Thin Egress App (TEA).
    • Cumulus Developer Documentation: We suggest that you read through and reference this resource for development best practices in Cumulus.
    • Cumulus Deployment: We will guide you on how to manually deploy a new instance of Cumulus. In this reference, you will learn how to install Terraform, create an AWS S3 bucket, configure a compatible database, and create a Lambda layer.
    • Terraform Best Practices: This will help guide you through your Terraform configuration and Cumulus deployment.

    For an introduction about Terraform go here.

    Operator

    Troubleshooting

    Troubleshooting: Some suggestions to help you troubleshoot and solve issues you may encounter.

    Resources

    - + \ No newline at end of file diff --git a/docs/glossary/index.html b/docs/glossary/index.html index 938f9f415a7..bb11d6dded8 100644 --- a/docs/glossary/index.html +++ b/docs/glossary/index.html @@ -5,13 +5,13 @@ Glossary | Cumulus Documentation - +
    Version: v15.0.2

    Glossary

    AWS Glossary

    For terms/items from Amazon/AWS not mentioned in this glossary, please refer to the AWS Glossary.

    Cumulus Glossary of Terms

    API Gateway

    Refers to AWS's API Gateway. Used by the Cumulus API.

    ARN

    Refers to an AWS "Amazon Resource Name".

    For more info, see the AWS documentation.

    AWS

    See: Amazon Web Services documentation.

    AWS Lambda/Lambda Function

    AWS's 'serverless' option. Allows the running of code without provisioning a service or managing server/ECS instances/etc.

    For more information, see the AWS Lambda documentation.

    AWS Access Keys

    Access credentials that give you access to AWS to act as an IAM user programmatically or from the command line.

    For more information, see the AWS IAM Documentation.

    Bucket

    An Amazon S3 cloud storage resource.

    For more information, see the AWS Bucket Documentation.

    CloudFormation

    An AWS service that allows you to define and manage cloud resources as a preconfigured block.

    For more information, see the AWS CloudFormation User Guide.

    Cloudformation Template

    A template that defines an AWS CloudFormation stack.

    For more information, see the AWS intro page.

    Cloudwatch

    AWS service that provides logging and metrics collection for the various cloud resources you have in AWS.

    For more information, see the AWS User Guide.

    Cloud Notification Mechanism (CNM)

    An interface mechanism to support cloud-based ingest messaging. For more information, see PO.DAAC's CNM Schema.

    Common Metadata Repository (CMR)

    "A high-performance, high-quality, continuously evolving metadata system that catalogs Earth Science data and associated service metadata records". For more information, see NASA's CMR page.

    Collection (Cumulus)

    Cumulus Collections are logical sets of data objects of the same data type and version.

    For more information, see Collections - Data Management Types.

    Cumulus Message Adapter (CMA)

    A library designed to help task developers integrate step function tasks into a Cumulus workflow by adapting task input/output into the Cumulus Message format.

    For more information, see CMA workflow reference page.

    Distributed Active Archive Center (DAAC)

    Refers to a specific organization that's part of NASA's distributed system of archive centers. For more information see EOSDIS's DAAC page.

    Dead Letter Queue (DLQ)

    This refers to Amazon SQS Dead-Letter Queues - these SQS queues are specifically configured to capture failed messages from other services/SQS queues/etc to allow for processing of failed messages.

    For more on DLQs, see the Amazon Documentation and the Cumulus DLQ feature page.

    Developer

    Those who set up deployment and workflow management for Cumulus. Sometimes referred to as an integrator. See integrator.

    ECS

    Amazon's Elastic Container Service. Used in Cumulus by workflow steps that require more flexibility than Lambda can provide.

    For more information, see AWS's developer guide.

    ECS Activity

    An ECS instance run via a Step Function.

    Execution (Cumulus)

    A Cumulus execution refers to a single execution of a (Cumulus) Workflow.

    GIBS

    Global Imagery Browse Services

    Granule

    A granule is the smallest aggregation of data that can be independently managed (described, inventoried, and retrieved). Granules are always associated with a collection, which is a grouping of granules. A granule is a grouping of data files.

    IAM

    AWS Identity and Access Management.

    For more information, see AWS IAMs.

    Integrator/Developer

    Those who work within Cumulus and AWS for deployments and to manage workflows.

    Kinesis

    Amazon's platform for streaming data on AWS.

    See AWS Kinesis for more information.

    Lambda

    AWS's cloud service that lets you run code without provisioning or managing servers.

    For more information, see AWS's lambda page.

    Module (Terraform)

    Refers to a terraform module.

    Node

    See node.js.

    Node Package Manager (npm)

    Node package manager. Often referred to as npm.

    For more information, see npm.

    Operator

    Those who work within Cumulus to ingest/archive data and manage collections.

    PDR

    "Polling Delivery Mechanism" used in "DAAC Ingest" workflows.

    For more information, see nasa.gov.

    Packages (npm)

    npm-hosted Node.js packages. Cumulus packages can be found on npm's site here.

    Provider

    Data source that generates and/or distributes data for Cumulus workflows to act upon.

    For more information, see the Cumulus documentation.

    Rule

    Rules are configurable scheduled events that trigger workflows based on various criteria.

    For more information, see the Cumulus Rules documentation.

    S3

    Amazon's Simple Storage Service provides data object storage in the cloud. Used in Cumulus to store configuration, data, and more.

    For more information, see AWS's S3 page.

    SIPS

    Science Investigator-led Processing Systems. In the context of DAAC ingest, this refers to data producers/providers.

    For more information, see nasa.gov.

    SNS

    Amazon's Simple Notification Service provides a messaging service that allows publication of and subscription to events. Used in Cumulus to trigger workflow events, track event failures, and others.

    For more information, see AWS's SNS page.

    SQS

    Amazon's Simple Queue Service.

    For more information, see AWS's SQS page.

    Stack

    A collection of AWS resources you can manage as a single unit.

    In the context of Cumulus, this refers to a deployment of the cumulus and data-persistence modules that is managed by Terraform.

    Step Function

    AWS's web service that allows you to compose complex workflows as a state machine comprised of tasks (Lambdas, activities hosted on EC2/ECS, some AWS service APIs, etc). See AWS's Step Function Documentation for more information. In the context of Cumulus these are the underlying AWS service used to create Workflows.

    Terraform

    Terraform is the tool that you will use for deployment and configuration of your Cumulus environment.

    Workflows

    Workflows are comprised of one or more AWS Lambda Functions and ECS Activities to discover, ingest, process, manage and archive data.

    - + \ No newline at end of file diff --git a/docs/index.html b/docs/index.html index a6ae276ae37..e5e3b1f6f97 100644 --- a/docs/index.html +++ b/docs/index.html @@ -5,13 +5,13 @@ Introduction | Cumulus Documentation - +
    Version: v15.0.2

    Introduction

    This Cumulus project seeks to address the existing need for a “native” cloud-based data ingest, archive, distribution, and management system that can be used for all future Earth Observing System Data and Information System (EOSDIS) data streams via the development and implementation of Cumulus. The term “native” implies that the system will leverage all components of a cloud infrastructure provided by the vendor for efficiency (in terms of both processing time and cost). Additionally, Cumulus will operate on future data streams involving satellite missions, aircraft missions, and field campaigns.

    This documentation includes guidelines, examples, and source code docs. It is accessible at https://nasa.github.io/cumulus.


    Get To Know Cumulus

    • Getting Started - here - If you are new to Cumulus we suggest that you begin with this section to help you understand and work in the environment.
    • General Cumulus Documentation - here <- you're here

    Cumulus Reference Docs

    • Cumulus API Documentation - here
    • Cumulus Developer Documentation - here - READMEs throughout the main repository.
    • Data Cookbooks - here

    Auxiliary Guides

    • Integrator Guide - here
    • Operator Docs - here

    Contributing

    Please refer to: https://github.com/nasa/cumulus/blob/master/CONTRIBUTING.md for information. We thank you in advance.

    - + \ No newline at end of file diff --git a/docs/integrator-guide/about-int-guide/index.html b/docs/integrator-guide/about-int-guide/index.html index f73cbbf0b9f..a17e5adbc9f 100644 --- a/docs/integrator-guide/about-int-guide/index.html +++ b/docs/integrator-guide/about-int-guide/index.html @@ -5,13 +5,13 @@ About Integrator Guide | Cumulus Documentation - +
    Version: v15.0.2

    About Integrator Guide

    Purpose

    The Integrator Guide is intended to supplement the Cumulus documentation and Data Cookbooks. This content is for Cumulus integrators who are either new to the project or need a step-by-step resource to help them along.

    What Is A Cumulus Integrator

    Cumulus integrators are those who work within Cumulus and AWS for deployments and to manage workflows. They may perform the following functions:

    • Configure and deploy Cumulus to the AWS environment
    • Configure Cumulus workflows
    • Write custom workflow tasks
    - + \ No newline at end of file diff --git a/docs/integrator-guide/int-common-use-cases/index.html b/docs/integrator-guide/int-common-use-cases/index.html index 0c6d01489d2..8834983ca16 100644 --- a/docs/integrator-guide/int-common-use-cases/index.html +++ b/docs/integrator-guide/int-common-use-cases/index.html @@ -5,13 +5,13 @@ Integrator Common Use Cases | Cumulus Documentation - + - + \ No newline at end of file diff --git a/docs/integrator-guide/workflow-add-new-lambda/index.html b/docs/integrator-guide/workflow-add-new-lambda/index.html index 98333dc8f1e..996d1a32435 100644 --- a/docs/integrator-guide/workflow-add-new-lambda/index.html +++ b/docs/integrator-guide/workflow-add-new-lambda/index.html @@ -5,13 +5,13 @@ Workflow - Add New Lambda | Cumulus Documentation - +
    Version: v15.0.2

    Workflow - Add New Lambda

    You can develop a workflow task in AWS Lambda or Elastic Container Service (ECS). AWS ECS requires Docker. For a list of tasks to use go to our Cumulus Tasks page.

    The following steps will help you as you write a new Lambda that integrates with a Cumulus workflow. They will also aid your understanding of the Cumulus Message Adapter (CMA) process.

    Steps

    1. Define New Lambda in Terraform
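      A hedged Terraform sketch for this step; the handler, runtime, role, and naming shown are assumptions that should be adapted to your task and deployment:

      resource "aws_lambda_function" "my_new_task" {
        function_name = "${var.prefix}-MyNewTask"
        filename      = "my-new-task.zip"
        handler       = "index.handler"
        runtime       = "nodejs16.x"
        role          = var.lambda_processing_role_arn # an IAM role with the permissions your task needs
        timeout       = 300
        memory_size   = 256
      }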

    2. Add Task in JSON Object

      For details on how to set up a workflow via CMA go to the CMA Tasks: Message Flow.

      You will need to assign input and output for the new task and follow the CMA contract here. This contract defines how libraries should call the cumulus-message-adapter to integrate a task into an existing Cumulus Workflow.
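      A hedged sketch of adding the new task as a state in a workflow's Step Function definition, using the CMA parameter wrapper (the state names, task ARN reference, and task_config contents are assumptions):

      "MyNewTask": {
        "Type": "Task",
        "Resource": "${my_new_task_arn}",
        "Parameters": {
          "cma": {
            "event.$": "$",
            "task_config": {
              "provider": "{$.meta.provider}",
              "collection": "{$.meta.collection}"
            }
          }
        },
        "Next": "NextStep"
      }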

    3. Verify New Task

      Check the updated workflow in AWS and in Cumulus.

    - + \ No newline at end of file diff --git a/docs/integrator-guide/workflow-ts-failed-step/index.html b/docs/integrator-guide/workflow-ts-failed-step/index.html index 4f165fe5645..56b0665c516 100644 --- a/docs/integrator-guide/workflow-ts-failed-step/index.html +++ b/docs/integrator-guide/workflow-ts-failed-step/index.html @@ -5,13 +5,13 @@ Workflow - Troubleshoot Failed Step(s) | Cumulus Documentation - +
    Version: v15.0.2

    Workflow - Troubleshoot Failed Step(s)

    Steps

    1. Locate Step
    • Go to Cumulus dashboard
    • Find the granule
    • Go to Executions to determine the failed step
    2. Investigate in Cloudwatch
    • Go to Cloudwatch
    • Locate lambda
    • Search Cloudwatch logs
    3. Recreate Error

      In your sandbox environment, try to recreate the error.

    4. Resolution

    - + \ No newline at end of file diff --git a/docs/interfaces/index.html b/docs/interfaces/index.html index 86e433882ff..d8ca78cce42 100644 --- a/docs/interfaces/index.html +++ b/docs/interfaces/index.html @@ -5,13 +5,13 @@ Interfaces | Cumulus Documentation - +
    Version: v15.0.2

    Interfaces

    Cumulus has multiple interfaces that allow interaction with discrete components of the system, such as starting workflows via SNS/Kinesis/SQS, manually queueing workflow start messages, submitting SNS notifications for completed workflows, and the many operations allowed by the Cumulus API.

    The diagram below illustrates the workflow process in detail and the various interfaces that allow starting of workflows, reporting of workflow information, and database create operations that occur when a workflow reporting message is processed. For interfaces with expected input or output schemas, details are provided below.

    Architecture diagram showing the interfaces for triggering and reporting of Cumulus workflow executions

    Workflow triggers and queuing

    Kinesis stream

    As a Kinesis stream is consumed by the messageConsumer Lambda to queue workflow executions, the incoming event is validated against this consumer schema by the ajv package.

    SQS queue for executions

    The messages put into the SQS queue for executions should conform to the Cumulus message format.

    Workflow executions

    See the documentation on Cumulus workflows.

    Workflow reporting

    SNS reporting topics

    For granule and PDR reporting, the topics will only receive data if the Cumulus workflow execution message meets the following criteria:

    • Granules - workflow message contains granule data in payload.granules
    • PDRs - workflow message contains PDR data in payload.pdr

    The messages published to the SNS reporting topics for executions and PDRs and the record property in the messages published to the granules SNS topic should conform to the model schema for each data type.

    Further detail on workflow reporting and how to interact with these interfaces can be found in the workflow notifications data cookbook.

    Cumulus API

    See the Cumulus API documentation.

    - + \ No newline at end of file diff --git a/docs/next/adding-a-task/index.html b/docs/next/adding-a-task/index.html index 81a58ebd428..5084daddb25 100644 --- a/docs/next/adding-a-task/index.html +++ b/docs/next/adding-a-task/index.html @@ -5,13 +5,13 @@ Contributing a Task | Cumulus Documentation - +
    Version: Next

    Contributing a Task

    We're tracking reusable Cumulus tasks in this list and, if you've got one you'd like to share with others, you can add it!

    Right now we're focused on tasks distributed via npm, but are open to including others. For now the script that pulls all the data for each package only supports npm.

    The tasks.md file is generated in the build process

    The tasks list in docs/tasks.md is generated from the list of task package names from the tasks folder.

    caution

    Do not edit the docs/tasks.md file directly.

    - + \ No newline at end of file diff --git a/docs/next/api/index.html b/docs/next/api/index.html index 13ddab2d709..ec77f13b303 100644 --- a/docs/next/api/index.html +++ b/docs/next/api/index.html @@ -5,13 +5,13 @@ Cumulus API | Cumulus Documentation - +
    - + \ No newline at end of file diff --git a/docs/next/architecture/index.html b/docs/next/architecture/index.html index 459737cc72d..1566946a682 100644 --- a/docs/next/architecture/index.html +++ b/docs/next/architecture/index.html @@ -5,14 +5,14 @@ Architecture | Cumulus Documentation - +
    Version: Next

    Architecture

    Architecture

    Below, find a diagram with the components that comprise an instance of Cumulus.

    Architecture diagram of a Cumulus deployment

    This diagram details all of the major architectural components of a Cumulus deployment.

    While the diagram can feel complex, it can easily be broken down into several major components:

    Data Distribution

    End users can access data via Cumulus's distribution submodule, which includes ASF's Thin Egress Application. This provides authenticated data egress, temporary S3 links, and other statistics features.

    End user exposure of Cumulus's holdings is expected to be provided by an external service.

    For NASA use, this is assumed to be CMR in this diagram.

    Data ingest

    Workflows

    The core of the ingest and processing capabilities in Cumulus is built into the deployed AWS Step Function workflows. Cumulus rules trigger workflows via CloudWatch rules, Kinesis streams, SNS topics, or SQS queues. The workflows then run with a configured Cumulus message, utilizing built-in processes to report the status of granules, PDRs, executions, etc. to the Data Persistence components.

    Workflows can optionally report granule metadata to CMR, and workflow steps can report metrics information to a shared SNS topic, which could be subscribed to for near-real-time granule, execution, and PDR status. This could be used for metrics reporting using an external ELK stack, for example.

    Data persistence

    Cumulus entity state data is stored in a PostgreSQL compatible database, and is exported to an Elasticsearch instance for non-authoritative querying/state data for the API and other applications that require more complex queries.

    Data discovery

    Discovering data for ingest is handled via workflow step components using Cumulus provider and collection configurations and various triggers. Data can be ingested from AWS S3, FTP, HTTPS and more.

    Database

    Cumulus utilizes a user-provided PostgreSQL database backend. For improved API search query efficiency, Cumulus provides data replication to an Elasticsearch instance.

    PostgreSQL Database Schema Diagram

    ERD of the Cumulus Database

    Maintenance

    System maintenance personnel have access to manage ingest and various portions of Cumulus via an AWS API gateway, as well as the operator dashboard.

    Deployment Structure

    Cumulus is deployed via Terraform and is organized internally into two separate top-level modules, as well as several external modules.

    Cumulus

    The Cumulus module, which contains multiple internal submodules, deploys all of the Cumulus components that are not part of the Data Persistence portion of this diagram.

    Data persistence

    The data persistence module provides the Data Persistence portion of the diagram.

    Other modules

    Other modules are provided as artifacts on the release page for use by users configuring their own deployments and contain extracted subcomponents of the cumulus module. For more on these components, see the components documentation.

    For more on the specific structure, examples of use, how to deploy, and more, please see the deployment docs as well as the cumulus-template-deploy repo.

    - + \ No newline at end of file diff --git a/docs/next/category/about-cumulus/index.html b/docs/next/category/about-cumulus/index.html index 5edc6e4b39a..04d46401770 100644 --- a/docs/next/category/about-cumulus/index.html +++ b/docs/next/category/about-cumulus/index.html @@ -5,13 +5,13 @@ About Cumulus | Cumulus Documentation - +
    - + \ No newline at end of file diff --git a/docs/next/category/common-use-cases/index.html b/docs/next/category/common-use-cases/index.html index 5ff19047ad8..0cd8d0f7281 100644 --- a/docs/next/category/common-use-cases/index.html +++ b/docs/next/category/common-use-cases/index.html @@ -5,13 +5,13 @@ Common Use Cases | Cumulus Documentation - +
    - + \ No newline at end of file diff --git a/docs/next/category/configuration-1/index.html b/docs/next/category/configuration-1/index.html index 68ee6b1f09d..7eed2326cef 100644 --- a/docs/next/category/configuration-1/index.html +++ b/docs/next/category/configuration-1/index.html @@ -5,13 +5,13 @@ Configuration | Cumulus Documentation - +
    - + \ No newline at end of file diff --git a/docs/next/category/configuration/index.html b/docs/next/category/configuration/index.html index bf70dc33a9b..a69ad47ddf1 100644 --- a/docs/next/category/configuration/index.html +++ b/docs/next/category/configuration/index.html @@ -5,13 +5,13 @@ Configuration | Cumulus Documentation - +
    Version: Next

    Configuration

    - + \ No newline at end of file diff --git a/docs/next/category/cookbooks/index.html b/docs/next/category/cookbooks/index.html index cfcda9c2a34..f993643a731 100644 --- a/docs/next/category/cookbooks/index.html +++ b/docs/next/category/cookbooks/index.html @@ -5,13 +5,13 @@ Cookbooks | Cumulus Documentation - +
    Version: Next

    Cookbooks

    - + \ No newline at end of file diff --git a/docs/next/category/cumulus-development/index.html b/docs/next/category/cumulus-development/index.html index 6c5bca7cf3b..1455a8fd828 100644 --- a/docs/next/category/cumulus-development/index.html +++ b/docs/next/category/cumulus-development/index.html @@ -5,13 +5,13 @@ Cumulus Development | Cumulus Documentation - +
    - + \ No newline at end of file diff --git a/docs/next/category/deployment/index.html b/docs/next/category/deployment/index.html index ccbee0fb610..0d34a89e923 100644 --- a/docs/next/category/deployment/index.html +++ b/docs/next/category/deployment/index.html @@ -5,13 +5,13 @@ Cumulus Deployment | Cumulus Documentation - +
    - + \ No newline at end of file diff --git a/docs/next/category/development/index.html b/docs/next/category/development/index.html index b161777b76d..8331eec0aee 100644 --- a/docs/next/category/development/index.html +++ b/docs/next/category/development/index.html @@ -5,13 +5,13 @@ Development | Cumulus Documentation - +
    - + \ No newline at end of file diff --git a/docs/next/category/external-contributions/index.html b/docs/next/category/external-contributions/index.html index 30398103ad8..5a73247802d 100644 --- a/docs/next/category/external-contributions/index.html +++ b/docs/next/category/external-contributions/index.html @@ -5,13 +5,13 @@ External Contributions | Cumulus Documentation - +
    - + \ No newline at end of file diff --git a/docs/next/category/features/index.html b/docs/next/category/features/index.html index 9826a62c220..fe98e4016be 100644 --- a/docs/next/category/features/index.html +++ b/docs/next/category/features/index.html @@ -5,13 +5,13 @@ Features | Cumulus Documentation - +
    Version: Next

    Features

    📄️ How to replay Kinesis messages after an outage

    After a period of outage, it may be necessary for a Cumulus operator to reprocess or 'replay' messages that arrived on an AWS Kinesis Data Stream but did not trigger an ingest. This document serves as an outline on how to start a replay operation, and how to perform status tracking. Cumulus supports replay of all Kinesis messages on a stream (subject to the normal RetentionPeriod constraints), or all messages within a given time slice delimited by start and end timestamps.

    - + \ No newline at end of file diff --git a/docs/next/category/getting-started/index.html b/docs/next/category/getting-started/index.html index 560a0e84f7f..668a13857cb 100644 --- a/docs/next/category/getting-started/index.html +++ b/docs/next/category/getting-started/index.html @@ -5,13 +5,13 @@ Getting Started | Cumulus Documentation - +
    - + \ No newline at end of file diff --git a/docs/next/category/integrator-guide/index.html b/docs/next/category/integrator-guide/index.html index ba98feea8a6..34f1d2f252b 100644 --- a/docs/next/category/integrator-guide/index.html +++ b/docs/next/category/integrator-guide/index.html @@ -5,13 +5,13 @@ Integrator Guide | Cumulus Documentation - +
    - + \ No newline at end of file diff --git a/docs/next/category/logs/index.html b/docs/next/category/logs/index.html index a5968d94e88..adf63e69688 100644 --- a/docs/next/category/logs/index.html +++ b/docs/next/category/logs/index.html @@ -5,13 +5,13 @@ Logs | Cumulus Documentation - +
    - + \ No newline at end of file diff --git a/docs/next/category/operations/index.html b/docs/next/category/operations/index.html index 5bcc61b3fbe..bd30adafc7c 100644 --- a/docs/next/category/operations/index.html +++ b/docs/next/category/operations/index.html @@ -5,13 +5,13 @@ Operations | Cumulus Documentation - +
    - + \ No newline at end of file diff --git a/docs/next/category/troubleshooting/index.html b/docs/next/category/troubleshooting/index.html index 75e9dab774c..2a0432c6abe 100644 --- a/docs/next/category/troubleshooting/index.html +++ b/docs/next/category/troubleshooting/index.html @@ -5,13 +5,13 @@ Troubleshooting | Cumulus Documentation - +
    - + \ No newline at end of file diff --git a/docs/next/category/upgrade-notes/index.html b/docs/next/category/upgrade-notes/index.html index 7ac6cb43b1f..71659f7289b 100644 --- a/docs/next/category/upgrade-notes/index.html +++ b/docs/next/category/upgrade-notes/index.html @@ -5,13 +5,13 @@ Upgrade Notes | Cumulus Documentation - +
    - + \ No newline at end of file diff --git a/docs/next/category/workflow-tasks/index.html b/docs/next/category/workflow-tasks/index.html index ff4bd991ded..3660e8bba41 100644 --- a/docs/next/category/workflow-tasks/index.html +++ b/docs/next/category/workflow-tasks/index.html @@ -5,13 +5,13 @@ Workflow Tasks | Cumulus Documentation - +
    Version: Next

    Workflow Tasks

    - + \ No newline at end of file diff --git a/docs/next/category/workflows/index.html b/docs/next/category/workflows/index.html index 674a8624686..f12cf6dd186 100644 --- a/docs/next/category/workflows/index.html +++ b/docs/next/category/workflows/index.html @@ -5,13 +5,13 @@ Workflows | Cumulus Documentation - +
    - + \ No newline at end of file diff --git a/docs/next/configuration/cloudwatch-retention/index.html b/docs/next/configuration/cloudwatch-retention/index.html index 0b5baa07218..c66a25e0f9a 100644 --- a/docs/next/configuration/cloudwatch-retention/index.html +++ b/docs/next/configuration/cloudwatch-retention/index.html @@ -5,7 +5,7 @@ Cloudwatch Retention | Cumulus Documentation - + @@ -14,7 +14,7 @@ the retention period (in days) of cloudwatch log groups for lambdas and tasks which the cumulus, cumulus_distribution, and cumulus_ecs_service modules supports (using the cumulus module as an example):

    module "cumulus" {
    # ... other variables
    default_log_retention_days = var.default_log_retention_days
    cloudwatch_log_retention_periods = var.cloudwatch_log_retention_periods
    }

Set the variables below in terraform.tfvars and deploy; the CloudWatch log groups will then be created or updated with the new retention value.

default_log_retention_days

The variable default_log_retention_days sets the default retention for all CloudWatch log groups managed by Cumulus when no per-group value is configured. If this variable is also unset, retention defaults to 30 days. For example, to give the Cumulus module's log groups a retention period of one year, deploy the respective modules with the variable set as in the example below.

    Example

default_log_retention_days = 365

    cloudwatch_log_retention_periods

The retention period (in days) of cloudwatch log groups for specific lambdas and tasks can be set during deployment using the cloudwatch_log_retention_periods terraform map variable. To configure these values for the respective CloudWatch log groups, uncomment the cloudwatch_log_retention_periods variable and add entries for the groups whose retention you want to change. The following keys are supported, each corresponding to a lambda/task name (e.g. "/aws/lambda/prefix-DiscoverPdrs" uses the retention variable "DiscoverPdrs"):

    • ApiEndpoints
    • AsyncOperationEcsLogs
    • DiscoverPdrs
    • DistributionApiEndpoints
    • EcsLogs
    • granuleFilesCacheUpdater
    • HyraxMetadataUpdates
    • ParsePdr
    • PostToCmr
    • PrivateApiLambda
    • publishExecutions
    • publishGranules
    • publishPdrs
    • QueuePdrs
    • QueueWorkflow
    • replaySqsMessages
    • SyncGranule
    • UpdateCmrAccessConstraints
    note

EcsLogs is used for the CloudWatch log groups of all cumulus_ecs_service tasks

    Example

    cloudwatch_log_retention_periods = {
    ParsePdr = 365
    }

Each retention period is the number of days you'd like to retain the logs in the specified log group. A list of allowed values is available in the AWS logs documentation.
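
Putting the two variables together, a minimal terraform.tfvars sketch might look like the following (the retention values here are placeholders, not recommendations):

default_log_retention_days = 90

cloudwatch_log_retention_periods = {
  ParsePdr    = 365
  SyncGranule = 30
}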

    - + \ No newline at end of file diff --git a/docs/next/configuration/collection-storage-best-practices/index.html b/docs/next/configuration/collection-storage-best-practices/index.html index 05d68e6a7aa..f621dab50a5 100644 --- a/docs/next/configuration/collection-storage-best-practices/index.html +++ b/docs/next/configuration/collection-storage-best-practices/index.html @@ -5,13 +5,13 @@ Collection Cost Tracking and Storage Best Practices | Cumulus Documentation - +
    Version: Next

    Collection Cost Tracking and Storage Best Practices

    Organizing your data is important for metrics you may want to collect. AWS S3 storage and cost metrics are calculated at the bucket level, so it is easy to get metrics by bucket. You can get storage metrics at the key prefix level, but that is done through the CLI, which can be very slow for large buckets. It is very difficult to estimate costs at the prefix level.

    Calculating Storage By Collection

    By bucket

    Usage by bucket can be obtained in your AWS Billing Dashboard via an S3 Usage Report. You can download your usage report for a period of time and review your storage and requests at the bucket level.

    Bucket metrics can also be found in the AWS CloudWatch Metrics Console (also see Using Amazon CloudWatch Metrics).

    Navigate to Storage Metrics and select the BucketName for all buckets you are interested in. The available metrics are BucketSizeInBytes and NumberOfObjects.

    In the Graphed metrics tab, you can select the type of statistic (i.e. average, minimum, maximum) and the period for the stats. At the top, it's useful to select from the dropdown to view the metrics as a number. You can also select the time period for which you want to see stats.

    Alternatively you can query CloudWatch using the CLI.

    This command will return the average number of bytes in the bucket test-bucket for 7/31/2019:

    aws cloudwatch get-metric-statistics --namespace AWS/S3 --start-time 2019-07-31T00:00:00 --end-time 2019-08-01T00:00:00 --period 86400 --statistics Average --region us-east-1 --metric-name BucketSizeBytes --dimensions Name=BucketName,Value=test-bucket Name=StorageType,Value=StandardStorage

    The result looks like:

    {
    "Datapoints": [
    {
    "Timestamp": "2019-07-31T00:00:00Z",
    "Average": 150996467959.0,
    "Unit": "Bytes"
    }
    ],
    "Label": "BucketSizeBytes"
    }
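
A similar query returns the object count; note that the NumberOfObjects metric uses the AllStorageTypes storage type dimension (the bucket name below is a placeholder):

aws cloudwatch get-metric-statistics --namespace AWS/S3 --start-time 2019-07-31T00:00:00 --end-time 2019-08-01T00:00:00 --period 86400 --statistics Average --region us-east-1 --metric-name NumberOfObjects --dimensions Name=BucketName,Value=test-bucket Name=StorageType,Value=AllStorageTypes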

    By key prefix

    AWS does not offer storage and usage statistics at a key prefix level. Via the AWS CLI, you can get the total storage for a bucket or folder. The following command would get the storage for folder example-folder in bucket sample-bucket:

    aws s3 ls --summarize --human-readable --recursive s3://sample-bucket/example-folder | grep 'Total'

    Note that this can be a long-running operation for large buckets.
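
If you prefer a raw byte count rather than the human-readable summary, one alternative sketch uses s3api with a JMESPath query (bucket and prefix are placeholders; this assumes the prefix is non-empty, and the CLI still pages through every object, so it is also slow for large prefixes):

aws s3api list-objects-v2 --bucket sample-bucket --prefix example-folder/ --query "sum(Contents[].Size)" --output text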

    Calculating Cost By Collection

    NASA NGAP Environment

    If using an NGAP account, the cost per bucket can be found in your CloudTamer console, in the Financials section of your account information. This is calculated on a monthly basis.

    There is no easy way to get the cost by folder in the buckets. You could calculate an estimate using the storage per prefix vs. the storage of the bucket.

    Outside of NGAP

You can enable S3 Cost Allocation Tags and tag your buckets. From there, you can view the cost breakdown in your AWS Billing Dashboard via the Cost Explorer. Cost Allocation Tagging is available at the bucket level.

    There is no easy way to get the cost by folder in the buckets. You could calculate an estimate using the storage per prefix vs. the storage of the bucket.

    Storage Configuration

    Cumulus allows for the configuration of many buckets for your files. Buckets are created and added to your deployment as part of the deployment process.

    In your Cumulus collection configuration, you specify where you want the files to be stored post-processing. This is done by matching a regular expression on the file with the configured bucket.

    Note that in the collection configuration, the bucket field is the key to the buckets variable in the deployment's .tfvars file.

    Organizing By Bucket

    You can specify separate groups of buckets for each collection, which could look like the example below.

    {
    "name": "MOD09GQ",
    "version": "006",
    "granuleId": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
    "files": [
    {
    "bucket": "MOD09GQ-006-protected",
    "regex": "^.*\\.hdf$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf"
    },
    {
    "bucket": "MOD09GQ-006-private",
    "regex": "^.*\\.hdf\\.met$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.met"
    },
    {
    "bucket": "MOD09GQ-006-protected",
    "regex": "^.*\\.cmr\\.xml$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.cmr.xml"
    },
    {
    "bucket": "MOD09GQ-006-public",
    "regex": "^*\\.jpg$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_ndvi.jpg"
    }
    ]
    }

    Additional collections would go to different buckets.

    Organizing by Key Prefix

    Different collections can be organized into different folders in the same bucket, using the key prefix, which is specified as the url_path in the collection configuration. In this simplified collection configuration example, the url_path field is set at the top level so that all files go to a path prefixed with the collection name and version.

    {
    "name": "MOD09GQ",
    "version": "006",
    "granuleId": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
    "files": [
    {
    "bucket": "protected",
    "regex": "^.*\\.hdf$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf"
    },
    {
    "bucket": "private",
    "regex": "^.*\\.hdf\\.met$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.met"
    },
    {
    "bucket": "protected",
    "regex": "^.*\\.cmr\\.xml$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.cmr.xml"
    },
    {
    "bucket": "public",
    "regex": "^*\\.jpg$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_ndvi.jpg"
    }
    ]
    }

    In this case, the path to all the files would be: MOD09GQ___006/<filename> in their respective buckets.

The url_path can be overridden directly on the file configuration. The example below produces the same result.

    {
    "name": "MOD09GQ",
    "version": "006",
    "granuleId": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
    "files": [
    {
    "bucket": "protected",
    "regex": "^.*\\.hdf$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
    },
    {
    "bucket": "private",
    "regex": "^.*\\.hdf\\.met$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.met",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
    },
    {
    "bucket": "protected-2",
    "regex": "^.*\\.cmr\\.xml$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.cmr.xml",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
    },
    {
    "bucket": "public",
    "regex": "^*\\.jpg$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_ndvi.jpg",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
    }
    ]
    }
    - + \ No newline at end of file diff --git a/docs/next/configuration/data-management-types/index.html b/docs/next/configuration/data-management-types/index.html index 1005d40a6e7..5d76a09a0a4 100644 --- a/docs/next/configuration/data-management-types/index.html +++ b/docs/next/configuration/data-management-types/index.html @@ -5,13 +5,13 @@ Cumulus Data Management Types | Cumulus Documentation - +
    Version: Next

    Cumulus Data Management Types

    What Are The Cumulus Data Management Types

    • Collections: Collections are logical sets of data objects of the same data type and version. They provide contextual information used by Cumulus ingest.
    • Granules: Granules are the smallest aggregation of data that can be independently managed. They are always associated with a collection, which is a grouping of granules.
    • Providers: Providers generate and distribute input data that Cumulus obtains and sends to workflows.
    • Rules: Rules tell Cumulus how to associate providers and collections and when/how to start processing a workflow.
    • Workflows: Workflows are composed of one or more AWS Lambda Functions and ECS Activities to discover, ingest, process, manage, and archive data.
    • Executions: Executions are records of a workflow.
    • Reconciliation Reports: Reports are a comparison of data sets to check to see if they are in agreement and to help Cumulus users detect conflicts.

    Interaction

    • Providers tell Cumulus where to get new data - i.e. S3, HTTPS
    • Collections tell Cumulus where to store the data files
    • Rules tell Cumulus when to trigger a workflow execution and tie providers and collections together

    Managing Data Management Types

    The following are created via the dashboard or API:

    • Providers
    • Collections
    • Rules
    • Reconciliation reports

    Granules are created by workflow executions and then can be managed via the dashboard or API.

    An execution record is created for each workflow execution triggered and can be viewed in the dashboard or data can be retrieved via the API.

    Workflows are created and managed via the Cumulus deployment.

    Configuration Fields

    Schemas

Looking at our API schema definitions can provide us with some insight into collections, providers, rules, and their attributes (and whether those are required or not). The schemas for the different concepts will be referenced throughout this document.

    note

    The schemas are extremely useful for understanding which attributes are configurable and which of those are required. Cumulus uses these schemas for validation.

    Providers

    note
• While connection configuration is defined here, items specific to a particular ingest setup (e.g. 'What target directory should we be pulling from?' or 'How is duplicate handling configured?') are generally defined in a Rule or Collection, not the Provider.
• There is some provider behavior that is controlled by task-specific configuration rather than the provider definition. This configuration has to be set on a per-workflow basis. For example, see the httpListTimeout configuration on the discover-granules task.

    Provider Configuration

    The Provider configuration is defined by a JSON object that takes different configuration keys depending on the provider type. The following are definitions of typical configuration values relevant for the various providers:

    Configuration by provider type
    S3
| Key | Type | Required | Description |
| --- | --- | --- | --- |
| id | string | Yes | Unique identifier for the provider |
| globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited |
| protocol | string | Yes | The protocol for this provider. Must be s3 for this provider type. |
| host | string | Yes | S3 Bucket to pull data from |
    http
| Key | Type | Required | Description |
| --- | --- | --- | --- |
| id | string | Yes | Unique identifier for the provider |
| globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited |
| protocol | string | Yes | The protocol for this provider. Must be http for this provider type |
| host | string | Yes | The host to pull data from (e.g. nasa.gov) |
| username | string | No | Configured username for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication |
| password | string | Only if username is specified | Configured password for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication |
| port | integer | No | Port to connect to the provider on. Defaults to 80 |
| allowedRedirects | string[] | No | Only hosts in this list will have the provider username/password forwarded for authentication. Entries should be specified as host.com or host.com:7000 if redirect port is different than the provider port. |
| certificateUri | string | No | SSL Certificate S3 URI for custom or self-signed SSL (TLS) certificate |
    https
| Key | Type | Required | Description |
| --- | --- | --- | --- |
| id | string | Yes | Unique identifier for the provider |
| globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited |
| protocol | string | Yes | The protocol for this provider. Must be https for this provider type |
| host | string | Yes | The host to pull data from (e.g. nasa.gov) |
| username | string | No | Configured username for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication |
| password | string | Only if username is specified | Configured password for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication |
| port | integer | No | Port to connect to the provider on. Defaults to 443 |
| allowedRedirects | string[] | No | Only hosts in this list will have the provider username/password forwarded for authentication. Entries should be specified as host.com or host.com:7000 if redirect port is different than the provider port. |
| certificateUri | string | No | SSL Certificate S3 URI for custom or self-signed SSL (TLS) certificate |
    ftp
| Key | Type | Required | Description |
| --- | --- | --- | --- |
| id | string | Yes | Unique identifier for the provider |
| globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited |
| protocol | string | Yes | The protocol for this provider. Must be ftp for this provider type |
| host | string | Yes | The ftp host to pull data from (e.g. nasa.gov) |
| username | string | No | Username to use to connect to the ftp server. Cumulus encrypts this using KMS. Defaults to anonymous if not defined |
| password | string | No | Password to use to connect to the ftp server. Cumulus encrypts this using KMS. Defaults to password if not defined |
| port | integer | No | Port to connect to the provider on. Defaults to 21 |
    sftp
| Key | Type | Required | Description |
| --- | --- | --- | --- |
| id | string | Yes | Unique identifier for the provider |
| globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited |
| protocol | string | Yes | The protocol for this provider. Must be sftp for this provider type |
| host | string | Yes | The ftp host to pull data from (e.g. nasa.gov) |
| username | string | No | Username to use to connect to the sftp server. |
| password | string | No | Password to use to connect to the sftp server. |
| port | integer | No | Port to connect to the provider on. Defaults to 22 |
| privateKey | string | No | filename assumed to be in s3://bucketInternal/stackName/crypto |
| cmKeyId | string | No | AWS KMS Customer Master Key arn or alias |
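
For reference, a minimal s3-type provider object assembled from the fields above might look like the following sketch (the id, host, and connection limit values are placeholders):

{
  "id": "MY_S3_PROVIDER",
  "protocol": "s3",
  "host": "my-provider-staging-bucket",
  "globalConnectionLimit": 10
}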

    Collections

Breakdown of [s3_MOD09GQ_006.json](https://github.com/nasa/cumulus/blob/master/example/data/collections/s3_MOD09GQ_006/s3_MOD09GQ_006.json)

| Key | Value | Required | Description |
| --- | --- | --- | --- |
| name | "MOD09GQ" | Yes | The name attribute designates the name of the collection. This is the name under which the collection will be displayed on the dashboard |
| version | "006" | Yes | A version tag for the collection |
| granuleId | "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$" | Yes | The regular expression used to validate the granule ID extracted from filenames according to the granuleIdExtraction |
| granuleIdExtraction | "(MOD09GQ\..*)(\.hdf\|\.cmr\|_ndvi\.jpg)" | Yes | The regular expression used to extract the granule ID from filenames. The first capturing group extracted from the filename by the regex will be used as the granule ID. |
| sampleFileName | "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf" | Yes | An example filename belonging to this collection |
| files | <JSON Object> of files defined here | Yes | Describe the individual files that will exist for each granule in this collection (size, browse, meta, etc.) |
| dataType | "MOD09GQ" | No | Can be specified, but this value will default to the collection_name if not |
| duplicateHandling | "replace" | No | ("replace"\|"version"\|"skip") determines granule duplicate handling scheme |
| ignoreFilesConfigForDiscovery | false (default) | No | By default, during discovery only files that match one of the regular expressions in this collection's files attribute (see above) are ingested. Setting this to true will ignore the files attribute during discovery, meaning that all files for a granule (i.e., all files with filenames matching granuleIdExtraction) will be ingested even when they don't match a regular expression in the files attribute at discovery time. (NOTE: this attribute does not appear in the example file, but is listed here for completeness.) |
| process | "modis" | No | Example options for this are found in the ChooseProcess step definition in the IngestAndPublish workflow definition |
| meta | <JSON Object> of MetaData for the collection | No | MetaData for the collection. This metadata will be available to workflows for this collection via the Cumulus Message Adapter. |
| url_path | "{cmrMetadata.Granule.Collection.ShortName}/{substring(file.fileName, 0, 3)}" | No | Filename without extension |

    files-object

| Key | Value | Required | Description |
| --- | --- | --- | --- |
| regex | "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\.hdf$" | Yes | Regular expression used to identify the file |
| sampleFileName | "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf" | Yes | Filename used to validate the provided regex |
| type | "data" | No | Value to be assigned to the Granule File Type. CNM types are used by Cumulus CMR steps, non-CNM values will be treated as 'data' type. Currently only utilized in DiscoverGranules task |
| bucket | "internal" | Yes | Name of the bucket where the file will be stored |
| url_path | "${collectionShortName}/{substring(file.fileName, 0, 3)}" | No | Folder used to save the granule in the bucket. Defaults to the collection url_path |
| checksumFor | "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\.hdf$" | No | If this is a checksum file, set checksumFor to the regex of the target file. |

    Rules

Rules are used to start processing workflows and the transformation process. Rules can be invoked manually, run on a schedule, or be configured to trigger on Kinesis events, SNS messages, or SQS messages.

    Rule configuration
| Key | Value | Required | Description |
| --- | --- | --- | --- |
| name | "L2_HR_PIXC_kinesisRule" | Yes | Name of the rule. This is the name under which the rule will be listed on the dashboard |
| workflow | "CNMExampleWorkflow" | Yes | Name of the workflow to be run. A list of available workflows can be found on the Workflows page |
| provider | "PODAAC_SWOT" | No | Configured provider's ID. This can be found on the Providers dashboard page |
| collection | <JSON Object> collection object shown below | Yes | Name and version of the collection this rule will moderate. Relates to a collection configured and found in the Collections page |
| payload | <JSON Object or Array> | No | The payload to be passed to the workflow |
| meta | <JSON Object> of MetaData for the rule | No | MetaData for the rule. This metadata will be available to workflows for this rule via the Cumulus Message Adapter. |
| rule | <JSON Object> rule type and associated values - discussed below | Yes | Object defining the type and subsequent attributes of the rule |
| state | "ENABLED" | No | ("ENABLED"\|"DISABLED") whether or not the rule will be active. Defaults to "ENABLED". |
| queueUrl | https://sqs.us-east-1.amazonaws.com/1234567890/queue-name | No | URL for SQS queue that will be used to schedule workflows for this rule |
| tags | ["kinesis", "podaac"] | No | An array of strings that can be used to simplify search |

    collection-object

| Key | Value | Required | Description |
| --- | --- | --- | --- |
| name | "L2_HR_PIXC" | Yes | Name of a collection defined/configured in the Collections dashboard page |
| version | "000" | Yes | Version number of a collection defined/configured in the Collections dashboard page |

    meta-object

| Key | Value | Required | Description |
| --- | --- | --- | --- |
| retries | 3 | No | Number of retries on errors, for sqs-type rule only. Defaults to 3. |
| visibilityTimeout | 900 | No | VisibilityTimeout in seconds for the inflight messages, for sqs-type rule only. Defaults to the visibility timeout of the SQS queue when the rule is created. |

    rule-object

| Key | Value | Required | Description |
| --- | --- | --- | --- |
| type | "kinesis" | Yes | ("onetime"\|"scheduled"\|"kinesis"\|"sns"\|"sqs") type of scheduling/workflow kick-off desired |
| value | <String> Object | Depends | Discussion of valid values is below |

    rule-value

The rule's value entry depends on the type of run (an assembled kinesis rule example follows this list):

    • If this is a onetime rule this can be left blank. Example
    • If this is a scheduled rule this field must hold a valid cron-type expression or rate expression.
    • If this is a kinesis rule, this must be a configured ${Kinesis_stream_ARN}. Example
    • If this is an sns rule, this must be an existing ${SNS_Topic_Arn}. Example
    • If this is an sqs rule, this must be an existing ${SQS_QueueUrl} that your account has permissions to access, and also you must configure a dead-letter queue for this SQS queue. Example
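
Assembling the fields from the rule configuration table above, a kinesis rule might look like the following sketch (the stream ARN is a placeholder for your configured Kinesis stream):

{
  "name": "L2_HR_PIXC_kinesisRule",
  "workflow": "CNMExampleWorkflow",
  "provider": "PODAAC_SWOT",
  "collection": {
    "name": "L2_HR_PIXC",
    "version": "000"
  },
  "rule": {
    "type": "kinesis",
    "value": "arn:aws:kinesis:us-east-1:111111111111:stream/my-cnm-stream"
  },
  "state": "ENABLED"
}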

    sqs-type rule features

    • When an SQS rule is triggered, the SQS message remains on the queue.
    • The SQS message is not processed multiple times in parallel when visibility timeout is properly set. You should set the visibility timeout to the maximum expected length of the workflow with padding. Longer is better to avoid parallel processing.
    • The SQS message visibility timeout can be overridden by the rule.
    • Upon successful workflow execution, the SQS message is removed from the queue.
• Upon failed execution(s), the workflow is re-run 3 times, or the configured number of times.
    • Upon failed execution(s), the visibility timeout will be set to 5s to allow retries.
    • After configured number of failed retries, the SQS message is moved to the dead-letter queue configured for the SQS queue.

    Configuration Via Cumulus Dashboard

    Create A Provider

    • In the Cumulus dashboard, go to the Provider page.

    Screenshot of Create Provider form

    • Click on Add Provider.
    • Fill in the form and then submit it.

    Screenshot of Create Provider form

    Create A Collection

    • Go to the Collections page.

    Screenshot of the Collections page

    • Click on Add Collection.
    • Copy and paste or fill in the collection JSON object form.

    Screenshot of Add Collection form

    • Once you submit the form, you should be able to verify that your new collection is in the list.

    Create A Rule

    1. Go To Rules Page
    • Go to the Cumulus dashboard, click on Rules in the navigation.
    • Click Add Rule.

    Screenshot of Rules page

2. Complete Form
    • Fill out the template form.

    Screenshot of a Rules template for adding a new rule

    For more details regarding the field definitions and required information go to Data Cookbooks.

    state field conditional

    If the state field is left blank, it defaults to false.

    Rule Examples

    • A rule form with completed required fields:

    Screenshot of a completed rule form

    • A successfully added Rule:

    Screenshot of created rule

    - + \ No newline at end of file diff --git a/docs/next/configuration/lifecycle-policies/index.html b/docs/next/configuration/lifecycle-policies/index.html index d81c1c87a8a..51d28602394 100644 --- a/docs/next/configuration/lifecycle-policies/index.html +++ b/docs/next/configuration/lifecycle-policies/index.html @@ -5,13 +5,13 @@ Setting S3 Lifecycle Policies | Cumulus Documentation - +
    Version: Next

    Setting S3 Lifecycle Policies

    This document will outline, in brief, how to set data lifecycle policies so that you are more easily able to control data storage costs while keeping your data accessible. For more information on why you might want to do this, see the 'Additional Information' section at the end of the document.

    Requirements

    • The AWS CLI installed and configured (if you wish to run the CLI example). See AWS's guide to setting up the AWS CLI for more on this. Please ensure the AWS CLI is in your shell path.
• You will need an S3 bucket on AWS. You are strongly encouraged to use a bucket without voluminous amounts of data in it for experimenting/learning.
    • An AWS user with the appropriate roles to access the target bucket as well as modify bucket policies.

    Examples

    Walk-through on setting time-based S3 Infrequent Access (S3IA) bucket policy

    This example will give step-by-step instructions on updating a bucket's lifecycle policy to move all objects in the bucket from the default storage to S3 Infrequent Access (S3IA) after a period of 90 days. Below are instructions for walking through configuration via the command line and the management console.

    Command Line

    caution

    Please ensure you have the AWS CLI installed and configured for access prior to attempting this example.

    Create policy

From any directory you choose, open an editor and add the following to a file named exampleRule.json

    {
    "Rules": [
    {
    "Status": "Enabled",
    "Filter": {
    "Prefix": ""
    },
    "Transitions": [
    {
    "Days": 90,
    "StorageClass": "STANDARD_IA"
    }
    ],
    "NoncurrentVersionTransitions": [
    {
    "NoncurrentDays": 90,
    "StorageClass": "STANDARD_IA"
    }
],
"ID": "90DayS3IAExample"
    }
    ]
    }

    Set policy

    On the command line run the following command (with the bucket you're working with substituted in place of yourBucketNameHere).

    aws s3api put-bucket-lifecycle-configuration --bucket yourBucketNameHere --lifecycle-configuration file://exampleRule.json

    Verify policy has been set

    To obtain all of the existing policies for a bucket, run the following command (again substituting the correct bucket name):

     $ aws s3api get-bucket-lifecycle-configuration --bucket yourBucketNameHere
    {
    "Rules": [
    {
    "Status": "Enabled",
    "Filter": {
    "Prefix": ""
    },
    "Transitions": [
    {
    "Days": 90,
    "StorageClass": "STANDARD_IA"
    }
    ],
    "NoncurrentVersionTransitions": [
    {
    "NoncurrentDays": 90,
    "StorageClass": "STANDARD_IA"
    }
],
"ID": "90DayS3IAExample"
    }
    ]
    }

    You have set a policy that transitions any version of an object in the bucket to S3IA after each object version has not been modified for 90 days.
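
If you were experimenting on a test bucket and want to remove the policy afterwards, a cleanup sketch (bucket name is a placeholder) is:

aws s3api delete-bucket-lifecycle --bucket yourBucketNameHere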

    Management Console

    Create Policy

    To create the example policy on a bucket via the management console, go to the following URL (replacing 'yourBucketHere' with the bucket you intend to update):

    https://s3.console.aws.amazon.com/s3/buckets/yourBucketHere/?tab=overview

    You should see a screen similar to:

    Screenshot of AWS console for an S3 bucket

    Click the "Management" Tab, then lifecycle button and press + Add lifecycle rule:

    Screenshot of &quot;Management&quot; tab of AWS console for an S3 bucket

    Give the rule a name (e.g. '90DayRule'), leaving the filter blank:

    Screenshot of window for configuring the name and scope of a lifecycle rule on an S3 bucket in the AWS console

    Click next, and mark Current Version and Previous Versions.

Then for each, click + Add transition and select Transition to Standard-IA after for the Object creation field, and set 90 for the Days after creation/Days after objects become noncurrent field. Your screen should look similar to:

    Screenshot of window for configuring the storage class transitions of a lifecycle rule on an S3 bucket in the AWS console

    Click next, then next past the Configure expiration screen (we won't be setting this), and on the fourth page, click Save:

    Screenshot of window for reviewing the configuration of a lifecycle rule on an S3 bucket in the AWS console

    You should now see you have a rule configured for your bucket:

    Screenshot of lifecycle rule appearing in the &quot;Management&quot; tab of AWS console for an S3 bucket

    You have now set a policy that transitions any version of an object in the bucket to S3IA after each object has not been modified for 90 days.

    Additional Information

    This section lists information you may want prior to enacting lifecycle policies. It is not required content for working through the examples.

    Strategy Overview

    For a discussion of overall recommended strategy, please review the Methodology for Data Lifecycle Management on the EarthData wiki.

    AWS Documentation

The examples shown in this document are fairly basic cases. By using object tags, filters, and other configuration options, you can enact far more complicated policies for various scenarios. For more reading on the topics presented on this page see:

    - + \ No newline at end of file diff --git a/docs/next/configuration/monitoring-readme/index.html b/docs/next/configuration/monitoring-readme/index.html index aeecb7f0a25..b72d7c20505 100644 --- a/docs/next/configuration/monitoring-readme/index.html +++ b/docs/next/configuration/monitoring-readme/index.html @@ -5,14 +5,14 @@ Monitoring Best Practices | Cumulus Documentation - +
    Version: Next

    Monitoring Best Practices

    This document intends to provide a set of recommendations and best practices for monitoring the state of a deployed Cumulus and diagnosing any issues.

    Cumulus-provided resources and integrations for monitoring

Cumulus provides a number of resources that are useful for monitoring the system and its operation.

    Cumulus Dashboard

The primary tool for monitoring the Cumulus system is the Cumulus Dashboard. The dashboard is hosted on GitHub and includes instructions on how to deploy and link it into your core Cumulus deployment.

    The dashboard displays workflow executions, their status, inputs, outputs, and some diagnostic information such as logs. For further information on the dashboard, its usage, and the information it provides, see the documentation.

    Cumulus-provided AWS resources

    Cumulus sets up CloudWatch log groups for all Core-provided tasks.

    Monitoring Lambda Functions

    Logging for each Lambda Function is available in Lambda-specific CloudWatch log groups.

    Monitoring ECS services

    Each deployed cumulus_ecs_service module also includes a CloudWatch log group for the processes running on ECS.

    Monitoring workflows

    For advanced debugging, we also configure dead letter queues on critical system functions. These will allow you to monitor and debug invalid inputs to the functions we use to start workflows, which can be helpful if you find that you are not seeing workflows being started as expected. More information on these can be found in the dead letter queue documentation

    AWS recommendations

    AWS has a number of recommendations on system monitoring. Rather than reproduce those here and risk providing outdated guidance, we've documented the following links which will take you to available AWS docs on monitoring recommendations and best practices for the services used in Cumulus:

    Example: Setting up email notifications for CloudWatch logs

    Cumulus does not provide out-of-the-box support for email notifications at this time. However, setting up email notifications on AWS is fairly straightforward in that the operative components are an AWS SNS topic and a subscribed email address.

    In terms of Cumulus integration, forwarding CloudWatch logs requires creating a mechanism, most likely a Lambda Function subscribed to the log group that will receive, filter and forward these messages to the SNS topic.

    As a very simple example, we could create a function that filters CloudWatch logs created by the @cumulus/logger package and sends email notifications for error and fatal log levels, adapting the example linked above:

    const zlib = require('zlib');
    const aws = require('aws-sdk');
    const { promisify } = require('util');

    const gunzip = promisify(zlib.gunzip);
    const sns = new aws.SNS();

    exports.handler = async (event) => {
    const payload = Buffer.from(event.awslogs.data, 'base64');
    const decompressedData = await gunzip(payload);
    const logData = JSON.parse(decompressedData.toString('ascii'));
    return await Promise.all(logData.logEvents.map(async (logEvent) => {
    const logMessage = JSON.parse(logEvent.message);
    if (['error', 'fatal'].includes(logMessage.level)) {
    return sns.publish({
    TopicArn: process.env.EmailReportingTopicArn,
    Message: logEvent.message
    }).promise();
    }
    return Promise.resolve();
    }));
    };

After creating the SNS topic, we can deploy this code as a lambda function, following the setup steps from Amazon. Make sure to include your SNS topic ARN as an environment variable on the lambda function by using the --environment option on aws lambda create-function.
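
As a sketch, the deployment command might look like the following (the function name, runtime, role ARN, zip file, and topic ARN are all placeholders for your own values):

aws lambda create-function \
  --function-name log-email-forwarder \
  --runtime nodejs18.x \
  --handler index.handler \
  --zip-file fileb://function.zip \
  --role arn:aws:iam::123456789012:role/my-log-forwarder-role \
  --environment "Variables={EmailReportingTopicArn=arn:aws:sns:us-east-1:123456789012:my-email-reporting-topic}"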

    You will need to create subscription filters for each log group you want to receive emails for. We recommend automating this as much as possible, and you could very well handle this via Terraform, such as using a module to deploy filters alongside log groups, or exporting the log group names to an all-in-one email notification module.
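
For a single log group, a manual sketch of the subscription setup might look like the following (the log group, function name, and ARNs are placeholders; CloudWatch Logs also needs permission to invoke the function):

# Allow CloudWatch Logs to invoke the forwarding function
aws lambda add-permission \
  --function-name log-email-forwarder \
  --statement-id cloudwatch-logs-invoke \
  --principal logs.amazonaws.com \
  --action lambda:InvokeFunction \
  --source-arn arn:aws:logs:us-east-1:123456789012:log-group:/aws/lambda/prefix-DiscoverPdrs:*

# Subscribe the function to the log group (an empty filter pattern forwards all events)
aws logs put-subscription-filter \
  --log-group-name /aws/lambda/prefix-DiscoverPdrs \
  --filter-name error-email-forwarding \
  --filter-pattern "" \
  --destination-arn arn:aws:lambda:us-east-1:123456789012:function:log-email-forwarder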

    - + \ No newline at end of file diff --git a/docs/next/configuration/server_access_logging/index.html b/docs/next/configuration/server_access_logging/index.html index 0d197adeab8..bd884e43eb8 100644 --- a/docs/next/configuration/server_access_logging/index.html +++ b/docs/next/configuration/server_access_logging/index.html @@ -5,13 +5,13 @@ S3 Server Access Logging | Cumulus Documentation - +
    Version: Next

    S3 Server Access Logging

    Via AWS Console

    Enable server access logging for an S3 bucket

    Via AWS Command Line Interface

    1. Create a logging.json file with these contents, replacing <stack-internal-bucket> with your stack's internal bucket name, and <stack> with the name of your cumulus stack.

      {
      "LoggingEnabled": {
      "TargetBucket": "<stack-internal-bucket>",
      "TargetPrefix": "<stack>/ems-distribution/s3-server-access-logs/"
      }
      }
2. Add the logging policy to each of your protected and public buckets by calling this command on each bucket (a loop sketch for multiple buckets follows this list).

      aws s3api put-bucket-logging --bucket <protected/public-bucket-name> --bucket-logging-status file://logging.json
    3. Verify the logging policy exists on your buckets.

      aws s3api get-bucket-logging --bucket <protected/public-bucket-name>
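
If you have several protected and public buckets, a simple loop sketch (bucket names are placeholders) can apply the same policy to each:

for bucket in my-stack-protected my-stack-public; do
  aws s3api put-bucket-logging --bucket "$bucket" --bucket-logging-status file://logging.json
done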
    - + \ No newline at end of file diff --git a/docs/next/configuration/task-configuration/index.html b/docs/next/configuration/task-configuration/index.html index 5d3380a78bb..64227beae07 100644 --- a/docs/next/configuration/task-configuration/index.html +++ b/docs/next/configuration/task-configuration/index.html @@ -5,13 +5,13 @@ Configuration of Tasks | Cumulus Documentation - +
    Version: Next

    Configuration of Tasks

    The cumulus module exposes values for configuration for some of the provided archive and ingest tasks. Currently the following are available as configurable variables:

    cmr_search_client_config

    Configuration parameters for CMR search client for cumulus archive module tasks in the form:

<lambda_identifier>_report_cmr_limit = <maximum number of records that can be returned from a cmr-client search; this should be greater than cmr_page_size>
    <lambda_identifier>_report_cmr_page_size = <number of records for each page returned from CMR>
    type = map(string)

More information about cmr limit and cmr page_size can be found in @cumulus/cmr-client and the CMR Search API documentation.

    Currently the following values are supported:

    • create_reconciliation_report_cmr_limit
    • create_reconciliation_report_cmr_page_size

    Example

    cmr_search_client_config = {
    create_reconciliation_report_cmr_limit = 2500
    create_reconciliation_report_cmr_page_size = 250
    }

    elasticsearch_client_config

    Configuration parameters for Elasticsearch client for cumulus archive module tasks in the form:

    <lambda_identifier>_es_scroll_duration = <duration>
    <lambda_identifier>_es_scroll_size = <size>
    type = map(string)

    Currently the following values are supported:

    • create_reconciliation_report_es_scroll_duration
    • create_reconciliation_report_es_scroll_size

    Example

    elasticsearch_client_config = {
    create_reconciliation_report_es_scroll_duration = "15m"
    create_reconciliation_report_es_scroll_size = 2000
    }

    lambda_timeouts

    A configurable map of timeouts (in seconds) for cumulus ingest module task lambdas in the form:

    <lambda_identifier>_timeout: <timeout>
    type = map(string)

    Currently the following values are supported:

    • add_missing_file_checksums_task_timeout
    • discover_granules_task_timeout
    • discover_pdrs_task_timeout
    • fake_processing_task_timeout
    • files_to_granules_task_timeout
    • hello_world_task_timeout
    • hyrax_metadata_update_tasks_timeout
    • lzards_backup_task_timeout
    • move_granules_task_timeout
    • parse_pdr_task_timeout
    • pdr_status_check_task_timeout
    • post_to_cmr_task_timeout
    • queue_granules_task_timeout
    • queue_pdrs_task_timeout
    • queue_workflow_task_timeout
    • sf_sqs_report_task_timeout
    • sync_granule_task_timeout
    • update_granules_cmr_metadata_file_links_task_timeout

    Example

    lambda_timeouts = {
    discover_granules_task_timeout = 300
    }
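
These maps are typically set in terraform.tfvars and passed through to the cumulus module, mirroring the pattern shown elsewhere in these docs; a minimal sketch:

module "cumulus" {
  # ... other variables
  lambda_timeouts = var.lambda_timeouts
}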

    lambda_memory_sizes

    A configurable map of memory sizes (in MBs) for cumulus ingest module task lambdas in the form:

    <lambda_identifier>_memory_size: <memory_size>
    type = map(string)

    Currently the following values are supported:

    • add_missing_file_checksums_task_memory_size
    • discover_granules_task_memory_size
    • discover_pdrs_task_memory_size
    • fake_processing_task_memory_size
    • hyrax_metadata_updates_task_memory_size
    • lzards_backup_task_memory_size
    • move_granules_task_memory_size
    • parse_pdr_task_memory_size
    • pdr_status_check_task_memory_size
    • post_to_cmr_task_memory_size
    • queue_granules_task_memory_size
    • queue_pdrs_task_memory_size
    • queue_workflow_task_memory_size
    • sf_sqs_report_task_memory_size
    • sync_granule_task_memory_size
• update_cmr_access_constraints_task_memory_size
    • update_granules_cmr_metadata_file_links_task_memory_size

    Example

    lambda_memory_sizes = {
    queue_granules_task_memory_size = 1036
    }
    - + \ No newline at end of file diff --git a/docs/next/data-cookbooks/about-cookbooks/index.html b/docs/next/data-cookbooks/about-cookbooks/index.html index e44fd7f233e..110c6df52a8 100644 --- a/docs/next/data-cookbooks/about-cookbooks/index.html +++ b/docs/next/data-cookbooks/about-cookbooks/index.html @@ -5,13 +5,13 @@ About Cookbooks | Cumulus Documentation - +
    Version: Next

    About Cookbooks

    Introduction

The following data cookbooks are documents containing examples and explanations of workflows in the Cumulus framework. Additionally, they should serve to help unify an institution/user group on a set of terms.

    Setup

    The data cookbooks assume you can configure providers, collections, and rules to run workflows. Visit Cumulus data management types for information on how to configure Cumulus data management types.

    Adding a page

    As shown in detail in the "Add a New Page and Sidebars" section in Cumulus Docs: How To's, you can add a new page to the data cookbook by creating a markdown (.md) file in the docs/data-cookbooks directory. The new page can then be linked to the sidebar by adding it to the Data-Cookbooks object in the website/sidebar.json file as data-cookbooks/${id}.

    More about workflows

    Workflow general information

    Input & Output

    Developing Workflow Tasks

    Workflow Configuration How-to's

    - + \ No newline at end of file diff --git a/docs/next/data-cookbooks/browse-generation/index.html b/docs/next/data-cookbooks/browse-generation/index.html index e9fb5d18bc5..b3407141130 100644 --- a/docs/next/data-cookbooks/browse-generation/index.html +++ b/docs/next/data-cookbooks/browse-generation/index.html @@ -5,7 +5,7 @@ Ingest Browse Generation | Cumulus Documentation - + @@ -15,7 +15,7 @@ provider keys with the previously entered values) Note that you need to set the "provider_path" to the path on your bucket (e.g. "/data") that you've staged your mock/test data.:

    {
    "name": "TestBrowseGeneration",
    "workflow": "DiscoverGranulesBrowseExample",
    "provider": "{{provider_from_previous_step}}",
    "collection": {
    "name": "MOD09GQ",
    "version": "006"
    },
    "meta": {
    "provider_path": "{{path_to_data}}"
    },
    "rule": {
    "type": "onetime"
    },
    "state": "ENABLED",
    "updatedAt": 1553053438767
    }

    Run Workflows

    Once you've configured the Collection and Provider and added a onetime rule with an ENABLED state, you're ready to trigger your rule, and watch the ingest workflows process.

    Go to the Rules tab, click the rule you just created:

    Screenshot of the Rules overview page with a list of rules in the Cumulus dashboard

    Then click the gear in the upper right corner and click "Rerun":

    Screenshot of clicking the button to rerun a workflow rule from the rule edit page in the Cumulus dashboard

    Tab over to executions and you should see the DiscoverGranulesBrowseExample workflow run, succeed, and then moments later the CookbookBrowseExample should run and succeed.

    Screenshot of page listing executions in the Cumulus dashboard

    Results

    You can verify your data has ingested by clicking the successful workflow entry:

    Screenshot of individual entry from table listing executions in the Cumulus dashboard

    Select "Show Output" on the next page

    Screenshot of &quot;Show output&quot; button from individual execution page in the Cumulus dashboard

    and you should see in the payload from the workflow something similar to:

    "payload": {
    "process": "modis",
    "granules": [
    {
    "files": [
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "key": "MOD09GQ___006/2017/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "type": "data",
    "bucket": "cumulus-test-sandbox-protected",
    "path": "data",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}/{substring(file.fileName, 0, 3)}",
    "size": 1908635
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "key": "MOD09GQ___006/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "type": "metadata",
    "bucket": "cumulus-test-sandbox-private",
    "path": "data",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{substring(file.fileName, 0, 3)}",
    "size": 21708
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
    "key": "MOD09GQ___006/2017/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
    "type": "browse",
    "bucket": "cumulus-test-sandbox-protected",
    "path": "data",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}/{substring(file.fileName, 0, 3)}",
    "size": 1908635
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.cmr.xml",
    "key": "MOD09GQ___006/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.cmr.xml",
    "type": "metadata",
    "bucket": "cumulus-test-sandbox-protected-2",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{substring(file.fileName, 0, 3)}"
    }
    ],
    "cmrLink": "https://cmr.uat.earthdata.nasa.gov/search/granules.json?concept_id=G1222231611-CUMULUS",
    "cmrConceptId": "G1222231611-CUMULUS",
    "granuleId": "MOD09GQ.A2016358.h13v04.006.2016360104606",
    "cmrMetadataFormat": "echo10",
    "dataType": "MOD09GQ",
    "version": "006",
    "published": true
    }
    ]
    }

You can verify the granules exist within your cumulus instance (search using the Granules interface, check the S3 buckets, etc.) and validate that the above CMR entry exists.


    Build Processing Lambda

    This section discusses the construction of a custom processing lambda to replace the contrived example from this entry for a real dataset processing task.

    To ingest your own data using this example, you will need to construct your own lambda to replace the source in ProcessingStep that will generate browse imagery and provide or update a CMR metadata export file.

You will then need to add the lambda to your Cumulus deployment as an aws_lambda_function Terraform resource.

    The discussion below outlines requirements for this lambda.

    Inputs

    The incoming message to the task defined in the ProcessingStep as configured will have the following configuration values (accessible inside event.config courtesy of the message adapter):

    Configuration

    • event.config.bucket -- the name of the bucket configured in terraform.tfvars as your internal bucket.

    • event.config.collection -- The full collection object we will configure in the Configure Ingest section. You can view the expected collection schema in the docs here or in the source code on github. You need this as available input and output so you can update as needed.

    event.config.additionalUrls, generateFakeBrowse and event.config.cmrMetadataFormat from the example can be ignored as they're configuration flags for the provided example script.

    Payload

    The 'payload' from the previous task is accessible via event.input. The expected payload output schema from SyncGranules can be viewed here.

    In our example, the payload would look like the following. Note: The types are set per-file based on what we configured in our collection, and were initially added as part of the DiscoverGranules step in the DiscoverGranulesBrowseExample workflow.

     "payload": {
    "process": "modis",
    "granules": [
    {
    "granuleId": "MOD09GQ.A2016358.h13v04.006.2016360104606",
    "dataType": "MOD09GQ",
    "version": "006",
    "files": [
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "bucket": "cumulus-test-sandbox-internal",
    "key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "size": 1908635
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "bucket": "cumulus-test-sandbox-internal",
    "key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "size": 21708
    }
    ]
    }
    ]
    }

    Generating Browse Imagery

The provided example script used in the example goes through all granules and adds a 'fake' .jpg browse file to the same staging location as the data staged by prior ingest tasks.

The processing lambda you construct will need to do the following (a minimal sketch follows this list):

    • Create a browse image file based on the input data, and stage it to a location accessible to both this task and the FilesToGranules and MoveGranules tasks in a S3 bucket.
    • Add the browse file to the input granule files, making sure to set the granule file's type to browse.
    • Update meta.input_granules with the updated granules list, as well as provide the files to be integrated by FilesToGranules as output from the task.
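
The following is an illustrative sketch only, not the provided example script: it assumes the event shape shown in the Payload section above (event.input.granules), uses aws-sdk v2 as in the monitoring example elsewhere in these docs, and stands in a hypothetical generateBrowseImage helper for the dataset-specific imagery work. It returns the "files"/"granules" object discussed in the Expected Outputs section below.

const path = require('path');
const AWS = require('aws-sdk');

const s3 = new AWS.S3();

// Hypothetical, dataset-specific helper: produce browse image bytes for a staged data file.
async function generateBrowseImage(dataFile) {
  // A real implementation would read the staged data file and render a browse JPEG.
  return Buffer.from('');
}

exports.handler = async (event) => {
  const stagedFileUris = [];

  const updatedGranules = await Promise.all(event.input.granules.map(async (granule) => {
    // Pick the data file to derive browse imagery from (collection-specific logic).
    const dataFile = granule.files.find((f) => f.fileName.endsWith('.hdf'));
    const browseKey = dataFile.key.replace(/\.hdf$/, '.jpg');

    // Stage the browse image next to the data staged by SyncGranule.
    await s3.putObject({
      Bucket: dataFile.bucket,
      Key: browseKey,
      Body: await generateBrowseImage(dataFile),
    }).promise();

    const browseFile = {
      fileName: path.basename(browseKey),
      bucket: dataFile.bucket,
      key: browseKey,
      type: 'browse',
    };

    // Track every staged file URI so FilesToGranules can merge them downstream.
    const allFiles = [...granule.files, browseFile];
    allFiles.forEach((f) => stagedFileUris.push(`s3://${f.bucket}/${f.key}`));

    return { ...granule, files: allFiles };
  }));

  // "files" is mapped to the payload and "granules" to meta.input_granules
  // via the cumulus_message configuration discussed below.
  return { files: stagedFileUris, granules: updatedGranules };
};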

    Generating/updating CMR metadata

    If you do not already have a CMR file in the granules list, you will need to generate one for valid export. This example's processing script generates and adds it to the FilesToGranules file list via the payload but it can be present in the InputGranules from the DiscoverGranules task as well if you'd prefer to pre-generate it.

The downstream tasks MoveGranules, UpdateGranulesCmrMetadataFileLinks, and PostToCmr expect a valid CMR file to be available if you want to export to CMR.

    Expected Outputs for processing task/tasks

    In the above example, the critical portion of the output to FilesToGranules is the payload and meta.input_granules.

In the example provided, the processing task is set up to return an object with the keys "files" and "granules". In the cumulus_message configuration, the "files" output is mapped to the payload and the "granules" output to meta.input_granules:

              "task_config": {
    "inputGranules": "{$.meta.input_granules}",
    "granuleIdExtraction": "{$.meta.collection.granuleIdExtraction}"
    }

    Their expected values from the example above may be useful in constructing a processing task:

    payload

The payload includes a full list of files to be 'moved' into the cumulus archive. The FilesToGranules task will take this list, merge it with the information from InputGranules, then pass that list to the MoveGranules task. The MoveGranules task will then move the files to their targets. The UpdateGranulesCmrMetadataFileLinks task will update the CMR metadata file (if it exists) with the new granule locations and update the CMR file etags.

    In the provided example, a payload being passed to the FilesToGranules task should be expected to look like:

      "payload": [
    "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
    "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.cmr.xml"
    ]

This is the list of files that FilesToGranules will act upon to add/merge with the input_granules object.

    The pathing is generated from sync-granules, but in principle the files can be staged wherever you like so long as the processing/MoveGranules task's roles have access and the filename matches the collection configuration.

    input_granules

The FilesToGranules task utilizes the incoming payload to choose which files to move, but pulls all other metadata from meta.input_granules. As such, the meta.input_granules output in the example would look like:

    "input_granules": [
    {
    "granuleId": "MOD09GQ.A2016358.h13v04.006.2016360104606",
    "dataType": "MOD09GQ",
    "version": "006",
    "files": [
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "bucket": "cumulus-test-sandbox-internal",
    "key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "size": 1908635
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "bucket": "cumulus-test-sandbox-internal",
    "key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "size": 21708
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
    "bucket": "cumulus-test-sandbox-internal",
    "key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.jpg"
    }
    ]
    }
    ],
    - + \ No newline at end of file diff --git a/docs/next/data-cookbooks/choice-states/index.html b/docs/next/data-cookbooks/choice-states/index.html index 61e019dbe6a..78dff76c466 100644 --- a/docs/next/data-cookbooks/choice-states/index.html +++ b/docs/next/data-cookbooks/choice-states/index.html @@ -5,13 +5,13 @@ Choice States | Cumulus Documentation - +
    Version: Next

    Choice States

    Cumulus supports AWS Step Function Choice states. A Choice state enables branching logic in Cumulus workflows.

    Choice state definitions include a list of Choice Rules. Each Choice Rule defines a logical operation which compares an input value against a value using a comparison operator. For available comparison operators, review the AWS docs.

    If the comparison evaluates to true, the Next state is followed.

    Example

    In examples/cumulus-tf/parse_pdr_workflow.tf the ParsePdr workflow uses a Choice state, CheckAgainChoice, to terminate the workflow once meta.isPdrFinished: true is returned by the CheckStatus state.

    The CheckAgainChoice state definition requires an input object of the following structure:

    {
    "meta": {
    "isPdrFinished": false
    }
    }

    Given the above input to the CheckAgainChoice state, the workflow would transition to the PdrStatusReport state.

    "CheckAgainChoice": {
    "Type": "Choice",
    "Choices": [
    {
    "Variable": "$.meta.isPdrFinished",
    "BooleanEquals": false,
    "Next": "PdrStatusReport"
    },
    {
    "Variable": "$.meta.isPdrFinished",
    "BooleanEquals": true,
    "Next": "WorkflowSucceeded"
    }
    ],
    "Default": "WorkflowSucceeded"
    }

    Advanced: Loops in Cumulus Workflows

    Understanding the complete ParsePdr workflow is not necessary to understanding how Choice states work, but ParsePdr provides an example of how Choice states can be used to create a loop in a Cumulus workflow.

    In the complete ParsePdr workflow definition, the state QueueGranules is followed by CheckStatus. From CheckStatus a loop starts: while CheckStatus returns meta.isPdrFinished: false, CheckStatus is followed by CheckAgainChoice, then PdrStatusReport, then WaitForSomeTime, which returns to CheckStatus. Once CheckStatus returns meta.isPdrFinished: true, CheckAgainChoice proceeds to WorkflowSucceeded.

    Execution graph of SIPS ParsePdr workflow in AWS Step Functions console

    Further documentation

    For complete details on Choice state configuration options, see the Choice state documentation.

    - + \ No newline at end of file diff --git a/docs/next/data-cookbooks/cnm-workflow/index.html b/docs/next/data-cookbooks/cnm-workflow/index.html index 3c6c6d238b4..7fd223570b2 100644 --- a/docs/next/data-cookbooks/cnm-workflow/index.html +++ b/docs/next/data-cookbooks/cnm-workflow/index.html @@ -5,7 +5,7 @@ CNM Workflow | Cumulus Documentation - + @@ -13,7 +13,7 @@
    Version: Next

    CNM Workflow

    This entry documents how to set up a workflow that utilizes the built-in CNM/Kinesis functionality in Cumulus.

    Prior to working through this entry you should be familiar with the Cloud Notification Mechanism.

    Sections


    Prerequisites

    Cumulus

    This entry assumes you have a deployed instance of Cumulus (version >= 1.16.0). The entry assumes you are deploying Cumulus via the cumulus terraform module sourced from the release page.

    AWS CLI

    This entry assumes you have the AWS CLI installed and configured. If you do not, please take a moment to review the documentation - particularly the examples relevant to Kinesis - and install it now.

    Kinesis

    This entry assumes you already have two Kinesis data streams created for use as CNM notification and response data streams.

    If you do not have two streams set up, please take a moment to review the Kinesis documentation and set up two basic single-shard streams for this example:

    Using the "Create Data Stream" button on the Kinesis Dashboard, work through the dialogue to create streams similar to the following example:

    Screenshot of AWS console page for creating a Kinesis stream
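    If you prefer the command line, the AWS CLI configured in the prerequisite above can create equivalent streams; a minimal sketch, using example stream names:

    # Create two basic single-shard streams for this example (stream names are placeholders)
    aws kinesis create-stream --stream-name example-cnm-notification-stream --shard-count 1
    aws kinesis create-stream --stream-name example-cnm-response-stream --shard-count 1

    # Confirm both streams report a StreamStatus of ACTIVE before continuing
    aws kinesis describe-stream-summary --stream-name example-cnm-notification-stream
    aws kinesis describe-stream-summary --stream-name example-cnm-response-stream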

    Please bear in mind that if you create the Kinesis streams with a dashboard user, your {{prefix}}-lambda-processing IAM role will need permission to write to the response stream for this workflow to succeed. If you are using the cumulus top-level module for your deployment, this should already be set properly.

    If not, the most straightforward approach is to attach the AmazonKinesisFullAccess policy for the stream resource to whatever role your Lambdas are using; however, your environment/security policies may require an approach specific to your deployment environment.

    In operational environments, science data providers would typically be responsible for providing a Kinesis stream with the appropriate permissions.

    For more information on how this process works and how to develop a process that will add records to a stream, read the Kinesis documentation and the developer guide.

    Source Data

    This entry will run the SyncGranule task against a single target data file. To that end it will require a single data file to be present in an S3 bucket matching the Provider configured in the next section.

    Collection and Provider

    Cumulus will need to be configured with a Collection and Provider entry of your choosing. The provider should match the location of the source data from the Source Data section above.

    This can be done via the Cumulus Dashboard, if installed, or via the API. It is strongly recommended to use the dashboard if possible.


    Configure the Workflow

    Provided the prerequisites have been fulfilled, you can begin adding the needed values to your Cumulus configuration to configure the example workflow.

    The following are steps that are required to set up your Cumulus instance to run the example workflow:

    Example CNM Workflow

    In this example, we're going to trigger a workflow by creating a Kinesis rule and sending a record to a Kinesis stream.

    The following workflow definition should be added to a new .tf workflow resource (e.g. cnm_workflow.tf) in your deployment directory. For the complete CNM workflow example, see examples/cumulus-tf/cnm_workflow.tf.

    Add the following to the new Terraform file in your deployment directory, updating these values:

    • Set the response-endpoint key in the CnmResponse task in the workflow JSON to match the name of the Kinesis response stream you configured in the prerequisites section
    • Update the source key of the workflow module to match the Cumulus release associated with your deployment.
    module "cnm_workflow" {
    source = "https://github.com/nasa/cumulus/releases/download/{version}/terraform-aws-cumulus-workflow.zip"

    prefix = var.prefix
    name = "CNMExampleWorkflow"
    workflow_config = module.cumulus.workflow_config
    system_bucket = var.system_bucket

    state_machine_definition = <<JSON
    {
    "CNMExampleWorkflow": {
    "Comment": "CNMExampleWorkflow",
    "StartAt": "TranslateMessage",
    "States": {
    "TranslateMessage": {
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_to_cma_task.arn}",
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "collection": "{$.meta.collection}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.cnm}",
    "destination": "{$.meta.cnm}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "CnmResponse"
    }
    ],
    "Next": "SyncGranule"
    },
    "SyncGranule": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "provider": "{$.meta.provider}",
    "buckets": "{$.meta.buckets}",
    "collection": "{$.meta.collection}",
    "downloadBucket": "{$.meta.buckets.private.name}",
    "stack": "{$.meta.stack}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.granules}",
    "destination": "{$.meta.input_granules}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${module.cumulus.sync_granule_task.task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "IntervalSeconds": 10,
    "MaxAttempts": 3
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "CnmResponse"
    }
    ],
    "Next": "CnmResponse"
    },
    "CnmResponse": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "OriginalCNM": "{$.meta.cnm}",
    "distribution_endpoint": "{$.meta.distribution_endpoint}",
    "response-endpoint": "ADD YOUR RESPONSE STREAM NAME HERE",
    "region": "us-east-1",
    "type": "kinesis",
    "WorkflowException": "{$.exception}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.cnm}",
    "destination": "{$.meta.cnmResponse}"
    },
    {
    "source": "{$.input.input}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_response_task.arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "IntervalSeconds": 5,
    "MaxAttempts": 3
    }
    ],
    "End": true
    }
    }
    }
    }
    JSON
    }

    Again, please make sure to modify the value response-endpoint to match the stream name (not ARN) for your Kinesis response stream.

    Lambda Configuration

    To execute this workflow, you must include several Lambda resources in your deployment. To do this, add the following task (Lambda) definitions, along with the workflow you created above:

    note

    To utilize these tasks you need to ensure you have a compatible CMA layer. See the deployment instructions for more details on how to deploy a CMA layer.

    Below is a description of each of these tasks:

    CNMToCMA

    CNMToCMA is meant for the beginning of a workflow: it maps CNM granule information to a payload for downstream tasks. For other CNM workflows, you would need to ensure that downstream tasks in your workflow either understand the CNM message or include a translation task like this one.

    You can also manipulate the data sent to downstream tasks using task_config for various states in your workflow resource configuration. Read more about how to configure data on the Workflow Input & Output page.

    CnmResponse

    The CnmResponse Lambda generates a CNM response message and puts it on the response-endpoint Kinesis stream.

    You can read more about the expected schema of a CnmResponse record in the Cloud Notification Mechanism schema repository.

    Additional Tasks

    Lastly, this entry also makes use of the SyncGranule task from the cumulus module.

    Redeploy

    Once the above configuration changes have been made, redeploy your stack.

    Please refer to Update Cumulus resources in the deployment documentation if you are unfamiliar with redeployment.

    Rule Configuration

    Cumulus includes a messageConsumer Lambda function (message-consumer). Cumulus kinesis-type rules create the event source mappings between Kinesis streams and the messageConsumer Lambda. The messageConsumer Lambda consumes records from one or more Kinesis streams, as defined by enabled kinesis-type rules. When new records are pushed to one of these streams, the messageConsumer triggers workflows associated with the enabled kinesis-type rules.

    To add a rule via the dashboard (if you'd like to use the API, see the docs here), navigate to the Rules page and click Add a rule, then configure the new rule using the following template (substituting correct values for the parameters wrapped in {{ }}):

    {
    "collection": {
    "name": "L2_HR_PIXC",
    "version": "000"
    },
    "name": "L2_HR_PIXC_kinesisRule",
    "provider": "PODAAC_SWOT",
    "rule": {
    "type": "kinesis",
    "value": "arn:aws:kinesis:{{awsRegion}}:{{awsAccountId}}:stream/{{streamName}}"
    },
    "state": "ENABLED",
    "workflow": "CNMExampleWorkflow"
    }
    note
    • The rule's value attribute must match the Amazon Resource Name (ARN) for the Kinesis data stream you've preconfigured. You should be able to obtain this ARN from the Kinesis Dashboard entry for the selected stream, or via the CLI as shown below.
    • The collection and provider should match the collection and provider you set up in the Prerequisites section.
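    A sketch of looking the ARN up from the CLI instead (the stream name is a placeholder):

    # Print the ARN of a preconfigured Kinesis data stream
    aws kinesis describe-stream-summary \
    --stream-name <streamName> \
    --query 'StreamDescriptionSummary.StreamARN' \
    --output text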

    Once you've clicked 'Submit', a new rule should appear in the dashboard's Rule Overview.


    Execute the Workflow

    Once Cumulus has been redeployed and a rule has been added, we're ready to trigger the workflow and watch it execute.

    How to Trigger the Workflow

    To trigger matching workflows, you will need to put a record on the Kinesis stream that the message-consumer Lambda will recognize as a matching event. Most importantly, it should include a collection name that matches a valid collection.

    For the purpose of this example, the easiest way to accomplish this is using the AWS CLI.

    Create Record JSON

    Construct a JSON file containing an object that matches the values that have been previously set up. This JSON object should be a valid Cloud Notification Mechanism message.

    note

    This example is somewhat contrived, as the downstream tasks don't care about most of these fields. A 'real' data ingest workflow would.

    The following values (denoted by ${} in the sample below) should be replaced to match values we've previously configured:

    • TEST_DATA_FILE_NAME: The filename of the test data that is available in the S3 (or other) provider we created earlier.
    • TEST_DATA_URI: The full S3 path to the test data (e.g. s3://bucket-name/path/granule)
    • COLLECTION: The collection name defined in the prerequisites for this product
    {
    "product": {
    "files": [
    {
    "checksumType": "md5",
    "name": "${TEST_DATA_FILE_NAME}",
    "checksum": "bogus_checksum_value",
    "uri": "${TEST_DATA_URI}",
    "type": "data",
    "size": 12345678
    }
    ],
    "name": "${TEST_DATA_FILE_NAME}",
    "dataVersion": "006"
    },
    "identifier ": "testIdentifier123456",
    "collection": "${COLLECTION}",
    "provider": "TestProvider",
    "version": "001",
    "submissionTime": "2017-09-30T03:42:29.791198"
    }

    Add Record to Kinesis Data Stream

    Using the JSON file you created, push it to the Kinesis notification stream:

    aws kinesis put-record --stream-name YOUR_KINESIS_NOTIFICATION_STREAM_NAME_HERE --partition-key 1 --data file:///path/to/file.json
    note

    The above command uses the stream name, not the ARN.

    The command should return output similar to:

    {
    "ShardId": "shardId-000000000000",
    "SequenceNumber": "42356659532578640215890215117033555573986830588739321858"
    }

    This command will put a record containing the JSON from the --data flag onto the Kinesis data stream. The messageConsumer Lambda will consume the record and construct a valid CMA payload to trigger workflows. For this example, the record will trigger the CNMExampleWorkflow workflow as defined by the rule previously configured.

    You can view the current running executions on the Executions dashboard page which presents a list of all executions, their status (running, failed, or completed), to which workflow the execution belongs, along with other information.

    Verify Workflow Execution

    As detailed above, once the record is added to the Kinesis data stream, the messageConsumer Lambda will trigger the CNMExampleWorkflow.

    TranslateMessage

    TranslateMessage (which corresponds to the CNMToCMA Lambda) will take the CNM object payload and add a granules object to the CMA payload that's consistent with other Cumulus ingest tasks, and add a meta.cnm key (as well as the payload) to store the original message.

    info

    For more on the Message Adapter, please see the Message Flow documentation.

    An example of what is happening in the CNMToCMA Lambda is as follows:

    Example Input Payload:

    "payload": {
    "identifier ": "testIdentifier123456",
    "product": {
    "files": [
    {
    "checksumType": "md5",
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "checksum": "bogus_checksum_value",
    "uri": "s3://some_bucket/cumulus-test-data/pdrs/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "type": "data",
    "size": 12345678
    }
    ],
    "name": "TestGranuleUR",
    "dataVersion": "006"
    },
    "version": "123456",
    "collection": "MOD09GQ",
    "provider": "TestProvider",
    "submissionTime": "2017-09-30T03:42:29.791198"
    }

    Example Output Payload:

      "payload": {
    "cnm": {
    "identifier ": "testIdentifier123456",
    "product": {
    "files": [
    {
    "checksumType": "md5",
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "checksum": "bogus_checksum_value",
    "uri": "s3://some-bucket/cumulus-test-data/data/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "type": "data",
    "size": 12345678
    }
    ],
    "name": "TestGranuleUR",
    "dataVersion": "006"
    },
    "version": "123456",
    "collection": "MOD09GQ",
    "provider": "TestProvider",
    "submissionTime": "2017-09-30T03:42:29.791198",
    "receivedTime": "2017-09-30T03:42:31.634552"
    },
    "output": {
    "granules": [
    {
    "granuleId": "TestGranuleUR",
    "files": [
    {
    "path": "some-bucket/data",
    "url_path": "s3://some-bucket/cumulus-test-data/data/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "bucket": "some-bucket",
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "size": 12345678
    }
    ]
    }
    ]
    }
    }

    SyncGranule

    This Lambda will take the files listed in the payload and move them to s3://{deployment-private-bucket}/file-staging/{deployment-name}/{COLLECTION}/{file_name}.

    CnmResponse

    Assuming a successful execution of the workflow, this task will recover the meta.cnm key from the CMA output, and add a "SUCCESS" record to the response-endpoint Kinesis stream.

    If a prior step in the workflow has failed, this will add a "FAILURE" record to the stream instead.

    The data written to the response-endpoint should adhere to the Response Message Fields schema.

    Example CNM Success Response:

    {
    "provider": "PODAAC_SWOT",
    "collection": "SWOT_Prod_l2:1",
    "processCompleteTime": "2017-09-30T03:45:29.791198",
    "submissionTime": "2017-09-30T03:42:29.791198",
    "receivedTime": "2017-09-30T03:42:31.634552",
    "identifier": "1234-abcd-efg0-9876",
    "response": {
    "status": "SUCCESS"
    }
    }

    Example CNM Error Response:

    {
    "provider": "PODAAC_SWOT",
    "collection": "SWOT_Prod_l2:1",
    "processCompleteTime": "2017-09-30T03:45:29.791198",
    "submissionTime": "2017-09-30T03:42:29.791198",
    "receivedTime": "2017-09-30T03:42:31.634552",
    "identifier": "1234-abcd-efg0-9876",
    "response": {
    "status": "FAILURE",
    "errorCode": "PROCESSING_ERROR",
    "errorMessage": "File [cumulus-dev-a4d38f59-5e57-590c-a2be-58640db02d91/prod_20170926T11:30:36/production_file.nc] did not match gve checksum value."
    }
    }

    Note the CnmResponse state defined in the .tf workflow definition above configures $.exception to be passed to the CnmResponse Lambda keyed under config.WorkflowException. This is required for the CnmResponse code to deliver a failure response.

    To test the failure scenario, send a record missing the product.name key.
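    For example, assuming jq is available and file.json is the record created above, you could strip that key and push the modified record (paths and the stream name are the same placeholders used earlier):

    # Remove product.name so CnmResponse writes a FAILURE record to the response stream
    jq 'del(.product.name)' /path/to/file.json > /path/to/file-missing-name.json
    aws kinesis put-record --stream-name YOUR_KINESIS_NOTIFICATION_STREAM_NAME_HERE --partition-key 1 --data file:///path/to/file-missing-name.json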


    Verify results

    Check for successful execution on the dashboard

    Following the successful execution of this workflow, you should expect to see the workflow complete successfully on the dashboard:

    Screenshot of a successful CNM workflow appearing on the executions page of the Cumulus dashboard

    Check the test granule has been delivered to S3 staging

    The test granule identified in the Kinesis record should be moved to the deployment's private staging area.
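    One way to spot-check this is listing the staging prefix described above (the bucket and key prefix below are illustrative; substitute your deployment's private bucket, deployment name, and collection):

    # List staged files for the test granule
    aws s3 ls s3://<deployment-private-bucket>/file-staging/<deployment-name>/<COLLECTION>/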

    Check for Kinesis records

    A SUCCESS notification should be present on the response-endpoint Kinesis stream.

    You should be able to validate the notification and response streams have the expected records with the following steps (the AWS CLI Kinesis Basic Stream Operations is useful to review before proceeding):

    Get a shard iterator (substituting your stream name as appropriate):

    aws kinesis get-shard-iterator \
    --shard-id shardId-000000000000 \
    --shard-iterator-type LATEST \
    --stream-name NOTIFICATION_OR_RESPONSE_STREAM_NAME

    which should return output similar to:

    {
    "ShardIterator": "VeryLongString=="
    }
    • Re-trigger the workflow by using the put-record command from above.
    • As the workflow completes, use the output from the get-shard-iterator command to request data from the stream:
    aws kinesis get-records --shard-iterator SHARD_ITERATOR_VALUE

    This should result in output similar to:

    {
    "Records": [
    {
    "SequenceNumber": "49586720336541656798369548102057798835250389930873978882",
    "ApproximateArrivalTimestamp": 1532664689.128,
    "Data": "eyJpZGVudGlmaWVyICI6InRlc3RJZGVudGlmaWVyMTIzNDU2IiwidmVyc2lvbiI6IjAwNiIsImNvbGxlY3Rpb24iOiJNT0QwOUdRIiwicHJvdmlkZXIiOiJUZXN0UHJvdmlkZXIiLCJwcm9kdWN0U2l6ZSI6MTkwODYzNS4wLCJyZXNwb25zZSI6eyJzdGF0dXMiOiJTVUNDRVNTIn0sInByb2Nlc3NDb21wbGV0ZVRpbWUiOiIyMDE4LTA3LTI3VDA0OjExOjI4LjkxOSJ9",
    "PartitionKey": "1"
    },
    {
    "SequenceNumber": "49586720336541656798369548102059007761070005796999266306",
    "ApproximateArrivalTimestamp": 1532664707.149,
    "Data": "eyJpZGVudGlmaWVyICI6InRlc3RJZGVudGlmaWVyMTIzNDU2IiwidmVyc2lvbiI6IjAwNiIsImNvbGxlY3Rpb24iOiJNT0QwOUdRIiwicHJvdmlkZXIiOiJUZXN0UHJvdmlkZXIiLCJwcm9kdWN0U2l6ZSI6MTkwODYzNS4wLCJyZXNwb25zZSI6eyJzdGF0dXMiOiJTVUNDRVNTIn0sInByb2Nlc3NDb21wbGV0ZVRpbWUiOiIyMDE4LTA3LTI3VDA0OjExOjQ2Ljk1OCJ9",
    "PartitionKey": "1"
    }
    ],
    "NextShardIterator": "AAAAAAAAAAFo9SkF8RzVYIEmIsTN+1PYuyRRdlj4Gmy3dBzsLEBxLo4OU+2Xj1AFYr8DVBodtAiXbs3KD7tGkOFsilD9R5tA+5w9SkGJZ+DRRXWWCywh+yDPVE0KtzeI0andAXDh9yTvs7fLfHH6R4MN9Gutb82k3lD8ugFUCeBVo0xwJULVqFZEFh3KXWruo6KOG79cz2EF7vFApx+skanQPveIMz/80V72KQvb6XNmg6WBhdjqAA==",
    "MillisBehindLatest": 0
    }

    Note that the data encoding is not human readable and would need to be parsed/converted to be interpretable. There are many options for building a Kinesis consumer, such as the KCL.
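    To spot-check a single record locally, you can base64-decode the Data field (GNU coreutils shown; the decode flag may differ on other platforms):

    # Decode the Data field from the get-records output into readable JSON
    echo '<Data value from the get-records output>' | base64 --decode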

    For purposes of validating the workflow, it may be simpler to locate the workflow in the Step Function Management Console and confirm the expected output is similar to the example below.

    Successful CNM Response Object Example:

    {
    "cnmResponse": {
    "provider": "TestProvider",
    "collection": "MOD09GQ",
    "version": "123456",
    "processCompleteTime": "2017-09-30T03:45:29.791198",
    "submissionTime": "2017-09-30T03:42:29.791198",
    "receivedTime": "2017-09-30T03:42:31.634552",
    "identifier ": "testIdentifier123456",
    "response": {
    "status": "SUCCESS"
    }
    }
    }

    Kinesis Record Error Handling

    messageConsumer

    The default Kinesis stream processing in the Cumulus system is configured for record error tolerance.

    When the messageConsumer fails to process a record, the failure is captured and the record is published to the kinesisFallback SNS topic. The kinesisFallback SNS topic broadcasts the record, and a subscribed copy of the messageConsumer Lambda, named kinesisFallback, consumes these failures.

    At this point, the normal Lambda asynchronous invocation retry behavior will attempt to process the record 3 more times. After this, if the record cannot successfully be processed, it is written to a dead letter queue. Cumulus' dead letter queue is an SQS Queue named kinesisFailure. Operators can use this queue to inspect failed records.

    This system ensures that when the messageConsumer fails to process a record and trigger a workflow, the record is retried 3 times. This retry behavior improves system reliability in the case of any external service failure outside of Cumulus' control.

    The Kinesis error handling system - the kinesisFallback SNS topic, messageConsumer Lambda, and kinesisFailure SQS queue - comes with the API package and does not need to be configured by the operator.

    To examine records that could not be processed at any step, look at the dead letter queue {{prefix}}-kinesisFailure in the Simple Queue Service (SQS) console. Select your queue, and under the Queue Actions tab, choose View/Delete Messages. Start polling for messages and you will see records that failed to process through the messageConsumer.
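    The same records can be pulled from the command line; a sketch, assuming your deployment prefix:

    # Resolve the dead letter queue URL and poll it for failed records
    QUEUE_URL=$(aws sqs get-queue-url --queue-name <prefix>-kinesisFailure --query QueueUrl --output text)
    aws sqs receive-message --queue-url "$QUEUE_URL" --max-number-of-messages 10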

    Note that these are only failures that occurred while processing records from Kinesis streams. Workflow failures are handled differently.

    Kinesis Stream logging

    Notification Stream messages

    Cumulus includes two Lambdas (KinesisInboundEventLogger and KinesisOutboundEventLogger) that utilize the same code to take a Kinesis record event as input, deserialize the data field and output the modified event to the logs.

    When a kinesis rule is created, in addition to the messageConsumer event mapping, an event mapping is created to trigger KinesisInboundEventLogger to record a log of the inbound record, to allow for analysis in case of unexpected failure.

    Response Stream messages

    Cumulus also supports this feature for all outbound messages. To take advantage of this feature, you will need to set an event mapping on the KinesisOutboundEventLogger Lambda that targets your response-endpoint. You can do this in the Lambda management page for KinesisOutboundEventLogger. Add a Kinesis trigger, and configure it to target the cnmResponseStream for your workflow:

    Screenshot of the AWS console showing configuration for Kinesis stream trigger on KinesisOutboundEventLogger Lambda

    Once this is done, all records sent to the response-endpoint will also be logged in CloudWatch. For more on configuring Lambdas to trigger on Kinesis events, please see creating an event source mapping.
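    The same mapping can also be created from the CLI rather than the Lambda console; a sketch, assuming your deployment prefix and response stream (the deployed function name may differ in your stack):

    # Trigger KinesisOutboundEventLogger from the response stream
    aws lambda create-event-source-mapping \
    --function-name <prefix>-KinesisOutboundEventLogger \
    --event-source-arn arn:aws:kinesis:<awsRegion>:<awsAccountId>:stream/<responseStreamName> \
    --starting-position LATEST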

    - + \ No newline at end of file diff --git a/docs/next/data-cookbooks/error-handling/index.html b/docs/next/data-cookbooks/error-handling/index.html index 49a72f57289..6875fb73bad 100644 --- a/docs/next/data-cookbooks/error-handling/index.html +++ b/docs/next/data-cookbooks/error-handling/index.html @@ -5,7 +5,7 @@ Error Handling in Workflows | Cumulus Documentation - + @@ -45,7 +45,7 @@ Service Exception. See this documentation on configuring your workflow to handle transient lambda errors.

    Example state machine definition:

    {
    "Comment": "Tests Workflow from Kinesis Stream",
    "StartAt": "TranslateMessage",
    "States": {
    "TranslateMessage": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.cnm}",
    "destination": "{$.meta.cnm}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_to_cma_task.arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": ["States.ALL"],
    "ResultPath": "$.exception",
    "Next": "CnmResponseFail"
    }
    ],
    "Next": "SyncGranule"
    },
    "SyncGranule": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "Path": "$.payload",
    "TargetPath": "$.payload"
    },
    "task_config": {
    "provider": "{$.meta.provider}",
    "buckets": "{$.meta.buckets}",
    "collection": "{$.meta.collection}",
    "downloadBucket": "{$.meta.buckets.private.name}",
    "stack": "{$.meta.stack}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.granules}",
    "destination": "{$.meta.input_granules}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${module.cumulus.sync_granule_task.task_arn}",
    "Retry": [
    {
    "ErrorEquals": ["States.ALL"],
    "IntervalSeconds": 10,
    "MaxAttempts": 3
    }
    ],
    "Catch": [
    {
    "ErrorEquals": ["States.ALL"],
    "ResultPath": "$.exception",
    "Next": "CnmResponseFail"
    }
    ],
    "Next": "CnmResponse"
    },
    "CnmResponse": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "OriginalCNM": "{$.meta.cnm}",
    "CNMResponseStream": "{$.meta.cnmResponseStream}",
    "region": "us-east-1",
    "WorkflowException": "{$.exception}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$}",
    "destination": "{$.meta.cnmResponse}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_response_task.arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": ["States.ALL"],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "Next": "WorkflowSucceeded"
    },
    "CnmResponseFail": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "OriginalCNM": "{$.meta.cnm}",
    "CNMResponseStream": "{$.meta.cnmResponseStream}",
    "region": "us-east-1",
    "WorkflowException": "{$.exception}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$}",
    "destination": "{$.meta.cnmResponse}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_response_task.arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": ["States.ALL"],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "Next": "WorkflowFailed"
    },
    "WorkflowSucceeded": {
    "Type": "Succeed"
    },
    "WorkflowFailed": {
    "Type": "Fail",
    "Cause": "Workflow failed"
    }
    }
    }

    The above results in a workflow which is visualized in the diagram below:

    Screenshot of a visualization of an AWS Step Function workflow definition with branching logic for failures

    Summary

    Error handling should (mostly) be the domain of workflow configuration.

    - + \ No newline at end of file diff --git a/docs/next/data-cookbooks/hello-world/index.html b/docs/next/data-cookbooks/hello-world/index.html index 0c0a0db59d6..1eb90642e2f 100644 --- a/docs/next/data-cookbooks/hello-world/index.html +++ b/docs/next/data-cookbooks/hello-world/index.html @@ -5,14 +5,14 @@ HelloWorld Workflow | Cumulus Documentation - +
    Version: Next

    HelloWorld Workflow

    Example task meant to be a sanity check/introduction to the Cumulus workflows.

    Pre-Deployment Configuration

    Workflow Configuration

    A workflow definition can be found in the template repository hello_world_workflow module.

    {
    "Comment": "Returns Hello World",
    "StartAt": "HelloWorld",
    "States": {
    "HelloWorld": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "buckets": "{$.meta.buckets}",
    "provider": "{$.meta.provider}",
    "collection": "{$.meta.collection}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${module.cumulus.hello_world_task.task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "End": true
    }
    }
    }

    Workflow error-handling can be configured as discussed in the Error-Handling cookbook.

    Task Configuration

    The HelloWorld task is provided for you as part of the cumulus terraform module; no configuration is needed.

    If you want to manually deploy your own version of this Lambda for testing, you can copy the Lambda resource definition located in the Cumulus source code at cumulus/tf-modules/ingest/hello-world-task.tf. The Lambda source code is located in the Cumulus source code at 'cumulus/tasks/hello-world'.

    Execution

    We will focus on using the Cumulus dashboard to schedule the execution of a HelloWorld workflow.

    Our goal here is to create a rule through the Cumulus dashboard that will define the scheduling and execution of our HelloWorld workflow. Let's navigate to the Rules page and click Add a rule.

    {
    "collection": { # collection values can be configured and found on the Collections page
    "name": "${collection_name}",
    "version": "${collection_version}"
    },
    "name": "helloworld_rule",
    "provider": "${provider}", # found on the Providers page
    "rule": {
    "type": "onetime"
    },
    "state": "ENABLED",
    "workflow": "HelloWorldWorkflow" # This can be found on the Workflows page
    }

    Screenshot of AWS Step Function execution graph for the HelloWorld workflow (executed workflow as seen in the AWS Console)

    Output/Results

    The Executions page presents a list of all executions, their status (running, failed, or completed), to which workflow the execution belongs, along with other information. The rule defined in the previous section should start an execution of its own accord, and the status of that execution can be tracked here.

    To get some deeper information on the execution, click on the value in the Name column of your execution of interest. This should bring up a visual representation of the workflow similar to that shown above, execution details, and a list of events.

    Summary

    Setting up the HelloWorld workflow on the Cumulus dashboard is the tip of the iceberg, so to speak. The task and step-function need to be configured before Cumulus deployment. A compatible collection and provider must be configured and applied to the rule. Finally, workflow execution status can be viewed via the workflows tab on the dashboard.

    - + \ No newline at end of file diff --git a/docs/next/data-cookbooks/ingest-notifications/index.html b/docs/next/data-cookbooks/ingest-notifications/index.html index 441a530baaf..dacfd88a6e7 100644 --- a/docs/next/data-cookbooks/ingest-notifications/index.html +++ b/docs/next/data-cookbooks/ingest-notifications/index.html @@ -5,13 +5,13 @@ Ingest Notification in Workflows | Cumulus Documentation - +
    Version: Next

    Ingest Notification in Workflows

    On deployment, an SQS queue and three SNS topics (one each for executions, granules, and PDRs) are created and used for handling notification messages related to workflows.

    The ingest notification reporting SQS queue is populated via a Cloudwatch rule for any Step Function execution state transitions. The sfEventSqsToDbRecords Lambda consumes this queue. The queue and Lambda are included in the cumulus module and the Cloudwatch rule in the workflow module and are included by default in a Cumulus deployment.

    The sfEventSqsToDbRecords Lambda function reads from the sfEventSqsToDbRecordsInputQueue queue and updates the RDS database records for granules, executions, and PDRs. When the records are updated, messages are posted to the three SNS topics. This Lambda is invoked both when the workflow starts and when it reaches a terminal state (completion or failure).

    Diagram of architecture for reporting workflow ingest notifications from AWS Step Functions

    Sending SQS messages to report status

    Publishing granule/PDR reports directly to the SQS queue

    If you have a non-Cumulus workflow or process ingesting data and would like to update the status of your granules or PDRs, you can publish directly to the reporting SQS queue. Publishing messages to this queue will result in those messages being stored as granule/PDR records in the Cumulus database and in the status of those granules/PDRs being visible on the Cumulus dashboard. The queue does have certain expectations of the message format: it expects a Cumulus Message nested within a Cloudwatch Step Function Event object.

    Posting directly to the queue will require knowing the queue URL. Assuming that you are using the cumulus module for your deployment, you can get the queue URL by adding it (along with the related topic ARNs) to outputs.tf for your Terraform deployment, as in our example deployment:

    output "stepfunction_event_reporter_queue_url" {
    value = module.cumulus.stepfunction_event_reporter_queue_url
    }

    output "report_executions_sns_topic_arn" {
    value = module.cumulus.report_executions_sns_topic_arn
    }
    output "report_granules_sns_topic_arn" {
    value = module.cumulus.report_executions_sns_topic_arn
    }
    output "report_pdrs_sns_topic_arn" {
    value = module.cumulus.report_pdrs_sns_topic_arn
    }

    Then, when you run terraform apply, you should see the queue URL and topic ARNs printed to your console:

    Outputs:
    ...
    stepfunction_event_reporter_queue_url = https://sqs.us-east-1.amazonaws.com/xxxxxxxxx/<prefix>-sfEventSqsToDbRecordsInputQueue
    report_executions_sns_topic_arn = arn:aws:sns:us-east-1:xxxxxxxxx:<prefix>-report-executions-topic
    report_granules_sns_topic_arn = arn:aws:sns:us-east-1:xxxxxxxxx:<prefix>-report-granules-topic
    report_pdrs_sns_topic_arn = arn:aws:sns:us-east-1:xxxxxxxxx:<prefix>-report-pdrs-topic

    Once you have the queue URL, you can use the AWS SDK for your language of choice to publish messages to the queue. The expected format of these messages is that of a Cloudwatch Step Function event containing a Cumulus message. For SUCCEEDED events, the Cumulus message is expected to be in detail.output. For all other event statuses, a Cumulus Message is expected in detail.input. The Cumulus Message populating these fields MUST be a JSON string, not an object. Messages that do not conform to the schemas will fail to be created as records.
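    For illustration only, a minimal sketch of the mechanics using the AWS CLI rather than an SDK (the queue URL is a placeholder and the message body shows only the general shape; a real message must fully conform to the record schemas referenced above):

    # Send a Cloudwatch Step Function event wrapping a JSON-stringified Cumulus message
    aws sqs send-message \
    --queue-url "https://sqs.us-east-1.amazonaws.com/xxxxxxxxx/<prefix>-sfEventSqsToDbRecordsInputQueue" \
    --message-body '{"detail": {"status": "SUCCEEDED", "output": "<JSON-stringified Cumulus message>"}}'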

    If you are not seeing records persist to the database or show up in the Cumulus dashboard, you can investigate the Cloudwatch logs of the SQS consumer Lambda:

    • /aws/lambda/<prefix>-sfEventSqsToDbRecords
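    With AWS CLI v2, these logs can be tailed directly while reproducing the issue (the function name below assumes the default naming shown above):

    # Tail the SQS consumer Lambda's CloudWatch logs
    aws logs tail /aws/lambda/<prefix>-sfEventSqsToDbRecords --follow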

    In a workflow

    As described above, ingest notifications will automatically be published to the SNS topics on workflow start and completion/failure, so you should not include a workflow step to publish the initial or final status of your workflows.

    However, if you want to report your ingest status at any point during a workflow execution, you can add a workflow step using the SfSqsReport Lambda. In the following example from cumulus-tf/parse_pdr_workflow.tf, the ParsePdr workflow is configured to use the SfSqsReport Lambda, primarily to update the PDR ingestion status.

    info

    ${sf_sqs_report_task_arn} is an interpolated value referring to a Terraform resource. See the example deployment code for the ParsePdr workflow.

      "PdrStatusReport": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "cumulus_message": {
    "input": "{$}"
    }
    }
    }
    },
    "ResultPath": null,
    "Type": "Task",
    "Resource": "${sf_sqs_report_task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "Next": "WaitForSomeTime"
    },

    Subscribing additional listeners to SNS topics

    Additional listeners to SNS topics can be configured in a .tf file for your Cumulus deployment. Shown below is configuration that subscribes an additional Lambda function (test_lambda) to receive messages from the report_executions SNS topic. To subscribe to the report_granules or report_pdrs SNS topics instead, simply replace report_executions in the code block below with either of those values.

    resource "aws_lambda_function" "test_lambda" {
    function_name = "${var.prefix}-testLambda"
    filename = "./testLambda.zip"
    source_code_hash = filebase64sha256("./testLambda.zip")
    handler = "index.handler"
    role = module.cumulus.lambda_processing_role_arn
    runtime = "nodejs10.x"
    }

    resource "aws_sns_topic_subscription" "test_lambda" {
    topic_arn = module.cumulus.report_executions_sns_topic_arn
    protocol = "lambda"
    endpoint = aws_lambda_function.test_lambda.arn
    }

    resource "aws_lambda_permission" "test_lambda" {
    action = "lambda:InvokeFunction"
    function_name = aws_lambda_function.test_lambda.arn
    principal = "sns.amazonaws.com"
    source_arn = module.cumulus.report_executions_sns_topic_arn
    }

    SNS message format

    Subscribers to the SNS topics can expect to find the published message in the SNS event at Records[0].Sns.Message. The message will be a JSON stringified version of the ingest notification record for an execution or a PDR. For granules, the message will be a JSON stringified object with the ingest notification record in the record property and the event type as the event property.

    The ingest notification record of the execution, granule, or PDR should conform to the data model schema for the given record type.

    Summary

    Workflows can be configured to send SQS messages at any point using the sf-sqs-report task.

    Additional listeners can be easily configured to trigger when messages are sent to the SNS topics.

    - + \ No newline at end of file diff --git a/docs/next/data-cookbooks/queue-post-to-cmr/index.html b/docs/next/data-cookbooks/queue-post-to-cmr/index.html index a4d367db94c..764aa0dfbd7 100644 --- a/docs/next/data-cookbooks/queue-post-to-cmr/index.html +++ b/docs/next/data-cookbooks/queue-post-to-cmr/index.html @@ -5,13 +5,13 @@ Queue PostToCmr | Cumulus Documentation - +
    Version: Next

    Queue PostToCmr

    In this document, we walk through handling CMR errors in workflows by queueing PostToCmr. We assume that the user already has an ingest workflow set up.

    Overview

    The general concept is that the last task of the ingest workflow will be QueueWorkflow, which queues the publish workflow. The publish workflow contains the PostToCmr task and if a CMR error occurs during PostToCmr, the publish workflow will add itself back onto the queue so that it can be executed when CMR is back online. This is achieved by leveraging the QueueWorkflow task again in the publish workflow. The following diagram demonstrates this queueing process.

    Diagram of workflow queueing

    Ingest Workflow

    The last step should be the QueuePublishWorkflow step. It should be configured with a queueUrl and workflow. In this case, the queueUrl is a throttled queue. Any queueUrl can be specified here, which is useful if you would like to use a lower priority queue. The workflow is the unprefixed workflow name that you would like to queue (e.g. PublishWorkflow).

      "QueuePublishWorkflowStep": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "workflow": "{$.meta.workflow}",
    "queueUrl": "${start_sf_queue_url}",
    "provider": "{$.meta.provider}",
    "collection": "{$.meta.collection}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${queue_workflow_task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "End": true
    },

    Publish Workflow

    Configure the Catch section of your PostToCmr task to proceed to QueueWorkflow if a CMRInternalError is caught. Any other error will cause the workflow to fail.

      "Catch": [
    {
    "ErrorEquals": [
    "CMRInternalError"
    ],
    "Next": "RequeueWorkflow"
    },
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "Next": "WorkflowFailed",
    "ResultPath": "$.exception"
    }
    ],

    Then, configure the QueueWorkflow task similarly to its configuration in the ingest workflow. This time, pass the current publish workflow to the task config. This allows for the publish workflow to be requeued when there is a CMR error.

    {
    "RequeueWorkflow": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "buckets": "{$.meta.buckets}",
    "distribution_endpoint": "{$.meta.distribution_endpoint}",
    "workflow": "PublishGranuleQueue",
    "queueUrl": "${start_sf_queue_url}",
    "provider": "{$.meta.provider}",
    "collection": "{$.meta.collection}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${queue_workflow_task_arn}",
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "Next": "WorkflowFailed",
    "ResultPath": "$.exception"
    }
    ],
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "End": true
    }
    }
    - + \ No newline at end of file diff --git a/docs/next/data-cookbooks/run-tasks-in-lambda-or-docker/index.html b/docs/next/data-cookbooks/run-tasks-in-lambda-or-docker/index.html index 5267e8176e9..1e4cbf45e08 100644 --- a/docs/next/data-cookbooks/run-tasks-in-lambda-or-docker/index.html +++ b/docs/next/data-cookbooks/run-tasks-in-lambda-or-docker/index.html @@ -5,13 +5,13 @@ Run Step Function Tasks in AWS Lambda or Docker | Cumulus Documentation - +
    Version: Next

    Run Step Function Tasks in AWS Lambda or Docker

    Overview

    AWS Step Function Tasks can run tasks on AWS Lambda or on AWS Elastic Container Service (ECS) as a Docker container.

    Lambda provides a serverless architecture and is the best option for minimizing cost and server management. ECS provides the fullest extent of AWS EC2 resources, with the flexibility to execute arbitrary code on any AWS EC2 instance type.

    When to use Lambda

    You should use AWS Lambda whenever all of the following are true:

    • The task runs on one of the supported Lambda Runtimes. At the time of this writing, supported runtimes include versions of Python, Java, Ruby, Node.js, Go, and .NET.
    • The lambda package is less than 50 MB in size, zipped.
    • The task consumes less than each of the following resources:
      • 3008 MB memory allocation
      • 512 MB disk storage (must be written to /tmp)
      • 15 minutes of execution time
    info

    See this page for a complete and up-to-date list of AWS Lambda limits.

    If your task requires more than any of these resources, or requires an unsupported runtime, creating a Docker image that can be run on ECS is the way to go. Cumulus supports running any lambda package (and its configured layers) as a Docker container with cumulus-ecs-task.

    Step Function Activities and cumulus-ecs-task

    Step Function Activities enable a state machine task to "publish" an activity task which can be picked up by any activity worker. Activity workers can run pretty much anywhere, but Cumulus workflows support the cumulus-ecs-task activity worker. The cumulus-ecs-task worker runs as a Docker container on the Cumulus ECS cluster.

    The cumulus-ecs-task container takes an AWS Lambda Amazon Resource Name (ARN) as an argument (see --lambdaArn in the example below). This ARN argument is defined at deployment time. The cumulus-ecs-task worker polls for new Step Function Activity Tasks. When a Step Function executes, the worker (container) picks up the activity task and runs the code contained in the lambda package defined on deployment.

    Example: Replacing AWS Lambda with a Docker container run on ECS

    This example will use an already-defined workflow from the cumulus module that includes the QueueGranules task in its configuration.

    The following example is an excerpt from the Discover Granules workflow containing the step definition for the QueueGranules step:

    interpolated values

    ${ingest_granule_workflow_name} and ${queue_granules_task_arn} are interpolated values that refer to Terraform resources. See the example deployment code for the Discover Granules workflow.

      "QueueGranules": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "provider": "{$.meta.provider}",
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "granuleIngestWorkflow": "${ingest_granule_workflow_name}",
    "queueUrl": "{$.meta.queues.startSF}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${queue_granules_task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "End": true
    },

    Suppose you discover that this task can no longer run in AWS Lambda; you can instead run it on the Cumulus ECS cluster by adding the following resources to your Terraform deployment (by either adding a new .tf file or updating an existing one):

    • An aws_sfn_activity resource:
    resource "aws_sfn_activity" "queue_granules" {
    name = "${var.prefix}-QueueGranules"
    }
    • An instance of the cumulus_ecs_service module (found on the Cumulus releases page), configured to provide the QueueGranules task:

    module "queue_granules_service" {
    source = "https://github.com/nasa/cumulus/releases/download/{version}/terraform-aws-cumulus-ecs-service.zip"

    prefix = var.prefix
    name = "QueueGranules"

    cluster_arn = module.cumulus.ecs_cluster_arn
    desired_count = 1
    image = "cumuluss/cumulus-ecs-task:1.9.0"

    cpu = 400
    memory_reservation = 700

    environment = {
    AWS_DEFAULT_REGION = data.aws_region.current.name
    }
    command = [
    "cumulus-ecs-task",
    "--activityArn",
    aws_sfn_activity.queue_granules.id,
    "--lambdaArn",
    module.cumulus.queue_granules_task.task_arn,
    "--lastModified",
    module.cumulus.queue_granules_task.last_modified_date
    ]
    alarms = {
    MemoryUtilizationHigh = {
    comparison_operator = "GreaterThanThreshold"
    evaluation_periods = 1
    metric_name = "MemoryUtilization"
    statistic = "SampleCount"
    threshold = 75
    }
    }
    }
    note

    If you have updated the code for the Lambda specified by --lambdaArn, you will have to manually restart the tasks in your ECS service before invocation of the Step Function activity will use the updated Lambda code.
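    One way to do this is to force a new deployment of the service, assuming your cluster and service names (both are placeholders here):

    # Restart the cumulus-ecs-task containers so they pick up the updated Lambda code
    aws ecs update-service \
    --cluster <prefix>-CumulusECSCluster \
    --service <queue-granules-service-name> \
    --force-new-deployment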

    • An updated Discover Granules workflow to utilize the new resource (the Resource key in the QueueGranules step has been updated to:

    "Resource": "${aws_sfn_activity.queue_granules.id}")

    If you then run this workflow in place of the DiscoverGranules workflow, the QueueGranules step would run as an ECS task instead of a Lambda function.

    Final note

    Step Function Activities and AWS Lambda are not the only ways to run tasks in an AWS Step Function. Learn more about other service integrations, including direct ECS integration via the AWS Service Integrations page.

    - + \ No newline at end of file diff --git a/docs/next/data-cookbooks/sips-workflow/index.html b/docs/next/data-cookbooks/sips-workflow/index.html index 40391228838..13ea584a220 100644 --- a/docs/next/data-cookbooks/sips-workflow/index.html +++ b/docs/next/data-cookbooks/sips-workflow/index.html @@ -5,7 +5,7 @@ Science Investigator-led Processing Systems (SIPS) | Cumulus Documentation - + @@ -16,7 +16,7 @@ we're just going to create a onetime throw-away rule that will be easy to test with. This rule will kick off the DiscoverAndQueuePdrs workflow, which is the beginning of a Cumulus SIPS workflow:

    Screenshot of a Cumulus rule configuration

    note

    A list of configured workflows exists under the "Workflows" in the navigation bar on the Cumulus dashboard. Additionally, one can find a list of executions and their respective status in the "Executions" tab in the navigation bar.

    DiscoverAndQueuePdrs Workflow

    This workflow will discover PDRs and queue them to be processed. Duplicate PDRs will be dealt with according to the configured duplicate handling setting in the collection. The lambdas below are included in the cumulus terraform module for use in your workflows:

    1. DiscoverPdrs - source
    2. QueuePdrs - source

    Screenshot of execution graph for discover and queue PDRs workflow in the AWS Step Functions console

    An example workflow module configuration can be viewed in the Cumulus source for the discover_and_queue_pdrs_workflow.

    note

    To use this example workflow module as a template for a new workflow in your deployment the source key for the workflow module would need to point to a release of the cumulus-workflow (terraform-aws-cumulus-workflow.zip) module on our release page, as all of the provided Cumulus workflows are internally self-referential.

    ParsePdr Workflow

    The ParsePdr workflow will parse a PDR, queue the specified granules (duplicates are handled according to the duplicate handling setting) and periodically check the status of those queued granules. This workflow will not succeed until all the granules included in the PDR are successfully ingested. If one of those fails, the ParsePdr workflow will fail. NOTE that ParsePdr may spin up multiple IngestGranule workflows in parallel, depending on the granules included in the PDR.

    The lambdas below are included in the cumulus terraform module for use in your workflows:

    1. ParsePdr - source
    2. QueueGranules - source
    3. CheckStatus - source

    Screenshot of execution graph for SIPS Parse PDR workflow in AWS Step Functions console

    An example workflow module configuration can be viewed in the Cumulus source for the parse_pdr_workflow.

    note

    To use this example workflow module as a template for a new workflow in your deployment the source key for the workflow module would need to point to a release of the cumulus-workflow (terraform-aws-cumulus-workflow.zip) module on our release page, as all of the provided Cumulus workflows are internally self-referential.

    IngestGranule Workflow

    The IngestGranule workflow processes and ingests a granule and posts the granule metadata to CMR.

    The lambdas below are included in the cumulus terraform module for use in your workflows:

    1. SyncGranule - source.
    2. CmrStep - source

    Additionally, this workflow requires a processing step that you must provide. The ProcessingStep step in the workflow picture below is an example of a custom processing step.

    tip

    Using the CmrStep is not required and can be left out of the processing trajectory if desired (for example, in testing situations).

    Screenshot of execution graph for SIPS IngestGranule workflow in AWS Step Functions console

    An example workflow module configuration can be viewed in the Cumulus source for the ingest_and_publish_granule_workflow.

    note

    To use this example workflow module as a template for a new workflow in your deployment the source key for the workflow module would need to point to a release of the cumulus-workflow (terraform-aws-cumulus-workflow.zip) module on our release page, as all of the provided Cumulus workflows are internally self-referential.

    Summary

    In this cookbook we went over setting up a collection, rule, and provider for a SIPS workflow. Once we had the setup completed, we looked over the Cumulus workflows that participate in parsing PDRs, ingesting and processing granules, and updating CMR.

    - + \ No newline at end of file diff --git a/docs/next/data-cookbooks/throttling-queued-executions/index.html b/docs/next/data-cookbooks/throttling-queued-executions/index.html index 078148f24eb..8f00afbe659 100644 --- a/docs/next/data-cookbooks/throttling-queued-executions/index.html +++ b/docs/next/data-cookbooks/throttling-queued-executions/index.html @@ -5,13 +5,13 @@ Throttling queued executions | Cumulus Documentation - +
    Version: Next

    Throttling queued executions

    In this entry, we will walk through how to create an SQS queue for scheduling executions, which will be used to limit those executions to a maximum concurrency, and see how to configure our Cumulus workflows/rules to use this queue.

    We will also review the architecture of this feature and highlight some implementation notes.

    Limiting the number of executions that can be running from a given queue is useful for controlling the cloud resource usage of workflows that may be lower priority, such as granule reingestion or reprocessing campaigns. It could also be useful for preventing workflows from exceeding known resource limits, such as a maximum number of open connections to a data provider.

    Implementing the queue

    Create and deploy the queue

    Add a new queue

    In a .tf file for your Cumulus deployment, add a new SQS queue:

    resource "aws_sqs_queue" "background_job_queue" {
    name = "${var.prefix}-backgroundJobQueue"
    receive_wait_time_seconds = 20
    visibility_timeout_seconds = 60
    }

    Set maximum executions for the queue

    Define the throttled_queues variable for the cumulus module in your Cumulus deployment to specify the maximum concurrent executions for the queue.

    module "cumulus" {
    # ... other variables

    throttled_queues = [{
    url = aws_sqs_queue.background_job_queue.id,
    execution_limit = 5
    }]
    }

    Setup consumer for the queue

    Add the sqs2sfThrottle Lambda as the consumer for the queue and add a Cloudwatch event rule/target to read from the queue on a scheduled basis.

    caution

    You must use the sqs2sfThrottle Lambda as the consumer for any queue with a queue execution limit or else the execution throttling will not work correctly. Additionally, please allow at least 60 seconds after creation before using the queue while associated infrastructure and triggers are set up and made ready.

    aws_sqs_queue.background_job_queue.id refers to the queue resource defined above.

    resource "aws_cloudwatch_event_rule" "background_job_queue_watcher" {
    schedule_expression = "rate(1 minute)"
    }

    resource "aws_cloudwatch_event_target" "background_job_queue_watcher" {
    rule = aws_cloudwatch_event_rule.background_job_queue_watcher.name
    arn = module.cumulus.sqs2sfThrottle_lambda_function_arn
    input = jsonencode({
    messageLimit = 500
    queueUrl = aws_sqs_queue.background_job_queue.id
    timeLimit = 60
    })
    }

    resource "aws_lambda_permission" "background_job_queue_watcher" {
    action = "lambda:InvokeFunction"
    function_name = module.cumulus.sqs2sfThrottle_lambda_function_arn
    principal = "events.amazonaws.com"
    source_arn = aws_cloudwatch_event_rule.background_job_queue_watcher.arn
    }

    Re-deploy your Cumulus application

    Follow the instructions to re-deploy your Cumulus application. After you have re-deployed, your workflow template will be updated to include information about the queue (the output below is partial output from an expected workflow template):

    {
    "cumulus_meta": {
    "queueExecutionLimits": {
    "<backgroundJobQueue_SQS_URL>": 5
    }
    }
    }

    Integrate your queue with workflows and/or rules

    Integrate queue with queuing steps in workflows

    For any workflows using QueueGranules or QueuePdrs that should use your new queue, update the Cumulus configuration of those steps in your workflows.

    As seen in this partial configuration for a QueueGranules step, update the queueUrl to reference the new throttled queue:

    ingest_granule_workflow_name

    ${ingest_granule_workflow_name} is an interpolated value referring to a Terraform resource. See the example deployment code for the DiscoverGranules workflow.

    {
    "QueueGranules": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "queueUrl": "${aws_sqs_queue.background_job_queue.id}",
    "provider": "{$.meta.provider}",
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "granuleIngestWorkflow": "${ingest_granule_workflow_name}"
    }
    }
    }
    }
    }

    Similarly, for a QueuePdrs step:

    parse_pdr_workflow_name

    ${parse_pdr_workflow_name} is an interpolated value referring to a Terraform resource. See the example deployment code for the DiscoverPdrs workflow.

    {
    "QueuePdrs": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "queueUrl": "${aws_sqs_queue.background_job_queue.id}",
    "provider": "{$.meta.provider}",
    "collection": "{$.meta.collection}",
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "parsePdrWorkflow": "${parse_pdr_workflow_name}"
    }
    }
    }
    }
    }

    After making these changes, re-deploy your Cumulus application for the execution throttling to take effect on workflow executions queued by these workflows.

    Create/update a rule to use your new queue

    Create or update a rule definition to include a queueUrl property that refers to your new queue:

    {
    "name": "s3_provider_rule",
    "workflow": "DiscoverAndQueuePdrs",
    "provider": "s3_provider",
    "collection": {
    "name": "MOD09GQ",
    "version": "006"
    },
    "rule": {
    "type": "onetime"
    },
    "state": "ENABLED",
    "queueUrl": "<backgroundJobQueue_SQS_URL>" // configure rule to use your queue URL
    }

    After creating/updating the rule, any subsequent invocations of the rule should respect the maximum number of executions when starting workflows from the queue.

    Architecture

    Architecture diagram showing how executions started from a queue are throttled to a maximum concurrent limit

    Execution throttling based on the queue works by manually keeping a count (semaphore) of how many executions are running for the queue at a time. The key operation that prevents the number of executions from exceeding the maximum for the queue is that before starting new executions, the sqs2sfThrottle Lambda attempts to increment the semaphore and responds as follows:

    • If the increment operation is successful, then the count was not at the maximum and an execution is started
    • If the increment operation fails, then the count was already at the maximum so no execution is started

    Final notes

    Limiting the number of concurrent executions for work scheduled via a queue has several consequences worth noting:

    • The number of executions that are running for a given queue will be limited to the maximum for that queue regardless of which workflow(s) are started.
    • If you use the same queue to schedule executions across multiple workflows/rules, then the limit on the total number of executions running concurrently will be applied to all of the executions scheduled across all of those workflows/rules.
    • If you are scheduling the same workflow both via a queue with a maxExecutions value and a queue without a maxExecutions value, only the executions scheduled via the queue with the maxExecutions value will be limited to the maximum.
    diff --git a/docs/next/data-cookbooks/tracking-files/index.html b/docs/next/data-cookbooks/tracking-files/index.html

    Tracking Ancillary Files

    The UMM-G column reflects the RelatedURL's Type derived from the CNM type, whereas the ECHO10 column shows how the CNM type affects the destination element.

    CNM Type  | UMM-G RelatedUrl.Type                                           | ECHO10 Location
    ----------|-----------------------------------------------------------------|----------------------
    ancillary | 'VIEW RELATED INFORMATION'                                      | OnlineResource
    data      | 'GET DATA' (HTTPS URL) or 'GET DATA VIA DIRECT ACCESS' (S3 URI) | OnlineAccessURL
    browse    | 'GET RELATED VISUALIZATION'                                     | AssociatedBrowseImage
    linkage   | 'EXTENDED METADATA'                                             | OnlineResource
    metadata  | 'EXTENDED METADATA'                                             | OnlineResource
    qa        | 'EXTENDED METADATA'                                             | OnlineResource

    Common Use Cases

    This section briefly documents some common use cases and the recommended configuration for the file. The examples shown here are for the DiscoverGranules use case, which allows configuration at the Cumulus dashboard level. The other two cases covered in the ancillary metadata documentation require configuration at the provider notification level (either CNM message or PDR) and are not covered here.

    Configuring browse imagery:

    {
    "bucket": "public",
    "regex": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\_[\\d]{1}.jpg$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_1.jpg",
    "type": "browse"
    }

    Configuring a documentation entry:

    {
    "bucket": "protected",
    "regex": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\_README.pdf$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_README.pdf",
    "type": "metadata"
    }

    Configuring other associated files (use types metadata or qa as appropriate):

    {
    "bucket": "protected",
    "regex": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\_QA.txt$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_QA.txt",
    "type": "qa"
    }
    diff --git a/docs/next/deployment/api-gateway-logging/index.html b/docs/next/deployment/api-gateway-logging/index.html
    Version: Next

    API Gateway Logging

    Enabling API Gateway Logging

    In order to enable distribution API Access and execution logging, configure the TEA deployment by setting log_api_gateway_to_cloudwatch on the thin_egress_app module:

    log_api_gateway_to_cloudwatch = true

    This enables the distribution API to send its logs to the default CloudWatch location: API-Gateway-Execution-Logs_<RESTAPI_ID>/<STAGE>
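
    For example, in a deployment based on the Cumulus Deployment Template, the setting sits alongside whatever other Thin Egress App configuration you already pass to the module (a minimal sketch):

    module "thin_egress_app" {
      # ... your existing Thin Egress App configuration

      log_api_gateway_to_cloudwatch = true
    }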

    Configure Permissions for API Gateway Logging to CloudWatch

    Instructions: Enabling Account Level Logging from API Gateway to CloudWatch

    This is a one-time operation that must be performed on each AWS account to allow API Gateway to push logs to CloudWatch.

    1. Create a policy document

      The AmazonAPIGatewayPushToCloudWatchLogs managed policy, with an ARN of arn:aws:iam::aws:policy/service-role/AmazonAPIGatewayPushToCloudWatchLogs, has all the required permissions to enable API Gateway logging to CloudWatch. To grant these permissions to your account, first create an IAM role with apigateway.amazonaws.com as its trusted entity.

      Save this snippet as apigateway-policy.json.

      {
      "Version": "2012-10-17",
      "Statement": [
      {
      "Sid": "",
      "Effect": "Allow",
      "Principal": {
      "Service": "apigateway.amazonaws.com"
      },
      "Action": "sts:AssumeRole"
      }
      ]
      }
    2. Create an account role to act as ApiGateway and write to CloudWatchLogs

      in NGAP

      NASA users in NGAP: Be sure to use your account's permission boundary.

          aws iam create-role \
      --role-name ApiGatewayToCloudWatchLogs \
      [--permissions-boundary <permissionBoundaryArn>] \
      --assume-role-policy-document file://apigateway-policy.json

      Note the ARN of the returned role for the last step.

    3. Attach correct permissions to role

      Next attach the AmazonAPIGatewayPushToCloudWatchLogs policy to the IAM role.

      aws iam attach-role-policy \
      --role-name ApiGatewayToCloudWatchLogs \
      --policy-arn "arn:aws:iam::aws:policy/service-role/AmazonAPIGatewayPushToCloudWatchLogs"
    4. Update Account API Gateway settings with correct permissions

      Finally, set the IAM role ARN on the cloudWatchRoleArn property on your API Gateway Account settings.

      aws apigateway update-account \
      --patch-operations op='replace',path='/cloudwatchRoleArn',value='<ApiGatewayToCloudWatchLogs ARN>'

    Configure API Gateway CloudWatch Logs Delivery

    For details about configuring the API Gateway CloudWatch Logs delivery, see Configure Cloudwatch Logs Delivery.

    diff --git a/docs/next/deployment/apis-introduction/index.html b/docs/next/deployment/apis-introduction/index.html
    Version: Next

    APIs

    Common Distribution APIs

    When deploying from the Cumulus Deployment Template or a configuration based on that repo, the Thin Egress App (TEA) distribution app will be used by default. However, you have the choice to use the Cumulus Distribution API as well.

    Cumulus API Customization Use Cases

    Our Cumulus API offers you the flexibility to customize for your DAAC/organization. Below is a list of use cases that may help you with options:

    Types of APIs

    diff --git a/docs/next/deployment/choosing_configuring_rds/index.html b/docs/next/deployment/choosing_configuring_rds/index.html

    RDS: Choosing and Configuring Your Database Type

    ...using this module to create your RDS cluster, you can configure the autoscaling timeout action, the cluster minimum and maximum capacity, and more as seen in the supported variables for the module.

    Unfortunately, Terraform currently doesn't allow specifying the autoscaling timeout itself, so that value will have to be manually configured in the AWS console or CLI.

    Optional: Manage RDS Database with pgAdmin

    Setup SSM Port Forwarding

    note

    To perform this action, your deployment must be within a VPC and you must have the credentials to access it via NGAP protocols.

    For a walkthrough guide on how to utilize AWS's Session Manager for port forwarding to access the Cumulus RDS database go to the Accessing Cumulus RDS database via SSM Port Forwarding article.

    diff --git a/docs/next/deployment/cloudwatch-logs-delivery/index.html b/docs/next/deployment/cloudwatch-logs-delivery/index.html
    Version: Next

    Configure Cloudwatch Logs Delivery

    As an optional configuration step, it is possible to deliver CloudWatch logs to a cross-account shared AWS::Logs::Destination. An operator does this by configuring the cumulus module for your deployment as shown below. The value of the log_destination_arn variable is the ARN of a writeable log destination.

    The value can be either an AWS::Logs::Destination or a Kinesis Stream ARN to which your account can write.

    log_destination_arn           = "arn:aws:[kinesis|logs]:us-east-1:123456789012:[streamName|destination:logDestinationName]"

    Logs Sent

    By default, the following logs will be sent to the destination when one is given.

    • Ingest logs
    • Async Operation logs
    • Thin Egress App API Gateway logs (if configured)

    Additional Logs

    If additional logs are needed, you can configure additional_log_groups_to_elk with the Cloudwatch log groups you want to send to the destination. additional_log_groups_to_elk is a map with the key as a descriptor and the value with the Cloudwatch log group name.

    additional_log_groups_to_elk = {
    "HelloWorldTask" = "/aws/lambda/cumulus-example-HelloWorld"
    "MyCustomTask" = "my-custom-task-log-group"
    }
    diff --git a/docs/next/deployment/components/index.html b/docs/next/deployment/components/index.html

    Component-based Cumulus Deployment

    ...Terraform at the same time.

    With remote state, Terraform writes the state data to a remote data store, which can then be shared between all members of a team.

    The recommended approach for handling remote state with Cumulus is to use the S3 backend. This backend stores state in S3 and uses a DynamoDB table for locking.
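
    As a minimal sketch (the bucket, key, and table names below are placeholders for the remote state resources you create), such a backend block looks like:

    terraform {
      backend "s3" {
        region         = "us-east-1"
        bucket         = "PREFIX-state"
        key            = "PREFIX/cumulus/terraform.tfstate"
        dynamodb_table = "PREFIX-tf-locks"
      }
    }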

    See the deployment documentation for a walk-through of creating resources for your remote state using an S3 backend.

    diff --git a/docs/next/deployment/create_bucket/index.html b/docs/next/deployment/create_bucket/index.html
    Version: Next

    Creating an S3 Bucket

    Buckets can be created on the command line with AWS CLI or via the web interface on the AWS console.

    When creating a protected bucket (a bucket containing data which will be served through the distribution API), make sure to enable S3 server access logging. See S3 Server Access Logging for more details.

    Command Line

    Using the AWS CLI s3api create-bucket subcommand:

    $ aws s3api create-bucket \
    --bucket foobar-internal \
    --region us-west-2 \
    --create-bucket-configuration LocationConstraint=us-west-2
    {
    "Location": "/foobar-internal"
    }
    info

    The region and create-bucket-configuration arguments are only necessary if you are creating a bucket outside of the us-east-1 region.

    Please note security settings and other bucket options can be set via the options listed in the s3api documentation.

    Repeat the above step for each bucket to be created.

    Web Interface

    If you prefer to use the AWS web interface instead of the command line, see AWS "Creating a Bucket" documentation.

    diff --git a/docs/next/deployment/cumulus_distribution/index.html b/docs/next/deployment/cumulus_distribution/index.html
    Version: Next

    Using the Cumulus Distribution API

    The Cumulus Distribution API is a set of endpoints that can be used to enable AWS Cognito authentication when downloading data from S3.

    tip

    If you need to access our quick reference materials while setting up or continuing to manage your API access go to the Cumulus Distribution API Docs.

    Configuring a Cumulus Distribution Deployment

    The Cumulus Distribution API is included in the main Cumulus repo. It is available as part of the terraform-aws-cumulus.zip archive in the latest release.

    These steps assume you're using the Cumulus Deployment Template but they can also be used for custom deployments.

    To configure a deployment to use Cumulus Distribution:

    1. Remove or comment the "Thin Egress App Settings" in the Cumulus Template Deploy and enable the "Cumulus Distribution Settings".
    2. Delete or comment the contents of thin_egress_app.tf and the corresponding Thin Egress App outputs in outputs.tf. These are not necessary for a Cumulus Distribution deployment.
    3. Uncomment the Cumulus Distribution outputs in outputs.tf.
    4. Rename cumulus-template-deploy/cumulus-tf/cumulus_distribution.tf.example to cumulus-template-deploy/cumulus-tf/cumulus_distribution.tf.

    Cognito Application and User Credentials

    The major prerequisite for using the Cumulus Distribution API is to set up Cognito. If operating within NGAP, this should already be done for you. If operating outside of NGAP, you must set up Cognito yourself, which is beyond the scope of this documentation.

    Given that Cognito is set up, in order to be able to download granule files via the Cumulus Distribution API, you must obtain Cognito user credentials, because any attempt to download such files (that will be, or have been, published to the CMR via your Cumulus deployment) will result in a prompt for you to supply Cognito user credentials. To obtain your own user credentials, talk to your product owner or scrum master for additional information. They should either know how to create the credentials, know who can create them for the team, or be the liaison to the Cognito team.

    Further, whoever helps to obtain your Cognito user credentials should also be able to supply you with the values for the following new variables that you must add to your cumulus-tf/terraform.tfvars file:

    • csdap_host_url: The URL of the Cognito service to which your Cumulus deployment will make Cognito API calls during a distribution (download) event
    • csdap_client_id: The client ID for the Cumulus application registered within the Cognito service
    • csdap_client_password: The client password for the Cumulus application registered within the Cognito service
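
    In cumulus-tf/terraform.tfvars these entries take the usual key/value form; the values shown here are placeholders for whatever the Cognito team supplies:

    csdap_host_url        = "<csdap-host-url>"
    csdap_client_id       = "<csdap-client-id>"
    csdap_client_password = "<csdap-client-password>"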

    Although you might have to wait a bit for your Cognito user credentials, the remaining instructions do not depend upon having them, so you may continue with these instructions while waiting for your credentials.

    Cumulus Distribution URL

    Your Cumulus Distribution URL is used by Cumulus to generate download URLs as part of the granule metadata generated and published to the CMR. For example, a granule download URL will be of the form <distribution url>/<protected bucket>/<key> (or <distribution url>/path/to/file, if using a custom bucket map, as explained further below).

    By default, the value of your distribution URL is the URL of your private Cumulus Distribution API Gateway (the API Gateway named <prefix>-distribution, once you deploy the Cumulus Distribution module). Therefore, by default, the generated download URLs are private, and thus inaccessible directly, but there are 2 ways to address this issue (both of which are detailed below): (a) use tunneling (typically in development) or (b) put a CloudFront URL in front of your API Gateway (typically in production, and perhaps UAT and/or SIT).

    In either case, you must first know the default URL (i.e., the URL for the private Cumulus Distribution API Gateway). In order to obtain this default URL, you must first deploy your cumulus-tf module with the new Cumulus Distribution module, and once your initial deployment is complete, one of the Terraform outputs will be cumulus_distribution_api_uri, which is the URL for the private API Gateway.

    You may override this default URL by adding a cumulus_distribution_url variable to your cumulus-tf/terraform.tfvars file and setting it to one of the following values (both are explained below):

    1. The default URL, but with a port added to it, in order to allow you to configure tunneling (typically only in development)
    2. A CloudFront URL placed in front of your Cumulus Distribution API Gateway (typically only for Production, but perhaps also for a UAT or SIT environment)

    The following subsections explain these approaches in turn.

    Using Your Cumulus Distribution API Gateway URL as Your Distribution URL

    Since your Cumulus Distribution API Gateway URL is private, the only way you can use it to confirm that your integration with Cognito is working is by using tunneling (again, generally for development). Here is an outline of the required steps with details provided further below:

    1. Create/import a key pair into your AWS EC2 service (if you haven't already done so)
    2. Add a reference to the name of the key pair to your Terraform variables (we'll set the key_name Terraform variable)
    3. Choose an open local port on your machine (we'll use 9000 in the following example)
    4. Add a reference to the value of your cumulus_distribution_api_uri (mentioned earlier), including your chosen port (we'll set the cumulus_distribution_url Terraform variable)
    5. Redeploy Cumulus
    6. Add an entry to your /etc/hosts file
    7. Add a redirect URI to Cognito via the Cognito API
    8. Install the Session Manager Plugin for the AWS CLI (if you haven't already done so; assuming you have already installed the AWS CLI)
    9. Add a sample file to S3 to test downloading via Cognito

    To create or import an existing key pair, you can use the AWS CLI (see AWS ec2 import-key-pair), or the AWS Console (see Amazon EC2 key pairs and Linux instances).

    Once your key pair is added to AWS, add the following to your cumulus-tf/terraform.tfvars file:

    key_name = "<name>"
    cumulus_distribution_url = "https://<id>.execute-api.<region>.amazonaws.com:<port>/dev/"

    where:

    • <name> is the name of the key pair you just added to AWS
    • <id> and <region> are the corresponding parts from your cumulus_distribution_api_uri output variable
    • <port> is your open local port of choice (9000 is typically a good choice)

    Once you save your variable changes, redeploy your cumulus-tf module.

    While your deployment runs, add the following entry to your /etc/hosts file, replacing <hostname> with the host name of the cumulus_distribution_url Terraform variable you just added above:

    localhost <hostname>

    Next, you'll need to use the Cognito API to add the value of your cumulus_distribution_url Terraform variable as a Cognito redirect URI. To do so, use your favorite tool (e.g., curl, wget, Postman, etc.) to make a BasicAuth request to the Cognito API, using the following details:

    • method: POST
    • base URL: the value of your csdap_host_url Terraform variable
    • path: /authclient/updateRedirectUri
    • username: the value of your csdap_client_id Terraform variable
    • password: the value of your csdap_client_password Terraform variable
    • headers: Content-Type='application/x-www-form-urlencoded'
    • body: redirect_uri=<cumulus_distribution_url>/login

    where <cumulus_distribution_url> is the value of your cumulus_distribution_url Terraform variable. Note the /login path at the end of the redirect_uri value.

    For reference, see the Cognito Authentication Service API.

    Next, install the Session Manager Plugin for the AWS CLI. If running on macOS, and you use Homebrew, you can install it simply as follows:

    brew install --cask session-manager-plugin --no-quarantine

    As your final setup step, add a sample file to one of the protected buckets listed in your buckets Terraform variable in your cumulus-tf/terraform.tfvars file. The key for the S3 object doesn't matter, nor does it matter what file you use. All that matters is that the file is an S3 object in one of your protected buckets, because Cognito is triggered when attempting to download from one of those buckets.

    At this point, you should be ready to open a tunnel and attempt to download your sample file via your browser, summarized as follows:

    1. Determine your EC2 instance ID
    2. Connect to the NASA VPN
    3. Start an AWS SSM session
    4. Open an SSH tunnel
    5. Use a browser to navigate to your file

    To determine your EC2 instance ID for your Cumulus deployment, run the following command, where <profile> is the name of the appropriate AWS profile to use, and <prefix> is the value of your prefix Terraform variable:

    aws --profile <profile> ec2 describe-instances --filters Name=tag:Deployment,Values=<prefix> Name=instance-state-name,Values=running --query "Reservations[0].Instances[].InstanceId" --output text
    Connect to NASA VPN

    Before proceeding with the remaining steps, make sure you are connected to the NASA VPN.

    Use the value output from the command above in place of <id> in the following command, which will start an SSM session:

    aws ssm start-session --target <id> --document-name AWS-StartPortForwardingSession --parameters portNumber=22,localPortNumber=6000

    If successful, you should see output similar to the following:

    Starting session with SessionId: NGAPShApplicationDeveloper-***
    Port 6000 opened for sessionId NGAPShApplicationDeveloper-***.
    Waiting for connections...

    In another terminal window, open a tunnel with port forwarding using your chosen port from above (e.g., 9000):

    ssh -4 -p 6000 -N -L <port>:<api-gateway-host>:443 ec2-user@127.0.0.1

    where:

    • <port> is the open local port you chose earlier (e.g., 9000)
    • <api-gateway-host> is the hostname of your private API Gateway (i.e., the host portion of the URL you used as the value of your cumulus_distribution_url Terraform variable above)

    Finally, use your chosen browser to navigate to <cumulus_distribution_url>/<bucket>/<key>, where <bucket> and <key> reference the sample file you added to S3 above.

    If all goes well, you should be prompted for your Cognito username and password. If you have obtained your Cognito user credentials, enter them, and then enter a code generated by the authenticator application you registered at the time you completed your Cognito registration process. Once your credentials and auth code are correctly supplied, after a few moments, the download process will begin.

    Once you're finished testing, clean up as follows:

    1. Stop your SSH tunnel (enter Ctrl-C)
    2. Stop your AWS SSM session (enter Ctrl-C)
    3. If you like, disconnect from the NASA VPN

    While this is a relatively lengthy process, things are much easier when using CloudFront, such as in Production (OPS), SIT, or UAT, as explained next.

    Using a CloudFront URL as Your Distribution URL

    In Production (OPS), and perhaps in other environments, such as UAT and SIT, you'll need to provide a publicly accessible URL for users to use for downloading (distributing) granule files.

    This is generally done by placing a CloudFront URL in front of your private Cumulus Distribution API Gateway. In order to create such a CloudFront URL, contact the person who helped you obtain your Cognito credentials, and request a CloudFront URL with the following details:

    • The private, backing URL, which is the value of your cumulus_distribution_api_uri Terraform output value
    • A request to add the AWS account's VPC to the whitelist

    Once this request is completed, and you obtain the new CloudFront URL, override your default distribution URL with the CloudFront URL by adding the following to your cumulus-tf/terraform.tfvars file:

    cumulus_distribution_url = "<cloudfront_url>"

    In addition, add a Cognito redirect URI, as detailed in the previous section. Note that in this case, the value you'll use for redirect_uri is <cloudfront_url>/login since the value of your cumulus_distribution_url is now your CloudFront URL.

    At this point, it is assumed that you have added the appropriate values for this environment for the variables described at the top (csdap_host_url, csdap_client_id, and csdap_client_password).

    Redeploy Cumulus with your new/updated Terraform variables.

    As your final setup step, add a sample file to one of the protected buckets listed in your buckets Terraform variable in your cumulus-tf/terraform.tfvars file. The key for the S3 object doesn't matter, nor does it matter what file you use. All that matters is that the file is an S3 object in one of your protected buckets, because Cognito is triggered when attempting to download from one of those buckets.

    Finally, use your chosen browser to navigate to <cumulus_distribution_url>/<bucket>/<key>, where <bucket> and <key> reference the sample file you added to S3.

    If all goes well, you should be prompted for your Cognito username and password. If you have obtained your Cognito user credentials, enter them, followed by entering a code generated by the authenticator application you registered at the time you completed your Cognito registration process. Once your credentials and auth code are correctly supplied, after a few moments, the download process will begin.

    S3 Bucket Mapping

    An S3 Bucket map allows users to abstract bucket names. If the bucket names change at any point, only the bucket map would need to be updated instead of every S3 link.

    The Cumulus Distribution API uses a bucket_map.yaml or bucket_map.yaml.tmpl file to determine which buckets to serve. See the examples.

    The default Cumulus module generates a file at s3://${system_bucket}/distribution_bucket_map.json.

    The configuration file is a simple JSON mapping of the form:

    {
    "daac-public-data-bucket": "/path/to/this/kind/of/data"
    }
    cumulus bucket mapping

    Cumulus only supports a one-to-one mapping of bucket -> Cumulus Distribution path for 'distribution' buckets. Also, the bucket map must include mappings for all of the protected and public buckets specified in the buckets variable in cumulus-tf/terraform.tfvars, otherwise Cumulus may not be able to determine the correct distribution URL for ingested files and you may encounter errors.

    Switching from the Thin Egress App to Cumulus Distribution

    If you have previously deployed the Thin Egress App (TEA) as your distribution app, you can switch to Cumulus Distribution by following the steps above.

    Note, however, that the cumulus_distribution module will generate a bucket map cache and overwrite any existing bucket map caches created by TEA.

    There will also be downtime while your API Gateway is updated.

    diff --git a/docs/next/deployment/databases-introduction/index.html b/docs/next/deployment/databases-introduction/index.html
    Version: Next

    Databases

    Cumulus Core Database

    Cumulus uses a PostgreSQL database as its primary data store for operational and archive records (e.g. collections, granules, etc.). We expect the PostgreSQL database to be provided by the AWS RDS service; however, there are two types of RDS database, which we will explore in the upcoming pages.

    Types of Databases

    diff --git a/docs/next/deployment/index.html b/docs/next/deployment/index.html

    How to Deploy Cumulus

    ...for deployment's EC2 instances and allows you to connect to them via SSH/SSM.

    Consider the sizing of your Cumulus instance when configuring your variables.

    Choose a Distribution API

    Default Configuration

    If you are deploying from the Cumulus Deployment Template or a configuration based on that repo, the Thin Egress App (TEA) distribution app will be used by default.

    Configuration Options

    Cumulus can be configured to use either TEA or the Cumulus Distribution API. The default selection is the Thin Egress App if you're using the Deployment Template.

    note

    If you already have a deployment using the TEA distribution and want to switch to Cumulus Distribution, there will be an API Gateway change. This means that there will be downtime while you update your CloudFront endpoint to use the new API gateway.

    Configure the Thin Egress App

    TEA can be used for Cumulus distribution and is the default selection. It allows authentication using Earthdata Login. Follow the steps in the TEA documentation to configure distribution in your cumulus-tf deployment.

    Configure the Cumulus Distribution API (Optional)

    If you would prefer to use the Cumulus Distribution API, which supports AWS Cognito authentication, follow these steps to configure distribution in your cumulus-tf deployment.

    Initialize Terraform

    Follow the above instructions to initialize Terraform using terraform init [1].

    Deploy

    Run terraform apply to deploy the resources. Type yes when prompted to confirm that you want to create the resources. Assuming the operation is successful, you should see output like this:

    Apply complete! Resources: 292 added, 0 changed, 0 destroyed.

    Outputs:

    archive_api_redirect_uri = https://abc123.execute-api.us-east-1.amazonaws.com/dev/token
    archive_api_uri = https://abc123.execute-api.us-east-1.amazonaws.com/dev/
    distribution_redirect_uri = https://abc123.execute-api.us-east-1.amazonaws.com/DEV/login
    distribution_url = https://abc123.execute-api.us-east-1.amazonaws.com/DEV/
    note

    Be sure to copy the redirect URLs because you will need them to update your Earthdata application.

    Update Earthdata Application

    Add the two redirect URLs to your EarthData login application by doing the following:

    1. Login to URS
    2. Under My Applications -> Application Administration -> use the edit icon of your application
    3. Under Manage -> redirect URIs, add the Archive API url returned from the stack deployment
      • e.g. archive_api_redirect_uri = https://<czbbkscuy6>.execute-api.us-east-1.amazonaws.com/dev/token
    4. Also add the Distribution url
      • e.g. distribution_redirect_uri = https://<kido2r7kji>.execute-api.us-east-1.amazonaws.com/dev/login [2]
    5. You may delete the placeholder url you used to create the application

    If you've lost track of the needed redirect URIs, they can be located on the API Gateway. Once there, select <prefix>-archive and/or <prefix>-thin-egress-app-EgressGateway, then Dashboard, and use the base URL at the top of the page that is accompanied by the text Invoke this API at:. Make sure to append /token to the archive URL and /login to the Thin Egress App URL.


    Deploy Cumulus Dashboard

    Dashboard Requirements

    what you will need

    The requirements are similar to the Cumulus stack deployment requirements. The installation instructions below include a step that will install/use the required node version referenced in the .nvmrc file in the Dashboard repository.

    Prepare AWS

    Create S3 Bucket for Dashboard:

    • Create it, e.g. <prefix>-dashboard. Use the command line or console as you did when preparing AWS configuration.
    • Configure the bucket to host a website:
      • AWS S3 console: Select <prefix>-dashboard bucket then, "Properties" -> "Static Website Hosting", point to index.html
      • CLI: aws s3 website s3://<prefix>-dashboard --index-document index.html
    • The bucket's URL will be http://<prefix>-dashboard.s3-website-<region>.amazonaws.com, or you can find it on the AWS console via "Properties" -> "Static website hosting" -> "Endpoint"
    • Ensure the bucket's access permissions allow your deployment user access to write to the bucket

    Install Dashboard

    To install the Cumulus Dashboard, clone the repository into the root deploy directory and install dependencies with npm install:

      git clone https://github.com/nasa/cumulus-dashboard
    cd cumulus-dashboard
    nvm use
    npm install

    If you do not have the correct version of node installed, replace nvm use with nvm install $(cat .nvmrc) in the above example.

    Dashboard Versioning

    By default, the master branch will be used for Dashboard deployments. The master branch of the repository contains the most recent stable release of the Cumulus Dashboard.

    If you want to test unreleased changes to the Dashboard, use the develop branch.

    Each release/version of the Dashboard will have a tag in the Dashboard repo. Release/version numbers will use semantic versioning (major/minor/patch).

    To checkout and install a specific version of the Dashboard:

      git fetch --tags
    git checkout <version-number> # e.g. v1.2.0
    nvm use
    npm install

    If you do not have the correct version of node installed, replace nvm use with nvm install $(cat .nvmrc) in the above example.

    Building the Dashboard

    caution

    These environment variables are available during the build: APIROOT, DAAC_NAME, STAGE, HIDE_PDR. Any of these can be set on the command line to override the values contained in config.js when running the build below.

    To configure your dashboard for deployment, set the APIROOT environment variable to your app's API root [3].

    Build your dashboard from the Cumulus Dashboard repository root directory, cumulus-dashboard:

      APIROOT=<your_api_root> npm run build

    Dashboard Deployment

    Deploy your dashboard to S3 bucket from the cumulus-dashboard directory:

    Using AWS CLI:

      aws s3 sync dist s3://<prefix>-dashboard

    From the S3 Console:

    • Open the <prefix>-dashboard bucket, click 'upload'. Add the contents of the 'dist' subdirectory to the upload. Then select 'Next'. On the permissions window allow the public to view. Select 'Upload'.

    You should be able to visit the Dashboard website at http://<prefix>-dashboard.s3-website-<region>.amazonaws.com, or find the URL via <prefix>-dashboard -> "Properties" -> "Static website hosting" -> "Endpoint", and log in with a user that you had previously configured for access.


    Cumulus Instance Sizing

    The Cumulus deployment's default sizing for Elasticsearch instances, EC2 instances, and Autoscaling Groups is small and designed for testing and cost savings. The default settings are likely not suitable for production workloads. Sizing is highly individual and dependent on expected load and archive size.

    aws cost calculator

    Please be cognizant of costs as any change in size will affect your AWS bill. AWS provides a pricing calculator for estimating costs.

    Elasticsearch

    The mappings file contains all of the data types that will be indexed into Elasticsearch. Elasticsearch sizing is tied to your archive size, including your collections, granules, and workflow executions that will be stored.

    AWS provides documentation on calculating and configuring for sizing.

    In addition to size, you'll want to consider the number of nodes, which determines how the system reacts in the event of a failure.

    Configuration can be done in the data persistence module in elasticsearch_config and the cumulus module in es_index_shards.
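
    As an illustration only (the exact fields accepted by elasticsearch_config are defined by the data-persistence module version you deploy, so verify them against that module's variables before copying), a sizing override might look like:

    # data-persistence configuration (field names and values are illustrative assumptions)
    elasticsearch_config = {
      domain_name    = "es"
      instance_count = 2
      instance_type  = "t3.small.elasticsearch"
      version        = "5.3"
      volume_size    = 10
    }

    # cumulus module configuration
    es_index_shards = 2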

    reindex after changes

    If you make changes to your Elasticsearch configuration you will need to reindex for those changes to take effect.

    EC2 Instances and Autoscaling Groups

    EC2 instances are used for long-running operations (e.g. generating a reconciliation report) and long-running workflow tasks. Configuration for your ECS cluster is achieved via Cumulus deployment variables.

    When configuring your ECS cluster, consider the following (a configuration sketch follows this list):

    • The EC2 instance type and EBS volume size needed to accommodate your workloads. Configured as ecs_cluster_instance_type and ecs_cluster_instance_docker_volume_size.
    • The minimum and desired number of instances on hand to accommodate your workloads. Configured as ecs_cluster_min_size and ecs_cluster_desired_size.
    • The maximum number of instances you will need and are willing to pay for to accommodate your heaviest workloads. Configured as ecs_cluster_max_size.
    • Your autoscaling parameters: ecs_cluster_scale_in_adjustment_percent, ecs_cluster_scale_out_adjustment_percent, ecs_cluster_scale_in_threshold_percent, and ecs_cluster_scale_out_threshold_percent.
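
    Putting these together, a sketch of the relevant cumulus module variables might look like the following; the values are arbitrary illustrations, not recommendations:

    module "cumulus" {
      # ... other variables

      ecs_cluster_instance_type               = "t3.medium"
      ecs_cluster_instance_docker_volume_size = 100
      ecs_cluster_min_size                    = 1
      ecs_cluster_desired_size                = 1
      ecs_cluster_max_size                    = 2

      # Autoscaling parameters -- tune these to your workloads:
      # ecs_cluster_scale_in_adjustment_percent, ecs_cluster_scale_out_adjustment_percent,
      # ecs_cluster_scale_in_threshold_percent, ecs_cluster_scale_out_threshold_percent
    }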

    Footnotes


    1. Run terraform init if:

      • This is the first time deploying the module
      • You have added any additional child modules, including Cumulus components
      • You have updated the source for any of the child modules

    2. To add another redirect URI to your application: on the Earthdata home page, select "My Applications", scroll down to "Application Administration", and use the edit icon for your application. Then go to Manage -> Redirect URIs.

    3. The API root can be found a number of ways. The easiest is to note it in the output of the app deployment step, but you can also find it from the AWS console -> Amazon API Gateway -> APIs -> <prefix>-archive -> Dashboard, and read the URL at the top after "Invoke this API at"

    diff --git a/docs/next/deployment/postgres_database_deployment/index.html b/docs/next/deployment/postgres_database_deployment/index.html

    PostgreSQL Database Deployment

    ...cumulus-rds-tf that will deploy an AWS RDS Aurora Serverless PostgreSQL 11 compatible database cluster, and optionally provision a single deployment database with credentialed secrets for use with Cumulus.

    We have provided an example terraform deployment using this module in the Cumulus template-deploy repository on GitHub.

    Use of this example involves:

    • Creating/configuring a Terraform module directory
    • Using Terraform to deploy resources to AWS

    Requirements

    Configuration/installation of this module requires the following:

    • Terraform
    • git
    • A VPC configured for use with Cumulus Core. This should match the subnets you provide when Deploying Cumulus to allow Core's lambdas to properly access the database.
    • At least two subnets across multiple AZs. These should match the subnets you provide as configuration when Deploying Cumulus, and should be within the same VPC.

    Needed Git Repositories

    Assumptions

    OS/Environment

    The instructions in this module require Linux/MacOS. While deployment via Windows is possible, it is unsupported.

    Terraform

    This document assumes knowledge of Terraform. If you are not comfortable working with Terraform, the following links should bring you up to speed:

    For Cumulus specific instructions on installation of Terraform, refer to the main Cumulus Installation Documentation.

    Aurora/RDS

    This document also assumes some basic familiarity with PostgreSQL databases and Amazon Aurora/RDS. If you're unfamiliar consider perusing the AWS docs and the Aurora Serverless V1 docs.

    Prepare Deployment Repository

    tip

    If you already are working with an existing repository that has a configured rds-cluster-tf deployment for the version of Cumulus you intend to deploy or update, or you only need to configure this module for your repository, skip to Prepare AWS Configuration.

    Clone the cumulus-template-deploy repo and name appropriately for your organization:

      git clone https://github.com/nasa/cumulus-template-deploy <repository-name>

    We will return to configuring this repo and using it for deployment below.

    Optional: Create a New Repository

    Create a new repository on GitHub so that you can add your workflows and other modules to source control:

      git remote set-url origin https://github.com/<org>/<repository-name>
    git push origin master

    You can then add/commit changes as needed.

    Update Your Gitignore File

    If you are pushing your deployment code to a git repo, make sure to add terraform.tf and terraform.tfvars to .gitignore, as these files will contain sensitive data related to your AWS account.


    Prepare AWS Configuration

    To deploy this module, make sure that you have completed the following steps from the Cumulus deployment instructions, applied in similar fashion for this module:


    Configure and Deploy the Module

    When configuring this module, please keep in mind that, unlike the Cumulus deployment, this module should be deployed once to create the database cluster and re-deployed thereafter only to make changes to that configuration, perform upgrades, and the like.

    tip

    This module does not need to be re-deployed for each Core update.

    These steps should be executed in the rds-cluster-tf directory of the template deploy repo that you previously cloned. Run the following to copy the example files:

    cd rds-cluster-tf/
    cp terraform.tf.example terraform.tf
    cp terraform.tfvars.example terraform.tfvars

    In terraform.tf, configure the remote state settings by substituting the appropriate values for:

    • bucket
    • dynamodb_table
    • PREFIX (whatever prefix you've chosen for your deployment)

    Fill in the appropriate values in terraform.tfvars. See the rds-cluster-tf module variable definitions for more detail on all of the configuration options. A few notable configuration options are documented in the next section.

    Configuration Options

    • deletion_protection -- defaults to true. Set it to false if you want to be able to delete your cluster with a terraform destroy without manually updating the cluster.
    • db_admin_username -- cluster database administration username. Defaults to postgres.
    • db_admin_password -- required variable that specifies the admin user password for the cluster. To randomize this on each deployment, consider using a random_string resource as input (see the sketch after this list).
    • region -- defaults to us-east-1.
    • subnets -- requires at least 2 across different AZs. For use with Cumulus, these AZs should match the values you configure for your lambda_subnet_ids.
    • max_capacity -- the max ACUs the cluster is allowed to use. Carefully consider cost/performance concerns when setting this value.
    • min_capacity -- the minimum ACUs the cluster will scale to
    • provision_user_database -- Optional flag to allow module to provision a user database in addition to creating the cluster. Described in the next section.
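
    For the password randomization mentioned above, a minimal sketch using the hashicorp/random provider's random_string resource (the length and options are arbitrary) is:

    resource "random_string" "db_admin_password" {
      length  = 50
      special = false
    }

    # Pass random_string.db_admin_password.result wherever your configuration sets db_admin_password.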

    Provision User and User Database

    If you wish for the module to provision a PostgreSQL database on your new cluster and provide a secret for access in the module output, in addition to managing the cluster itself, the following configuration keys are required:

    • provision_user_database -- must be set to true. This configures the module to deploy a lambda that will create the user database, and update the provided configuration on deploy.
    • permissions_boundary_arn -- the permissions boundary to use when creating the roles the provisioning lambda will need for access. In most use cases this should be the same one used for the Cumulus Core deployment.
    • rds_user_password -- the value to set the user password to.
    • prefix -- this value will be used to set a unique identifier for the ProvisionDatabase lambda, as well as name the provisioned user/database.
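
    A sketch of these keys as terraform.tfvars entries (all values are placeholders):

    provision_user_database  = true
    permissions_boundary_arn = "arn:aws:iam::<account-id>:policy/<permissions-boundary>"
    rds_user_password        = "<user-database-password>"
    prefix                   = "<prefix>"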

    Once configured, the module will deploy the lambda and run it on each deployment, thus creating the configured database (if it does not exist), updating the user password (if that value has been changed), and updating the output user database secret.

    Setting provision_user_database to false after provisioning will not result in removal of the configured database, as the lambda is non-destructive as configured in this module.

    note

    This functionality is limited in that it will only provision a single database/user and configure a basic database, and should not be used in scenarios where more complex configuration is required.

    Initialize Terraform

    Run terraform init

    You should see a similar output:

    * provider.aws: version = "~> 2.32"

    Terraform has been successfully initialized!

    Deploy

    Run terraform apply to deploy the resources.

    caution

    If re-applying this module, variables (e.g. engine_version, snapshot_identifier) that force a recreation of the database cluster may result in data loss if deletion protection is disabled. Examine the changeset carefully for resources that will be re-created/destroyed before applying.

    Review the changeset, and assuming it looks correct, type yes when prompted to confirm that you want to create all of the resources.

    Assuming the operation is successful, you should see output similar to the following (this example omits the creation of a user's database, lambdas, and security groups):

    Output Example
    terraform apply

    An execution plan has been generated and is shown below.
    Resource actions are indicated with the following symbols:
    + create

    Terraform will perform the following actions:

    # module.rds_cluster.aws_db_subnet_group.default will be created
    + resource "aws_db_subnet_group" "default" {
    + arn = (known after apply)
    + description = "Managed by Terraform"
    + id = (known after apply)
    + name = (known after apply)
    + name_prefix = "xxxxxxxxx"
    + subnet_ids = [
    + "subnet-xxxxxxxxx",
    + "subnet-xxxxxxxxx",
    ]
    + tags = {
    + "Deployment" = "xxxxxxxxx"
    }
    }

    # module.rds_cluster.aws_rds_cluster.cumulus will be created
    + resource "aws_rds_cluster" "cumulus" {
    + apply_immediately = true
    + arn = (known after apply)
    + availability_zones = (known after apply)
    + backup_retention_period = 1
    + cluster_identifier = "xxxxxxxxx"
    + cluster_identifier_prefix = (known after apply)
    + cluster_members = (known after apply)
    + cluster_resource_id = (known after apply)
    + copy_tags_to_snapshot = false
    + database_name = "xxxxxxxxx"
    + db_cluster_parameter_group_name = (known after apply)
    + db_subnet_group_name = (known after apply)
    + deletion_protection = true
    + enable_http_endpoint = true
    + endpoint = (known after apply)
    + engine = "aurora-postgresql"
    + engine_mode = "serverless"
    + engine_version = "10.12"
    + final_snapshot_identifier = "xxxxxxxxx"
    + hosted_zone_id = (known after apply)
    + id = (known after apply)
    + kms_key_id = (known after apply)
    + master_password = (sensitive value)
    + master_username = "xxxxxxxxx"
    + port = (known after apply)
    + preferred_backup_window = "07:00-09:00"
    + preferred_maintenance_window = (known after apply)
    + reader_endpoint = (known after apply)
    + skip_final_snapshot = false
    + storage_encrypted = (known after apply)
    + tags = {
    + "Deployment" = "xxxxxxxxx"
    }
    + vpc_security_group_ids = (known after apply)

    + scaling_configuration {
    + auto_pause = true
    + max_capacity = 4
    + min_capacity = 2
    + seconds_until_auto_pause = 300
    + timeout_action = "RollbackCapacityChange"
    }
    }

    # module.rds_cluster.aws_secretsmanager_secret.rds_login will be created
    + resource "aws_secretsmanager_secret" "rds_login" {
    + arn = (known after apply)
    + id = (known after apply)
    + name = (known after apply)
    + name_prefix = "xxxxxxxxx"
    + policy = (known after apply)
    + recovery_window_in_days = 30
    + rotation_enabled = (known after apply)
    + rotation_lambda_arn = (known after apply)
    + tags = {
    + "Deployment" = "xxxxxxxxx"
    }

    + rotation_rules {
    + automatically_after_days = (known after apply)
    }
    }

    # module.rds_cluster.aws_secretsmanager_secret_version.rds_login will be created
    + resource "aws_secretsmanager_secret_version" "rds_login" {
    + arn = (known after apply)
    + id = (known after apply)
    + secret_id = (known after apply)
    + secret_string = (sensitive value)
    + version_id = (known after apply)
    + version_stages = (known after apply)
    }

    # module.rds_cluster.aws_security_group.rds_cluster_access will be created
    + resource "aws_security_group" "rds_cluster_access" {
    + arn = (known after apply)
    + description = "Managed by Terraform"
    + egress = (known after apply)
    + id = (known after apply)
    + ingress = (known after apply)
    + name = (known after apply)
    + name_prefix = "cumulus_rds_cluster_access_ingress"
    + owner_id = (known after apply)
    + revoke_rules_on_delete = false
    + tags = {
    + "Deployment" = "xxxxxxxxx"
    }
    + vpc_id = "vpc-xxxxxxxxx"
    }

    # module.rds_cluster.aws_security_group_rule.rds_security_group_allow_PostgreSQL will be created
    + resource "aws_security_group_rule" "rds_security_group_allow_postgres" {
    + from_port = 5432
    + id = (known after apply)
    + protocol = "tcp"
    + security_group_id = (known after apply)
    + self = true
    + source_security_group_id = (known after apply)
    + to_port = 5432
    + type = "ingress"
    }

    Plan: 6 to add, 0 to change, 0 to destroy.

    Do you want to perform these actions?
    Terraform will perform the actions described above.
    Only 'yes' will be accepted to approve.

    Enter a value: yes

    module.rds_cluster.aws_db_subnet_group.default: Creating...
    module.rds_cluster.aws_security_group.rds_cluster_access: Creating...
    module.rds_cluster.aws_secretsmanager_secret.rds_login: Creating...

    Then, after the resources are created:

    Apply complete! Resources: X added, 0 changed, 0 destroyed.
    Releasing state lock. This may take a few moments...

    Outputs:

    admin_db_login_secret_arn = arn:aws:secretsmanager:us-east-1:xxxxxxxxx:secret:xxxxxxxxxx20210407182709367700000002-dpmdR
    admin_db_login_secret_version = xxxxxxxxx
    rds_endpoint = xxxxxxxxx.us-east-1.rds.amazonaws.com
    security_group_id = xxxxxxxxx
    user_credentials_secret_arn = arn:aws:secretsmanager:us-east-1:xxxxx:secret:xxxxxxxxxx20210407182709367700000002-dpmpXA

Note the output values for admin_db_login_secret_arn and, optionally, user_credentials_secret_arn. These identify the AWS Secrets Manager secrets required to access the database as the administrative user and, optionally, the user database credentials that Cumulus requires.

The content of each of these secrets is of the form:

    {
    "database": "postgres",
    "dbClusterIdentifier": "clusterName",
    "engine": "postgres",
    "host": "xxx",
    "password": "defaultPassword",
    "port": 5432,
    "username": "xxx"
    }
    • database -- the PostgreSQL database used by the configured user
    • dbClusterIdentifier -- the value set by the cluster_identifier variable in the terraform module
    • engine -- the Aurora/RDS database engine
    • host -- the RDS service host for the database in the form (dbClusterIdentifier)-(AWS ID string).(region).rds.amazonaws.com
    • password -- the database password
    • username -- the account username
• port -- the database connection port; this should always be 5432
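If you want to inspect these values from the command line, you can retrieve a secret with the AWS CLI. This is a minimal sketch; the secret ARN below is a placeholder for the admin_db_login_secret_arn (or user_credentials_secret_arn) value from your own terraform output.

# Placeholder ARN -- substitute the ARN from your own deployment's outputs.
aws secretsmanager get-secret-value \
  --secret-id "arn:aws:secretsmanager:us-east-1:xxxxxxxxx:secret:xxxxxxxxxx" \
  --query SecretString \
  --output text

The command prints the JSON object shown above; you can pipe it through a tool such as jq if you want an individual field.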

    Connect to PostgreSQL DB via pgAdmin

If you would like to manage your PostgreSQL database in a GUI tool, you can do so via pgAdmin.

    Requirements

    SSH Setup in AWS Secrets Manager

You will need to navigate to AWS Secrets Manager and retrieve the secret values for your database. The secret name will contain the string _db_login and your prefix. Click the "Retrieve secret value" button to see the secret values.

    The value for your secret name can also be retrieved from the data-persistence-tf directory with the command terraform output.

    pgAdmin values to retrieve

    Setup ~/.ssh/config

    Replace HOST value and PORT value with the values retrieved from Secrets Manager.

    The LocalForward number 9202 can be any unused LocalForward number in your SSH config:

    Host ssm-proxy
    Hostname 127.0.0.1
    User ec2-user
    LocalForward 9202 [HOST value]:[PORT value]
    IdentityFile ~/.ssh/id_rsa
    Port 6868

    Create a Local Port Forward

    • Create a local port forward to port 22 on the SSM box; this creates a tunnel from <local ssh port> to the SSH port on the SSM host.
    caution

    <local ssh port> should not be 8000.

    • In the following command, replace <instance id> with your instance ID:
    aws ssm start-session --target <instance id> --document-name AWS-StartPortForwardingSession --parameters portNumber=22,localPortNumber=6868
    • Then, in another terminal tab, enter:
    ssh ssm-proxy
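Before configuring pgAdmin, you can optionally verify the tunnel from the command line with psql, if you have it installed. This is a sketch only: the port is the LocalForward number from your ~/.ssh/config, and the username and database are the values from the Secrets Manager secret.

# 9202 is the LocalForward number from ~/.ssh/config; replace <username>
# and <database> with the values retrieved from Secrets Manager.
psql -h 127.0.0.1 -p 9202 -U <username> -d <database>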

    Create PgAdmin Server

    • Open pgAdmin and begin creating a new server (in newer versions this may be called registering a new server).

    Creating a pgAdmin server

    • In the "Connection" tab, enter the values retrieved from Secrets Manager. Host name/address and Port should be the Hostname and LocalForward number from the ~/.ssh/config file.

    pgAdmin server connection value entries

    note

    Maintenance database corresponds to "database".

    You can select "Save Password?" to save your password. Click "Save" when you are finished. You should see your new server in pgAdmin.

    Query Your Database

    • In the "Browser" area find your database, navigate to the name, and click on it.

    • Select the "Query Editor" to begin writing queries to your database.

    Using the query editor in pgAdmin

    You are all set to manage your queries in pgAdmin!


    Next Steps

    Your database cluster has been created/updated! From here you can continue to add additional user accounts, databases, and other database configurations.

    Version: Next

    Share S3 Access Logs

    It is possible through Cumulus to share S3 access logs across multiple S3 packages using the S3 replicator package.

    S3 Replicator

    The S3 Replicator is a Node.js package that contains a simple Lambda function, associated permissions, and the Terraform instructions to replicate create-object events from one S3 bucket to another.

    First, ensure that you have enabled S3 Server Access Logging.

    Next, configure your terraform.tfvars as described in the s3-replicator/README.md to correspond to your deployment. The source_bucket and source_prefix are determined by how you enabled the S3 Server Access Logging.
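As a rough sketch (all bucket names and prefixes below are placeholders, and the exact variable shape depends on how your main.tf passes values to the module), the corresponding terraform.tfvars entry might be appended like this:

# Append a placeholder s3_replicator_config block to terraform.tfvars.
cat >> terraform.tfvars <<'EOF'
s3_replicator_config = {
  source_bucket = "my-access-logs-bucket"   # bucket receiving S3 Server Access Logs (placeholder)
  source_prefix = "s3-access-logs/"         # prefix configured for access logging (placeholder)
  target_bucket = "destination-bucket"      # bucket to replicate log objects into (placeholder)
  target_prefix = "replicated-logs/"        # prefix to write replicated objects under (placeholder)
}
EOF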

In order to deploy the s3-replicator with Cumulus, you will need to add the module to your Terraform main.tf definition, as in the example below:

    module "s3-replicator" {
    source = "<path to s3-replicator.zip>"
    prefix = var.prefix
    vpc_id = var.vpc_id
    subnet_ids = var.subnet_ids
    permissions_boundary = var.permissions_boundary_arn
    source_bucket = var.s3_replicator_config.source_bucket
    source_prefix = var.s3_replicator_config.source_prefix
    target_bucket = var.s3_replicator_config.target_bucket
    target_prefix = var.s3_replicator_config.target_prefix
    }

    The Terraform source package can be found on the Cumulus GitHub Release page under the asset tab terraform-aws-cumulus-s3-replicator.zip.

    ESDIS Metrics

    In the NGAP environment, the ESDIS Metrics team has set up an ELK stack to process logs from Cumulus instances. To use this system, you must deliver any S3 Server Access logs that Cumulus creates.

    Configure the S3 Replicator as described above using the target_bucket and target_prefix provided by the Metrics team.

    The Metrics team has taken care of setting up Logstash to ingest the files that get delivered to their bucket into their Elasticsearch instance.

    info

    For a more in-depth overview regarding ESDIS Metrics view the Cumulus Distribution Metrics section.

Terraform Best Practices

You can check whether there are any dangling resources left behind for any reason by running the following AWS CLI command, replacing PREFIX with your deployment prefix name:

    aws resourcegroupstaggingapi get-resources \
    --query "ResourceTagMappingList[].ResourceARN" \
    --tag-filters Key=Deployment,Values=PREFIX

    Ideally, the output should be an empty list, but if it is not, then you may need to manually delete the listed resources.

    Version: Next

    Using the Thin Egress App (TEA) for Cumulus Distribution

    The Thin Egress App (TEA) is an app running in Lambda that allows retrieving data from S3 using temporary links and provides URS integration.

    Configuring a TEA Deployment

    TEA is deployed using Terraform modules. Refer to these instructions for guidance on how to integrate new components with your deployment.

The cumulus-template-deploy repository's cumulus-tf/main.tf contains a thin_egress_app module for distribution.

The TEA module provides these instructions for adding it to your deployment; the sections below describe how to configure the thin_egress_app module in your Cumulus deployment.

    Create a Secret for Signing Thin Egress App JWTs

    The Thin Egress App uses JSON Web Tokens (JWTs) internally to authenticate requests and requires a secret stored in AWS Secrets Manager containing SSH keys that are used to sign the JWTs.

    See the Thin Egress App documentation on how to create this secret with the correct values. It will be used later to set the thin_egress_jwt_secret_name variable when deploying the Cumulus module.
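As a rough sketch only (the secret name is a placeholder, and the required secret content and field names are defined by the TEA documentation rather than shown here), creating the secret might look like:

# Generate an RSA key pair for signing JWTs (filenames are arbitrary).
ssh-keygen -t rsa -b 4096 -m PEM -f ./tea-jwt-key -N ''

# Build jwt-secret.json with the fields described in the TEA documentation,
# then store it in Secrets Manager under a name of your choosing.
aws secretsmanager create-secret \
  --name "<prefix>-thin-egress-jwt-secret" \
  --secret-string file://jwt-secret.json

Whatever secret name you choose here is the value you will later pass as thin_egress_jwt_secret_name.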

    Bucket_map.yaml

    The Thin Egress App uses a bucket_map.yaml file to determine which buckets to serve. Documentation of the file format is available here.

    The default Cumulus module generates a file at s3://${system_bucket}/distribution_bucket_map.json.

    The configuration file is a simple JSON mapping of the form:

    {
    "daac-public-data-bucket": "/path/to/this/kind/of/data"
    }
    info

    Cumulus only supports a one-to-one mapping of bucket->TEA path for 'distribution' buckets.
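If you want to inspect the bucket map that the Cumulus module generated for your deployment, you can stream it from S3; the bucket name below is a placeholder for your system bucket.

# Print the generated distribution bucket map to stdout.
aws s3 cp s3://<system_bucket>/distribution_bucket_map.json -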

    Optionally Configure a Custom Bucket Map

    A simple configuration would look something like this:

    bucket_map.yaml
    MAP:
    my-protected: my-protected
    my-public: my-public

    PUBLIC_BUCKETS:
    - my-public
    caution

    Your custom bucket map must include mappings for all of the protected and public buckets specified in the buckets variable in cumulus-tf/terraform.tfvars, otherwise Cumulus may not be able to determine the correct distribution URL for ingested files and you may encounter errors.
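If you do provide a custom bucket map, one hedged sketch of getting it into place is to upload the YAML to your system bucket and then point your TEA/Cumulus configuration at that object. The bucket and key below are placeholders; consult the TEA module documentation for the exact variable that references the bucket map object.

# Upload the custom bucket map to the system bucket (names are placeholders).
aws s3 cp bucket_map.yaml s3://<system_bucket>/bucket_map.yaml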

    Optionally Configure Shared Variables

    The cumulus module deploys certain components that interact with TEA. As a result, the cumulus module requires that if you are specifying a value for the stage_name variable to the TEA module, you must use the same value for the tea_api_gateway_stage variable to the cumulus module.

    One way to keep these variable values in sync across the modules is to use Terraform local values to define values to use for the variables for both modules. This approach is shown in the Cumulus Core example deployment code.

Upgrading Cumulus

After the upgrade, verify that your deployment functions correctly. Please refer to the recommended smoke tests given above, and consider additional tests appropriate for your particular deployment and environment.

    Update Cumulus Dashboard

    If there are breaking (or otherwise significant) changes to the Cumulus API, you should also upgrade your Cumulus Dashboard deployment to use the version of the Cumulus API matching the version of Cumulus to which you are migrating.

    Version: Next

    Issuing PR From Forked Repos

    Fork the Repo

    • Fork the Cumulus repo
    • Create a new branch from the branch you'd like to contribute to
    • If an issue doesn't already exist, submit one (see above)

    Create a Pull Request

    Reviewing PRs from Forked Repos

    Upon submission of a pull request, the Cumulus development team will review the code.

    Once the code passes an initial review, the team will run the CI tests against the proposed update.

    The request will then either be merged, declined, or an adjustment to the code will be requested via the issue opened with the original PR request.

PRs from forked repos cannot be directly merged to master. Cumulus reviewers must follow these steps before completing the review process:

    1. Create a new branch:

        git checkout -b from-<name-of-the-branch> master
    2. Push the new branch to GitHub (see the command sketch after this list)

    3. Change the destination of the forked PR to the new branch that was just pushed

      Screenshot of Github interface showing how to change the base branch of a pull request

    4. After code review and approval, merge the forked PR to the new branch.

    5. Create a PR for the new branch to master.

    6. If the CI tests pass, merge the new branch to master and close the issue. If the CI tests do not pass, request an amended PR from the original author or resolve the failures as appropriate.
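Steps 1, 2, and 5 above can be performed from the command line; a sketch follows (the branch name is a placeholder):

# Step 1: create a review branch off master (placeholder branch name).
git checkout -b from-<name-of-the-branch> master
# Step 2: push the new branch to GitHub.
git push origin from-<name-of-the-branch>
# Step 5 (after the forked PR is merged into the review branch): open a PR
# from from-<name-of-the-branch> to master via the GitHub UI.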

Integration Tests

    If you create a new stack and want to be able to run integration tests against it in CI, you will need to add it to bamboo/select-stack.js.

Code Coverage and Quality

    To run linting on the markdown files, run npm run lint-md.

    Audit

    This project uses audit-ci to run a security audit on the package dependency tree. This must pass prior to merge. The configured rules for audit-ci can be found here.

    To execute an audit, run npm run audit.

Versioning and Releases

This is a backport and patch release on the 13.3.x series of releases. Updates that are included in the future will have a corresponding CHANGELOG entry in future releases.

    Troubleshooting

    Delete and regenerate the tag

    To delete a published tag to re-tag, follow these steps:

      git tag -d vMAJOR.MINOR.PATCH
    git push -d origin vMAJOR.MINOR.PATCH

    e.g.:
    git tag -d v9.1.0
    git push -d origin v9.1.0
    Version: Next

    Cumulus Documentation: How To's

    Cumulus Docs Installation

    Run a Local Server

    Environment variables DOCSEARCH_APP_ID, DOCSEARCH_API_KEY and DOCSEARCH_INDEX_NAME must be set for search to work. At the moment, search is only truly functional on prod because that is the only website we have registered to be indexed with DocSearch (see below on search).

    git clone git@github.com:nasa/cumulus
    cd cumulus
    npm run docs-install
    npm run docs-serve
    note

    docs-build will build the documents into website/build. docs-clear will clear the documents.

    caution

    Fix any broken links reported by Docusaurus if you see the following messages during build.

    [INFO] Docusaurus found broken links!

    Exhaustive list of all broken links found:

    Cumulus Documentation

Our project documentation is hosted on GitHub Pages. The resources published to this website are housed in the docs/ directory at the top of the Cumulus repository. Those resources primarily consist of markdown files and images.

    We use the open-source static website generator Docusaurus to build html files from our markdown documentation, add some organization and navigation, and provide some other niceties in the final website (search, easy templating, etc.).

    Add a New Page and Sidebars

    Adding a new page should be as simple as writing some documentation in markdown, placing it under the correct directory in the docs/ folder and adding some configuration values wrapped by --- at the top of the file. There are many files that already have this header which can be used as reference.

    ---
    id: doc-unique-id # unique id for this document. This must be unique across ALL documentation under docs/
    title: Title Of Doc # Whatever title you feel like adding. This will show up as the index to this page on the sidebar.
    hide_title: false
    ---
    note

    To have the new page show up in a sidebar the designated id must be added to a sidebar in the website/sidebars.js file. Docusaurus has an in depth explanation of sidebars here.

    Versioning Docs

We lean heavily on Docusaurus for versioning. Their suggestions and walk-through can be found here. Docusaurus v2 uses a snapshot approach for documentation versioning: each versioned set of docs is independent of the others. It is worth noting that we would like the documentation versions to match up directly with release versions. However, because a new set of versioned docs can take up a lot of repo space and requires maintenance, we suggest updating the existing versioned docs for minor releases when there are no significant functionality changes. Cumulus versioning is explained in the Versioning Docs.

Search

Search on our documentation site is taken care of by DocSearch. We have been provided with an apiId, an apiKey, and an indexName by DocSearch that we include in our website/docusaurus.config.js file. The rest, indexing and actual searching, we leave to DocSearch. Our builds expect environment variables for these values to exist: DOCSEARCH_APP_ID, DOCSEARCH_API_KEY, and DOCSEARCH_INDEX_NAME.

    Add a new task

The tasks list in docs/tasks.md is generated from the list of task packages in the tasks folder. Do not edit the docs/tasks.md file directly.

    Read more about adding a new task.

    Editing the tasks.md header or template

    Look at the bin/build-tasks-doc.js and bin/tasks-header.md files to edit the output of the tasks build script.

    Editing diagrams

    For some diagrams included in the documentation, the raw source is included in the docs/assets/raw directory to allow for easy updating in the future:

    • assets/interfaces.svg -> assets/raw/interfaces.drawio (generated using draw.io)

    Deployment

The master branch is automatically built and deployed to the gh-pages branch. The gh-pages branch is served by GitHub Pages. Do not make edits to the gh-pages branch.

    Version: Next

    External Contributions

    Contributions to Cumulus may be made in the form of PRs to the repositories directly or through externally developed tasks and components. Cumulus is designed as an ecosystem that leverages Terraform deployments and AWS Step Functions to easily integrate external components.

    This list may not be exhaustive and represents components that are open source, owned externally, and that have been tested with the Cumulus system. For more information and contributing guidelines, visit the respective GitHub repositories.

    Distribution

    The ASF Thin Egress App is used by Cumulus for distribution. TEA can be deployed with Cumulus or as part of other applications to distribute data.

    Operational Cloud Recovery Archive (ORCA)

    ORCA can be deployed with Cumulus to provide a customizable baseline for creating and managing operational backups.

    Workflow Tasks

    CNM

    PO.DAAC provides two workflow tasks to be used with the Cloud Notification Mechanism (CNM) Schema: CNM to Granule and CNM Response.

    See the CNM workflow data cookbook for an example of how these can be used in a Cumulus ingest workflow.

    DMR++ Generation

GHRC has provided a DMR++ Generation workflow task. This task is meant to be used in conjunction with Cumulus' Hyrax Metadata Updates workflow task.

    Version: Next

    Frequently Asked Questions

    Below are some commonly asked questions that you may encounter that can assist you along the way when working with Cumulus.

    General | Workflows | Integrators & Developers | Operators


    General

    What prerequisites are needed to setup Cumulus?

Answer: Here is a list of the tools and access that you will need in order to get started. To keep to the up-to-date versions that we are using, please visit our Cumulus main README for details.

    • NVM for node versioning
    • AWS CLI
    • Bash
    • Docker (only required for testing)
    • docker-compose (only required for testing; install with pip install docker-compose)
    • Python
    • pipenv
Keep in mind you will need login credentials for the AWS console and an Earthdata account before you can deploy Cumulus.

    What is the preferred web browser for the Cumulus environment?

    Answer: Our preferred web browser is the latest version of Google Chrome.

    How do I deploy a new instance in Cumulus?

    Answer: For steps on the Cumulus deployment process go to How to Deploy Cumulus.

    Where can I find Cumulus release notes?

    Answer: To get the latest information about updates to Cumulus go to Cumulus Versions.

    How do I quickly troubleshoot an issue in Cumulus?

    Answer: To troubleshoot and fix issues in Cumulus reference our recommended solutions in Troubleshooting Cumulus.

    Where can I get support help?

    Answer: The following options are available for assistance:

    • Cumulus: Outside NASA users should file a GitHub issue and inside NASA users should file a Cumulus JIRA ticket.
    • AWS: You can create a case in the AWS Support Center, accessible via your AWS Console.
    info

    For more information on how to submit an issue or contribute to Cumulus follow our guidelines at Contributing.


    Workflows

    What is a Cumulus workflow?

    Answer: A workflow is a provider-configured set of steps that describe the process to ingest data. Workflows are defined using AWS Step Functions. For more details, we suggest visiting the Workflows section.

    How do I set up a Cumulus workflow?

    Answer: You will need to create a provider, have an associated collection (add a new one), and generate a new rule first. Then you can set up a Cumulus workflow by following these steps here.

    Where can I find a list of workflow tasks?

    Answer: You can access a list of reusable tasks for Cumulus development at Cumulus Tasks.

    Are there any third-party workflows or applications that I can use with Cumulus?

    Answer: The Cumulus team works with various partners to help build a robust framework. You can visit our External Contributions section to see what other options are available to help you customize Cumulus for your needs.


    Integrators & Developers

    What is a Cumulus integrator?

    Answer: Those who are working within Cumulus and AWS for deployments and to manage workflows. They may perform the following functions:

    • Configure and deploy Cumulus to the AWS environment
    • Configure Cumulus workflows
    • Write custom workflow tasks
    What are the steps if I run into an issue during deployment?

    Answer: If you encounter an issue with your deployment go to the Troubleshooting Deployment guide.

    Is Cumulus customizable and flexible?

Answer: Yes. Cumulus has a modular architecture that allows you to decide which components you want/need to deploy. These components are maintained as Terraform modules.

    What are Terraform modules?

Answer: They are modules that are composed to create a Cumulus deployment, which gives integrators the flexibility to choose the components of Cumulus that they want/need. To view Cumulus-maintained modules or steps on how to create a module, go to Terraform modules.

Where do I find Terraform module variables?

    Answer: Go here for a list of Cumulus maintained variables.

    What are the common use cases that a Cumulus integrator encounters?

    Answer: The following are some examples of possible use cases you may see:


    Operators

    What is a Cumulus operator?

Answer: Those who ingest, archive, and troubleshoot datasets (called collections in Cumulus). Your daily activities might include, but are not limited to, the following:

    • Ingesting datasets
    • Maintaining historical data ingest
    • Starting and stopping data handlers
    • Managing collections
    • Managing provider definitions
    • Creating, enabling, and disabling rules
    • Investigating errors for granules and deleting or re-ingesting granules
    • Investigating errors in executions and isolating failed workflow step(s)
    What are the common use cases that a Cumulus operator encounters?

    Answer: The following are some examples of possible use cases you may see:

    Explore more Cumulus operator best practices and how-tos in the dedicated Operator Docs.

    Can you re-run a workflow execution in AWS?

    Answer: Yes. For steps on how to re-run a workflow execution go to Re-running workflow executions in the Cumulus Operator Docs.

    Version: Next

    Ancillary Metadata Export

    This feature utilizes the type key on a files object in a Cumulus granule. It uses the key to provide a mechanism where granule discovery, processing and other tasks can set and use this value to facilitate metadata export to CMR.

    Tasks setting type

    Discover Granules

Uses the Collection type key to set the value for files on discovered granules in its output.

    Parse PDR

    Uses a task-specific mapping to map PDR 'FILE_TYPE' to a CNM type to set type on granules from the PDR.

    CNMToCMALambdaFunction

    Natively supports types that are included in incoming messages to a CNM Workflow.

    Tasks using type

    Move Granules

    Uses the granule file type key to update UMM/ECHO 10 CMR files passed in as candidates to the task. This task adds the external facing URLs to the CMR metadata file based on the type. See the file tracking data cookbook for a detailed mapping. If a non-CNM type is specified, the task assumes it is a 'data' file.

Cumulus Backup and Restore

  • Set the snapshot_identifier variable to the snapshot you wish to create, and configure the module like a new deployment, with a unique cluster_identifier

  • Deploy the module using terraform apply

  • Once deployed, verify the cluster has the expected data

  • Redeploy the data persistence and Cumulus deployments - You should not need to reconfigure either, as the secret ARN and the security group should not change, however double-check the configured values are as expected

    Version: Next

    Cumulus Dead Letter Archive

    This documentation explains the Cumulus dead letter archive and associated functionality.

    DB Records DLQ Archive

    The Cumulus system contains a number of dead letter queues. Perhaps the most important system lambda function supported by a DLQ is the sfEventSqsToDbRecords lambda function which parses Cumulus messages from workflow executions to generate and write database records to the Cumulus database.

    As of Cumulus v9+, the dead letter queue for this lambda (named sfEventSqsToDbRecordsDeadLetterQueue) has been updated with a consumer lambda that will automatically write any incoming records to the S3 system bucket, under the path <stackName>/dead-letter-archive/sqs/. This will allow integrators and operators engaged in debugging missing records to inspect any Cumulus messages which failed to process and did not result in the successful creation of database records.
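To see what has accumulated in the archive, you can list that path with the AWS CLI; the bucket and stack name below are placeholders for your own system bucket and stack.

# List archived dead letter messages for your stack.
aws s3 ls s3://<system-bucket>/<stackName>/dead-letter-archive/sqs/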

    Dead Letter Archive recovery

    In addition to the above, as of Cumulus v9+, the Cumulus API also contains a new endpoint at /deadLetterArchive/recoverCumulusMessages.

    Sending a POST request to this endpoint will trigger a Cumulus AsyncOperation that will attempt to reprocess (and if successful delete) all Cumulus messages in the dead letter archive, using the same underlying logic as the existing sfEventSqsToDbRecords. Otherwise, all Cumulus messages that fail to be reprocessed will be moved to a new archive location under the path <stackName>/dead-letter-archive/failed-sqs/<YYYY-MM-DD>.
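A minimal sketch of triggering the recovery follows, assuming you already have a Cumulus API root URL and an authorization token in the CUMULUS_API and TOKEN environment variables (both placeholders):

# Start an AsyncOperation that reprocesses the dead letter archive.
curl -X POST "$CUMULUS_API/deadLetterArchive/recoverCumulusMessages" \
  -H "Authorization: Bearer $TOKEN"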

This endpoint may prove particularly useful when recovering from an extended or unexpected database outage, where messages failed to process due to the external outage and there is no essential malformation of each Cumulus message.

    Version: Next

    Dead Letter Queues

    startSF SQS queue

The workflow-trigger for the startSF queue has a Redrive Policy set up that directs any failed attempts to pull from the workflow start queue to an SQS Dead Letter Queue.

    This queue can then be monitored for failures to initiate a workflow. Please note that workflow failures will not show up in this queue, only repeated failure to trigger a workflow.

    Named Lambda Dead Letter Queues

    Cumulus provides configured Dead Letter Queues (DLQ) for non-workflow Lambdas (such as ScheduleSF) to capture Lambda failures for further processing.

These DLQs are set up with the following configuration:

      receive_wait_time_seconds  = 20
    message_retention_seconds = 1209600
    visibility_timeout_seconds = 60

    Default Lambda Configuration

The following built-in Cumulus Lambdas are set up with DLQs to allow handling of process failures:

    • dbIndexer (Updates Elasticsearch)
    • JobsLambda (writes log outputs to Elasticsearch)
    • ScheduleSF (the SF Scheduler Lambda that places messages on the queue that is used to start workflows, see Workflow Triggers)
    • publishReports (Lambda that publishes messages to the SNS topics for execution, granule and PDR reporting)
    • reportGranules, reportExecutions, reportPdrs (Lambdas responsible for updating records based on messages in the queues published by publishReports)

    Troubleshooting/Utilizing messages in a Dead Letter Queue

    Ideally an automated process should be configured to poll the queue and process messages off a dead letter queue.

For aid in manually troubleshooting, you can utilize the SQS Management console to view messages available in the queues set up for a particular stack. The dead letter queues will have a Message Body containing the Lambda payload, as well as Message Attributes that reference both the error returned and a RequestID which can be cross-referenced to the associated Lambda's CloudWatch logs for more information:

    Screenshot of the AWS SQS console showing how to view SQS message attributes
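As an alternative to the console, a hedged sketch of pulling a sample message off a dead letter queue with the AWS CLI (the queue URL is a placeholder):

# Fetch one message, including its attributes, without deleting it.
aws sqs receive-message \
  --queue-url "https://sqs.us-east-1.amazonaws.com/<account-id>/<queue-name>" \
  --max-number-of-messages 1 \
  --attribute-names All \
  --message-attribute-names All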

    Version: Next

    Cumulus Distribution Metrics

    It is possible to configure Cumulus and the Cumulus Dashboard to display information about the successes and failures of requests for data. This requires the Cumulus instance to deliver Cloudwatch Logs and S3 Server Access logs to an ELK stack.

    ESDIS Metrics in NGAP

    Work with the ESDIS metrics team to set up permissions and access to forward Cloudwatch Logs to a shared AWS:Logs:Destination as well as transferring your S3 Server Access logs to a metrics team bucket.

    The metrics team has taken care of setting up logstash to ingest the files that get delivered to their bucket into their Elasticsearch instance.

Once Cumulus has been configured to deliver Cloudwatch logs to the ESDIS Metrics team, you can use the Elasticsearch indexes to create the necessary target patterns on the dashboard. These are often <daac>-cloudwatch-cumulus-<env>-* and <daac>-distribution-<env>-*, but they will depend on your specific Elasticsearch setup.

    Cumulus / ESDIS Metrics distribution system

    Architecture diagram showing how logs are replicated from a Cumulus instance to the ESDIS Metrics account and accessed by the Cumulus dashboard

    Version: Next

    Execution Payload Retention

In addition to CloudWatch logs and AWS Step Function API records, Cumulus automatically stores the initial and 'final' (the last update to the execution record) payload values as part of the Execution record in your RDS database and Elasticsearch.

    This allows access via the API (or optionally direct DB/Elasticsearch querying) for debugging/reporting purposes. The data is stored in the "originalPayload" and "finalPayload" fields.

    Payload record cleanup

    To reduce storage requirements, a CloudWatch rule ({stack-name}-dailyExecutionPayloadCleanupRule) triggering a daily run of the provided cleanExecutions lambda has been added. This lambda will remove all 'completed' and 'non-completed' payload records in the database that are older than the specified configuration.

    Configuration

    The following configuration flags have been made available in the cumulus module. They may be overridden in your deployment's instance of the cumulus module by adding the following configuration options:

daily_execution_payload_cleanup_schedule_expression (string)

    This configuration option sets the execution times for this Lambda to run, using a Cloudwatch cron expression.

    Default value is "cron(0 4 * * ? *)".

complete_execution_payload_timeout_disable (bool)

    This configuration option, when set to true, will disable all cleanup of completed execution payloads.

    Default value is false.

complete_execution_payload_timeout (number)

    This flag defines the cleanup threshold for executions with a 'completed' status in days. Records with updatedAt values older than this with payload information will have that information removed.

    Default value is 10.

non_complete_execution_payload_timeout_disable (bool)

    This configuration option, when set to true, will disable all cleanup of "non-complete" (any status other than completed) execution payloads.

    Default value is false.

non_complete_execution_payload_timeout (number)

    This flag defines the cleanup threshold for executions with a status other than 'complete' in days. Records with updateTime values older than this with payload information will have that information removed.

    Default value is 30 days.

    • complete_execution_payload_disable/non_complete_execution_payload_disable

    These flags (true/false) determine if the cleanup script's logic for 'complete' and 'non-complete' executions will run. Default value is false for both.

    Version: Next

    Writing logs for ESDIS Metrics

    info

    This feature is only available for Cumulus deployments in NGAP environments.

    Log messages delivered to the ESDIS metrics logs destination conforming to an expected format will be automatically ingested and parsed to enable helpful searching/filtering of your logs via the ESDIS metrics Kibana dashboard.

    Expected log format

    The ESDIS metrics pipeline expects a log message to be a JSON string representation of an object (dict in Python or map in Java). An example log message might look like:

    {
    "level": "info",
    "executions": "arn:aws:states:us-east-1:000000000000:execution:MySfn:abcd1234",
    "granules": "[\"granule-1\",\"granule-2\"]",
    "message": "hello world",
    "sender": "greetingFunction",
    "stackName": "myCumulus",
    "timestamp": "2018-10-19T19:12:47.501Z"
    }

    A log message can contain the following properties:

    • executions: The AWS Step Function execution name in which this task is executing, if any
    • granules: A JSON string of the array of granule IDs being processed by this code, if any
    • level: A string identifier for the type of message being logged. Possible values:
      • debug
      • error
      • fatal
      • info
      • warn
      • trace
    • message: String containing your actual log message
    • parentArn: The parent AWS Step Function execution ARN that triggered the current execution, if any
    • sender: The name of the resource generating the log message (e.g. a library name, a Lambda function name, an ECS activity name)
    • stackName: The unique prefix for your Cumulus deployment
    • timestamp: An ISO-8601 formatted timestamp
    • version: The version of the resource generating the log message, if any

    None of these properties are explicitly required for ESDIS metrics to parse your log correctly. However, a log without a message has no informational content. And having level, sender, and timestamp properties is very useful for filtering your logs. Including a stackName in your logs is helpful as it allows you to distinguish between logs generated by different deployments.

    Using Cumulus Message Adapter libraries

If you are writing a custom task that is integrated with the Cumulus Message Adapter, then some of the language-specific client libraries can be used to write logs compatible with ESDIS metrics.

    The usage of each library differs slightly, but in general a logger is initialized with a Cumulus workflow message to determine the contextual information for the task (e.g. granules, executions). Then, after the logger is initialized, writing logs only requires specifying a message, but the logged output will include the contextual information as well.

    Writing logs using custom code

    Any code that produces logs matching the expected log format can be processed by ESDIS metrics.

    Node.js

    Cumulus core provides a @cumulus/logger library that writes logs in the expected format for ESDIS metrics.

    Version: Next

    How to replay SQS messages archived in S3

    Context

    Cumulus archives all incoming SQS messages to S3 and removes messages once they have been processed. Unprocessed messages are archived at the path: ${stackName}/archived-incoming-messages/${queueName}/${messageId}

    Replay SQS messages endpoint

    The Cumulus API has added a new endpoint, /replays/sqs. This endpoint will allow you to start a replay operation to requeue all archived SQS messages by queueName and returns an AsyncOperationId for operation status tracking.

    Start replaying archived SQS messages

    In order to start a replay, you must perform a POST request to the replays/sqs endpoint.

    The required and optional fields that should be part of the body of this request are documented below.

• queueName (string) -- Any valid SQS queue name (not ARN)
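A minimal sketch of such a request follows, assuming placeholder CUMULUS_API and TOKEN environment variables hold your API root URL and authorization token:

# Requeue all archived messages for the named queue (queue name is a placeholder).
curl -X POST "$CUMULUS_API/replays/sqs" \
  -H "Authorization: Bearer $TOKEN" \
  -H "Content-Type: application/json" \
  -d '{"queueName": "<your-queue-name>"}'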

    Status tracking

    A successful response from the /replays/sqs endpoint will contain an asyncOperationId field. Use this ID with the /asyncOperations endpoint to track the status.

    Version: Next

    How to replay Kinesis messages after an outage

    After a period of outage, it may be necessary for a Cumulus operator to reprocess or 'replay' messages that arrived on an AWS Kinesis Data Stream but did not trigger an ingest. This document serves as an outline on how to start a replay operation, and how to perform status tracking. Cumulus supports replay of all Kinesis messages on a stream (subject to the normal RetentionPeriod constraints), or all messages within a given time slice delimited by start and end timestamps.

    As Kinesis has no comparable field to e.g. the SQS ReceiveCount on its records, Cumulus cannot tell which messages within a given time slice have never been processed, and cannot guarantee only missed messages will be processed. Users will have to rely on duplicate handling or some other method of identifying messages that should not be processed within the time slice.

    note

    This operation flow effectively changes only the trigger mechanism for Kinesis ingest notifications. The existence of valid Kinesis-type rules and all other normal requirements for the triggering of ingest via Kinesis still apply.

    Replays endpoint

    Cumulus has added a new endpoint to its API, /replays. This endpoint will allow you to start replay operations and returns an AsyncOperationId for operation status tracking.

    Start a replay

    In order to start a replay, you must perform a POST request to the replays endpoint.

    The required and optional fields that should be part of the body of this request are documented below.

    NOTE: As the endTimestamp relies on a comparison with the Kinesis server-side ApproximateArrivalTimestamp, and given that there is no documented level of accuracy for the approximation, it is recommended that the endTimestamp include some amount of buffer to allow for slight discrepancies. If tolerable, the same is recommended for the startTimestamp although it is used differently and less vulnerable to discrepancies since a server-side arrival timestamp should never be earlier than the client-side request timestamp.

• type (string, required) -- Currently only accepts kinesis.
    • kinesisStream (string, required for type kinesis) -- Any valid Kinesis stream name (not ARN).
    • kinesisStreamCreationTimestamp (optional) -- Any input valid for a JS Date constructor. For reasons to use this field, see the AWS documentation on StreamCreationTimestamp.
    • endTimestamp (optional) -- Any input valid for a JS Date constructor. Messages newer than this timestamp will be skipped.
    • startTimestamp (optional) -- Any input valid for a JS Date constructor. Messages will be fetched from the Kinesis stream starting at this timestamp. Ignored if it is further in the past than the stream's retention period.
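A minimal sketch of such a request follows, again assuming placeholder CUMULUS_API and TOKEN environment variables; the stream name and timestamps are placeholders as well:

# Replay messages from a Kinesis stream within an optional time slice.
curl -X POST "$CUMULUS_API/replays" \
  -H "Authorization: Bearer $TOKEN" \
  -H "Content-Type: application/json" \
  -d '{
    "type": "kinesis",
    "kinesisStream": "<your-stream-name>",
    "startTimestamp": "2018-10-01T00:00:00.000Z",
    "endTimestamp": "2018-10-02T00:00:00.000Z"
  }'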

    Status tracking

    A successful response from the /replays endpoint will contain an asyncOperationId field. Use this ID with the /asyncOperations endpoint to track the status.

Reconciliation Reports

The data buckets will include any buckets in your Cumulus buckets configuration that have type public, protected, or private.
    Version: Next

    Getting Started

    Overview | Quick Tutorials | Helpful Tips

    Overview

    This serves as a guide for new Cumulus users to deploy and learn how to use Cumulus. Here you will learn what you need in order to complete any prerequisites, what Cumulus is and how it works, and how to successfully navigate and deploy a Cumulus environment.

    What is Cumulus

Cumulus is an open source set of components for creating cloud-based data ingest, archive, distribution, and management systems designed for NASA's future Earth Science data streams.

    Who uses Cumulus

    Data integrators/developers and operators across projects not limited to NASA use Cumulus for their daily work functions.

    Cumulus Roles

    Integrator/Developer

    Cumulus integrators/developers are those who work within Cumulus and AWS for deployments and to manage workflows.

    Operator

    Cumulus operators are those who work within Cumulus to ingest/archive data and manage collections.

    Role Guides

    As a developer, integrator, or operator, you will need to set up your environments to work in Cumulus. The following docs can get you started in your role specific activities.

    What is a Cumulus Data Type

    In Cumulus, we have the following types of data that you can create and manage:

    • Collections
    • Granules
    • Providers
    • Rules
    • Workflows
    • Executions
    • Reports

    For details on how to create or manage data types go to Data Management Types.


    Quick Tutorials

    Deployment & Configuration

    Cumulus is deployed to an AWS account, so you must have access to deploy resources to an AWS account to get started.

    1. Set up Git Secrets

    To ensure your AWS access keys and passwords are protected as you submit commits we recommend setting up Git Secrets.

    2. Deploy Cumulus Core and Cumulus Dashboard to AWS

    Follow the deployment instructions to deploy Cumulus to your AWS account.

    3. Configure and Run the HelloWorld Workflow

    If you have deployed using the cumulus-template-deploy repository, you have a HelloWorld workflow deployed to your Cumulus backend.

    You can see your deployed workflows on the Workflows page of your Cumulus dashboard.

    Configure a collection and provider using the setup guidance on the Cumulus dashboard.

    Then create a rule to trigger your HelloWorld workflow. You can select a rule type of one time.

    Navigate to the Executions page of the dashboard to check the status of your workflow execution.

    4. Configure a Custom Workflow

    See Developing a custom workflow documentation for adding a new workflow to your deployment.

    There are plenty of workflow examples using Cumulus tasks here. The Data Cookbooks provide a more in-depth look at some of these more advanced workflows and their configurations.

    There is a list of Cumulus tasks already included in your deployment here.

    After configuring your workflow and redeploying, you can configure and run your workflow using the same steps as in step 2.


    Helpful Tips

    Here are some useful tips to keep in mind when deploying or working in Cumulus.

    Integrator/Developer

    • Versioning and Releases: This documentation gives information on our global versioning approach. We suggest upgrading to the supported version for Cumulus, Cumulus dashboard, and Thin Egress App (TEA).
    • Cumulus Developer Documentation: We suggest that you read through and reference this resource for development best practices in Cumulus.
    • Cumulus Deployment: We will guide you on how to manually deploy a new instance of Cumulus. In this reference, you will learn how to install Terraform, create an AWS S3 bucket, configure a compatible database, and create a Lambda layer.
    • Terraform Best Practices: This will help guide you through your Terraform configuration and Cumulus deployment. For an introduction about Terraform, go to Terraform's official site.
    • Integrator Common Use Cases: Scenarios to help integrators along in the Cumulus environment.

    Operator

    Troubleshooting

    Troubleshooting: Some suggestions to help you troubleshoot and solve issues you may encounter.

    Resources

    Version: Next

    Glossary

    AWS Glossary

    For terms/items from Amazon/AWS not mentioned in this glossary, please refer to the AWS Glossary.

    Cumulus Glossary of Terms

    API Gateway

    Refers to AWS's API Gateway. Used by the Cumulus API.

    ARN

    Refers to an AWS "Amazon Resource Name".

    For more info, see the AWS documentation.

    AWS

    See: Amazon Web Services documentation.

    AWS Lambda/Lambda Function

    AWS's 'serverless' option. Allows the running of code without provisioning a service or managing server/ECS instances/etc.

    For more information, see the AWS Lambda documentation.

    AWS Access Keys

Access credentials that give you access to AWS to act as an IAM user programmatically or from the command line.

    For more information, see the AWS IAM Documentation.

    Bucket

    An Amazon S3 cloud storage resource.

    For more information, see the AWS Bucket Documentation.

    CloudFormation

    An AWS service that allows you to define and manage cloud resources as a preconfigured block.

    For more information, see the AWS CloudFormation User Guide.

    Cloudformation Template

A template that defines an AWS CloudFormation stack.

    For more information, see the AWS intro page.

    Cloudwatch

    AWS service that allows logging and metrics collections on various cloud resources you have in AWS.

    For more information, see the AWS User Guide.

    Cloud Notification Mechanism (CNM)

    An interface mechanism to support cloud-based ingest messaging. For more information, see PO.DAAC's CNM Schema.

    Common Metadata Repository (CMR)

    "A high-performance, high-quality, continuously evolving metadata system that catalogs Earth Science data and associated service metadata records". For more information, see NASA's CMR page.

    Collection (Cumulus)

    Cumulus Collections are logical sets of data objects of the same data type and version.

    For more information, see Collections - Data Management Types.

    Cumulus Message Adapter (CMA)

    A library designed to help task developers integrate step function tasks into a Cumulus workflow by adapting task input/output into the Cumulus Message format.

    For more information, see CMA workflow reference page.

    Distributed Active Archive Center (DAAC)

    Refers to a specific organization that's part of NASA's distributed system of archive centers. For more information see EOSDIS's DAAC page.

    Dead Letter Queue (DLQ)

    This refers to Amazon SQS Dead-Letter Queues - these SQS queues are specifically configured to capture failed messages from other services/SQS queues/etc to allow for processing of failed messages.

    For more on DLQs, see the Amazon Documentation and the Cumulus DLQ feature page.

    Developer

Those who set up deployment and workflow management for Cumulus. Sometimes referred to as an integrator. See integrator.

    ECS

    Amazon's Elastic Container Service. Used in Cumulus by workflow steps that require more flexibility than Lambda can provide.

    For more information, see AWS's developer guide.

    ECS Activity

    An ECS instance run via a Step Function.

    Execution (Cumulus)

    A Cumulus execution refers to a single execution of a (Cumulus) Workflow.

    GIBS

    Global Imagery Browse Services

    Granule

    A granule is the smallest aggregation of data that can be independently managed (described, inventoried, and retrieved). Granules are always associated with a collection, which is a grouping of granules. A granule is a grouping of data files.

    IAM

    AWS Identity and Access Management.

    For more information, see AWS IAMs.

    Integrator/Developer

    Those who work within Cumulus and AWS for deployments and to manage workflows.

    Kinesis

    Amazon's platform for streaming data on AWS.

    See AWS Kinesis for more information.

    Lambda

    AWS's cloud service that lets you run code without provisioning or managing servers.

    For more information, see AWS's lambda page.

    Module (Terraform)

    Refers to a terraform module.

    Node

    See node.js.

    Node Package Manager (npm)

    Node package manager. Often referred to as npm.

    For more information, see npm.

    Operator

    Those who work within Cumulus to ingest/archive data and manage collections.

    PDR

    "Polling Delivery Mechanism" used in "DAAC Ingest" workflows.

    For more information, see nasa.gov.

    Packages (npm)

Npm-hosted node.js packages. Cumulus packages can be found on npm's site here.

    Provider

    Data source that generates and/or distributes data for Cumulus workflows to act upon.

    For more information, see the Cumulus documentation.

    Rule

    Rules are configurable scheduled events that trigger workflows based on various criteria.

    For more information, see the Cumulus Rules documentation.

    S3

    Amazon's Simple Storage Service provides data object storage in the cloud. Used in Cumulus to store configuration, data, and more.

    For more information, see AWS's S3 page.

    SIPS

    Science Investigator-led Processing Systems. In the context of DAAC ingest, this refers to data producers/providers.

    For more information, see nasa.gov.

    SNS

    Amazon's Simple Notification Service provides a messaging service that allows publication of and subscription to events. Used in Cumulus to trigger workflow events, track event failures, and others.

    For more information, see AWS's SNS page.

    SQS

    Amazon's Simple Queue Service.

    For more information, see AWS's SQS page.

    Stack

    A collection of AWS resources you can manage as a single unit.

    In the context of Cumulus, this refers to a deployment of the cumulus and data-persistence modules that is managed by Terraform.

    Step Function

    AWS's web service that allows you to compose complex workflows as a state machine comprised of tasks (Lambdas, activities hosted on EC2/ECS, some AWS service APIs, etc). See AWS's Step Function Documentation for more information. In the context of Cumulus these are the underlying AWS service used to create Workflows.

    Terraform

    Terraform is the tool that you will use for deployment and configuration of your Cumulus environment.

    Workflows

    Workflows are comprised of one or more AWS Lambda Functions and ECS Activities to discover, ingest, process, manage and archive data.

    - + \ No newline at end of file diff --git a/docs/next/index.html b/docs/next/index.html index 1910fc76afd..2b9b23d9f58 100644 --- a/docs/next/index.html +++ b/docs/next/index.html @@ -5,13 +5,13 @@ Introduction | Cumulus Documentation - +
    Version: Next

    Introduction

    The Cumulus project addresses the need for a “native” cloud-based data ingest, archive, distribution, and management system that can be used for all future Earth Observing System Data and Information System (EOSDIS) data streams. The term “native” implies that the system will leverage all components of a cloud infrastructure provided by the vendor for efficiency (in terms of both processing time and cost). Additionally, Cumulus will operate on future data streams involving satellite missions, aircraft missions, and field campaigns.

    This documentation includes guidelines, examples, and source code docs. It is accessible at https://nasa.github.io/cumulus.


    Get To Know Cumulus

    • Getting Started - here - If you are new to Cumulus we suggest that you begin with this section to help you understand and work in the environment.
    • General Cumulus Documentation - here <- you're here

    Cumulus Reference Docs

    • Cumulus API Documentation - here
    • Cumulus Developer Documentation - here - READMEs throughout the main repository.
    • Data Cookbooks - here

    Auxiliary Guides

    • Integrator Guide - here
    • Operator Docs - here

    Contributing

    Please refer to: https://github.com/nasa/cumulus/blob/master/CONTRIBUTING.md for information. We thank you in advance.

    - + \ No newline at end of file diff --git a/docs/next/integrator-guide/about-int-guide/index.html b/docs/next/integrator-guide/about-int-guide/index.html index 46f79e85640..9d8e1524e4e 100644 --- a/docs/next/integrator-guide/about-int-guide/index.html +++ b/docs/next/integrator-guide/about-int-guide/index.html @@ -5,13 +5,13 @@ About Integrator Guide | Cumulus Documentation - +
    Version: Next

    About Integrator Guide

    Purpose

    The Integrator Guide supplements the Cumulus documentation and Data Cookbooks. This content is for Cumulus integrators who are either new to the project or need a step-by-step resource to help them along.

    What Is A Cumulus Integrator

    Cumulus integrators are those who work within Cumulus and AWS for deployments and to manage workflows. They may perform the following functions:

    • Configure and deploy Cumulus to the AWS environment
    • Configure Cumulus workflows
    • Write custom workflow tasks
    - + \ No newline at end of file diff --git a/docs/next/integrator-guide/int-common-use-cases/index.html b/docs/next/integrator-guide/int-common-use-cases/index.html index 5fd4b54503a..daff9170ad8 100644 --- a/docs/next/integrator-guide/int-common-use-cases/index.html +++ b/docs/next/integrator-guide/int-common-use-cases/index.html @@ -5,13 +5,13 @@ Integrator Common Use Cases | Cumulus Documentation - +
    - + \ No newline at end of file diff --git a/docs/next/integrator-guide/workflow-add-new-lambda/index.html b/docs/next/integrator-guide/workflow-add-new-lambda/index.html index c7ecbf919c4..bd6132d6028 100644 --- a/docs/next/integrator-guide/workflow-add-new-lambda/index.html +++ b/docs/next/integrator-guide/workflow-add-new-lambda/index.html @@ -5,13 +5,13 @@ Workflow - Add New Lambda | Cumulus Documentation - +
    Version: Next

    Workflow - Add New Lambda

    You can develop a workflow task in AWS Lambda or Elastic Container Service (ECS). AWS ECS requires Docker. For a list of tasks to use, go to our Cumulus Tasks page.

    The following steps will help you write a new Lambda that integrates with a Cumulus workflow and will aid your understanding of the Cumulus Message Adapter (CMA) process.

    Steps

    1. Define New Lambda in Terraform (a sketch is provided after these steps)

    2. Add Task in JSON Object

      For details on how to set up a workflow via the CMA, go to CMA Tasks: Message Flow.

      You will need to assign input and output for the new task and follow the CMA contract here. This contract defines how libraries should call the cumulus-message-adapter to integrate a task into an existing Cumulus Workflow.

    3. Verify New Task

      Check the updated workflow in AWS and in Cumulus.
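    As a rough sketch for step 1, a new Lambda can be defined in your Terraform configuration along the lines of the example below. The task name, file path, role variable, and runtime are assumptions that you would replace with values from your own deployment.

    resource "aws_lambda_function" "my_new_task" {
      function_name    = "${var.prefix}-MyNewTask"                          # hypothetical task name
      filename         = "my-new-task/dist/lambda.zip"                      # path to your packaged lambda source
      source_code_hash = filebase64sha256("my-new-task/dist/lambda.zip")
      handler          = "index.handler"
      runtime          = "nodejs16.x"
      role             = var.lambda_processing_role_arn                     # assumes a processing role is available to your module
      timeout          = 300
    }

    The new Lambda can then be referenced from the workflow definition in step 2 via its ARN, e.g. aws_lambda_function.my_new_task.arn.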

    - + \ No newline at end of file diff --git a/docs/next/integrator-guide/workflow-ts-failed-step/index.html b/docs/next/integrator-guide/workflow-ts-failed-step/index.html index bf7d3b82c9f..dca66a38b84 100644 --- a/docs/next/integrator-guide/workflow-ts-failed-step/index.html +++ b/docs/next/integrator-guide/workflow-ts-failed-step/index.html @@ -5,13 +5,13 @@ Workflow - Troubleshoot Failed Step(s) | Cumulus Documentation - +
    Version: Next

    Workflow - Troubleshoot Failed Step(s)

    Steps

    1. Locate Step
    • Go to Cumulus dashboard
    • Find the granule
    • Go to Executions to determine the failed step
    2. Investigate in CloudWatch
    • Go to CloudWatch
    • Locate lambda
    • Search CloudWatch logs
    3. Recreate Error

      In your sandbox environment, try to recreate the error.

    4. Resolution

    - + \ No newline at end of file diff --git a/docs/next/interfaces/index.html b/docs/next/interfaces/index.html index f4cc5355b62..7f979dc0a8c 100644 --- a/docs/next/interfaces/index.html +++ b/docs/next/interfaces/index.html @@ -5,13 +5,13 @@ Interfaces | Cumulus Documentation - +
    Version: Next

    Interfaces

    Cumulus has multiple interfaces that allow interaction with discrete components of the system, such as starting workflows via SNS/Kinesis/SQS, manually queueing workflow start messages, submitting SNS notifications for completed workflows, and the many operations allowed by the Cumulus API.

    The diagram below illustrates the workflow process in detail and the various interfaces that allow starting of workflows, reporting of workflow information, and database create operations that occur when a workflow reporting message is processed. For interfaces with expected input or output schemas, details are provided below.

    Architecture diagram showing the interfaces for triggering and reporting of Cumulus workflow executions

    Workflow triggers and queuing

    Kinesis stream

    As a Kinesis stream is consumed by the messageConsumer Lambda to queue workflow executions, the incoming event is validated against this consumer schema by the ajv package.

    SQS queue for executions

    The messages put into the SQS queue for executions should conform to the Cumulus message format.

    Workflow executions

    See the documentation on Cumulus workflows.

    Workflow reporting

    SNS reporting topics

    For granule and PDR reporting, the topics will only receive data if the Cumulus workflow execution message meets the following criteria:

    • Granules - workflow message contains granule data in payload.granules
    • PDRs - workflow message contains PDR data in payload.pdr

    The messages published to the SNS reporting topics for executions and PDRs and the record property in the messages published to the granules SNS topic should conform to the model schema for each data type.

    Further detail on workflow reporting and how to interact with these interfaces can be found in the workflow notifications data cookbook.

    Cumulus API

    See the Cumulus API documentation.

    - + \ No newline at end of file diff --git a/docs/next/operator-docs/about-operator-docs/index.html b/docs/next/operator-docs/about-operator-docs/index.html index e7b19089188..b3ae0858c29 100644 --- a/docs/next/operator-docs/about-operator-docs/index.html +++ b/docs/next/operator-docs/about-operator-docs/index.html @@ -5,13 +5,13 @@ About Operator Docs | Cumulus Documentation - +
    Version: Next

    About Operator Docs

    Purpose

    Operator Docs are an augmentation to Cumulus documentation and Data Cookbooks. These documents will walk step-by-step through common Cumulus activities (that aren't necessarily as use-case directed as what you'd see in Data Cookbooks).

    What Is A Cumulus Operator

    Cumulus operators are those who work within Cumulus to ingest/archive data and manage collections. They may perform the following functions via the operator dashboard or API:

    • Configure providers and collections
    • Configure rules and monitor workflow executions
    • Monitor granule ingestion
    • Monitor system metrics
    - + \ No newline at end of file diff --git a/docs/next/operator-docs/bulk-operations/index.html b/docs/next/operator-docs/bulk-operations/index.html index 777b6282c67..6ab90ba9c92 100644 --- a/docs/next/operator-docs/bulk-operations/index.html +++ b/docs/next/operator-docs/bulk-operations/index.html @@ -5,14 +5,14 @@ Bulk Operations | Cumulus Documentation - +
    Version: Next

    Bulk Operations

    Cumulus implements bulk operations through the use of AsyncOperations, which are long-running processes executed on an AWS ECS cluster.

    Submitting a bulk API request

    Bulk operations are generally submitted via the endpoint for the relevant data type, e.g. granules. For a list of supported API requests, refer to the Cumulus API documentation. Bulk operations are denoted with the keyword 'bulk'.
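    As an illustration, a bulk operation applying a workflow to specific granules might be submitted with a request along the lines of the sketch below. The granule ID and workflow name are placeholders, and the exact payload fields accepted by the bulk endpoints (for example, how granule identifiers are expressed) depend on your Cumulus API version, so check the API documentation for your release.

    $ curl --request POST https://example.com/granules/bulk \
    --header 'Authorization: Bearer ReplaceWithTheToken' \
    --header 'Content-Type: application/json' \
    --data '{
      "workflowName": "HelloWorldWorkflow",
      "ids": ["MOD09GQ.A2017025.h21v00.006.2017034065104"]
    }'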

    Starting bulk operations from the Cumulus dashboard

    Using a Kibana query

    caution

    You must have configured your dashboard build with a KIBANAROOT environment variable in order for the Kibana link to render in the bulk granules modal.

    1. From the Granules dashboard page, click on the "Run Bulk Granules" button, then select what type of action you would like to perform.
    note

    The rest of the process is the same regardless of what type of bulk action you perform.

    2. From the bulk granules modal, click the "Open Kibana" link:

      Screenshot of Cumulus dashboard showing modal window for triggering bulk granule operations

    3. Once you have accessed Kibana, navigate to the "Discover" page. If this is your first time using Kibana, you may see a message like this at the top of the page:

      In order to visualize and explore data in Kibana, you'll need to create an index pattern to retrieve data from Elasticsearch.

      In that case, see the docs for creating an index pattern for Kibana.

      Screenshot of Kibana user interface showing the "Discover" page for running queries

    4. Enter a query that returns the granule records that you want to use for bulk operations:

      Screenshot of Kibana user interface showing an example Kibana query and results

    5. Once the Kibana query is returning the results you want, click the "Inspect" link near the top of the page. A slide out tab with request details will appear on the right side of the page:

      Screenshot of Kibana user interface showing details of an example request

    6. In the slide out tab that appears on the right side of the page, click the "Request" link near the top and scroll down until you see the query property:

      Screenshot of Kibana user interface showing the Elasticsearch data request made for a given Kibana query

    7. Highlight and copy the query contents from Kibana. Go back to the Cumulus dashboard and paste the query contents inside of the query property in the bulk granules request payload. You should end up with a query property nested inside of the existing query property:

      Screenshot of Cumulus dashboard showing modal window for triggering bulk granule operations with query information populated

    8. Add values for the index and workflowName to the bulk granules request payload. The value for index will vary based on your Elasticsearch setup, but it is good to target an index specifically for granule data if possible (a complete example payload is shown after these steps):

      Screenshot of Cumulus dashboard showing modal window for triggering bulk granule operations with query, index, and workflow information populated

    9. Click the "Run Bulk Operations" button. You should see a confirmation message, including an ID for the async operation that was started to handle your bulk action. You can track the status of this async operation on the Operations dashboard page, which can be visited by clicking the "Go To Operations" button:

      Screenshot of Cumulus dashboard showing confirmation message with async operation ID for bulk granules request
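    Putting steps 7 and 8 together, a completed bulk granules request payload might look roughly like the sketch below. The workflow name, index name, and granule ID are illustrative assumptions; the inner query object is whatever you copied from Kibana.

    {
      "workflowName": "HelloWorldWorkflow",
      "index": "my-granule-index",
      "query": {
        "query": {
          "match": {
            "granuleId": "MOD09GQ.A2017025.h21v00.006.2017034065104"
          }
        },
        "size": 10000
      }
    }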

    Creating an index pattern for Kibana

    1. Define the index pattern for the indices that your Kibana queries should use. A wildcard character, *, will match across multiple indices. Once you are satisfied with your index pattern, click the "Next step" button:

      Screenshot of Kibana user interface for defining an index pattern

    2. Choose whether to use a Time Filter for your data, which is not required. Then click the "Create index pattern" button:

      Screenshot of Kibana user interface for configuring the settings of an index pattern

    Status Tracking

    All bulk operations return an AsyncOperationId which can be submitted to the /asyncOperations endpoint.

    The /asyncOperations endpoint allows listing of AsyncOperation records as well as record retrieval for individual records, which will contain the status. The Cumulus API documentation shows sample requests for these actions.
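    For example, the status of a single bulk operation could be retrieved with a request along these lines (the URL and operation ID are placeholders, following the conventions of the other API examples in this documentation):

    $ curl --request GET https://example.com/asyncOperations/0eb8e809-8790-4e29-b246-bcd9e8d28b8e \
    --header 'Authorization: Bearer ReplaceWithTheToken'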

    The Cumulus Dashboard also includes an Operations monitoring page, where operations and their status are visible:

    Screenshot of Cumulus Dashboard Operations Page showing 5 operations and their status, ID, description, type and creation timestamp

    - + \ No newline at end of file diff --git a/docs/next/operator-docs/cmr-operations/index.html b/docs/next/operator-docs/cmr-operations/index.html index be56de8169b..d3da729deca 100644 --- a/docs/next/operator-docs/cmr-operations/index.html +++ b/docs/next/operator-docs/cmr-operations/index.html @@ -5,7 +5,7 @@ CMR Operations | Cumulus Documentation - + @@ -16,7 +16,7 @@ UpdateCmrAccessConstraints will update CMR metadata file contents on S3, and PostToCmr will push the updates to CMR. The rest of this section will assume you have created this workflow under the name UpdateCmrAccessConstraints.

    Once created and deployed, the workflow is available in the Cumulus dashboard's Execute workflow selector. However, note that additional configuration is required for this request, to supply an access constraint integer value and optional description to the UpdateCmrAccessConstraints workflow, by clicking the Add Custom Workflow Meta option in the Execute popup, as shown below:

    Screenshot showing granule execute popup with 'updateCmrAccessConstraints' selected and configuration values shown in a collapsible JSON field

    An example invocation of the API to perform this action is:

    $ curl --request PUT https://example.com/granules/MOD11A1.A2017137.h19v16.006.2017138085750 \
    --header 'Authorization: Bearer ReplaceWithTheToken' \
    --header 'Content-Type: application/json' \
    --data '{
    "action": "applyWorkflow",
    "workflow": "updateCmrAccessConstraints",
    "meta": {
    "accessConstraints": {
    "value": 5,
    "description": "sample access constraint"
    }
    }
    }'

    Supported CMR metadata formats for the above operation are Echo10XML and UMMG-JSON, which will populate the RestrictionFlag and RestrictionComment fields in Echo10XML, or the AccessConstraints values in UMMG-JSON.

    Additional Operations

    At this time Cumulus does not, out of the box, support additional operations on CMR metadata. However, given the examples shown above, we recommend working with your integrators to develop additional workflows that perform any required operations.

    Bulk CMR operations

    In order to perform the above operations in bulk, Cumulus supports the use of ApplyWorkflow in an AsyncOperation. These are accessed via the Bulk Operation button on the dashboard, or the /granules/bulk endpoint on the Cumulus API.

    More information on bulk operations is in the bulk operations operator doc.

    - + \ No newline at end of file diff --git a/docs/next/operator-docs/create-rule-in-cumulus/index.html b/docs/next/operator-docs/create-rule-in-cumulus/index.html index 9d99a258890..55d6215a400 100644 --- a/docs/next/operator-docs/create-rule-in-cumulus/index.html +++ b/docs/next/operator-docs/create-rule-in-cumulus/index.html @@ -5,13 +5,13 @@ Create Rule In Cumulus | Cumulus Documentation - +
    Version: Next

    Create Rule In Cumulus

    Once the above files are in place and the entries created in CMR and Cumulus, we are ready to begin ingesting data. Depending on the type of ingestion (FTP/Kinesis, etc) the values below will change, but for the most part they are all similar. Rules tell Cumulus how to associate providers and collections, and when/how to start processing a workflow.

    Steps

    1. Go To Rules Page
    • Go to the Cumulus dashboard, click on Rules in the navigation.
    • Click Add Rule.

    Screenshot of Rules page

    2. Complete Form
    • Fill out the template form.

    Screenshot of a Rules template for adding a new rule

    For more details regarding the field definitions and required information go to Data Cookbooks.

    state field conditional

    If the state field is left blank, it defaults to false.

    Examples

    • A rule form with completed required fields:

    Screenshot of a completed rule form

    • A successfully added Rule:

    Screenshot of created rule

    - + \ No newline at end of file diff --git a/docs/next/operator-docs/discovery-filtering/index.html b/docs/next/operator-docs/discovery-filtering/index.html index 72287074db3..75211d9a877 100644 --- a/docs/next/operator-docs/discovery-filtering/index.html +++ b/docs/next/operator-docs/discovery-filtering/index.html @@ -5,7 +5,7 @@ Discovery Filtering | Cumulus Documentation - + @@ -24,7 +24,7 @@ directly list the provider_path. If the path contains regular expression components, this may fail.

    It is recommended that operators diagnose any failures by checking error logs and ensuring that permissions on the remote file system allow reading of the default directory and any subdirectories that match the filter.
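    As a purely hypothetical example, a rule (or collection) meta block that uses a provider_path containing regular expression components might look like the sketch below; the rule name and path are illustrative only.

    {
      "name": "discover_filtered_data_rule",
      "meta": {
        "provider_path": "data/(19|20)[0-9]{2}/granules/"
      }
    }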

    Supported protocols

    Currently support for this feature is limited to the following protocols:

    • ftp
    • sftp
    - + \ No newline at end of file diff --git a/docs/next/operator-docs/granule-workflows/index.html b/docs/next/operator-docs/granule-workflows/index.html index f5741e34645..4d6c5a6c53b 100644 --- a/docs/next/operator-docs/granule-workflows/index.html +++ b/docs/next/operator-docs/granule-workflows/index.html @@ -5,13 +5,13 @@ Granule Workflows | Cumulus Documentation - +
    Version: Next

    Granule Workflows

    Failed Granule

    Delete and Ingest

    1. Delete Granule
    note

    Granules published to CMR will need to be removed from CMR via the dashboard prior to deletion.

    2. Ingest Granule via Ingest Rule
    • Re-triggering a one-time, Kinesis, SQS, SNS, or scheduled rule will re-discover and reingest the deleted granule.

    Reingest

    1. Select Failed Granule
    • In the Cumulus dashboard, go to the Collections page.
    • Use the search field to find the granule.
    2. Re-ingest Granule
    • Go to the Collections page.
    • Click on Reingest and a modal will pop up for your confirmation.

    Screenshot of the Reingest modal workflow

    Delete and Ingest

    1. Bulk Delete Granules
    • Go to the Granules page.
    • Use the Bulk Delete button to bulk delete selected granules, or select granules via a Kibana query.
    tip

    You can optionally force deletion from CMR.

    2. Ingest Granules via Ingest Rule
    • Re-triggering one-time, Kinesis, SQS, SNS, or scheduled rules will re-discover and reingest the deleted granules.

    Multiple Failed Granules

    1. Select Failed Granules
    • In the Cumulus dashboard, go to the Collections page.
    • Click on Failed Granules.
    • Select multiple granules.

    Screenshot of selected multiple granules

    2. Bulk Re-ingest Granules
    • Click on Reingest and a modal will pop up for your confirmation.

    Screenshot of Bulk Reingest modal workflow

    - + \ No newline at end of file diff --git a/docs/next/operator-docs/kinesis-stream-for-ingest/index.html b/docs/next/operator-docs/kinesis-stream-for-ingest/index.html index 4795f4caa9b..78f9497ade1 100644 --- a/docs/next/operator-docs/kinesis-stream-for-ingest/index.html +++ b/docs/next/operator-docs/kinesis-stream-for-ingest/index.html @@ -5,13 +5,13 @@ Setup Kinesis Stream & CNM Message | Cumulus Documentation - +
    Version: Next

    Setup Kinesis Stream & CNM Message

    tip

    Keep in mind that you should only have to set this up once per ingest stream. Kinesis pricing is based on the shard value and not on the amount of Kinesis usage.

    1. Create a Kinesis Stream

      • In your AWS console, go to the Kinesis service and click Create Data Stream.
      • Assign a name to the stream.
      • Apply a shard value of 1.
      • Click on Create Kinesis Stream.
      • A status page with stream details will display. Once the status is active, the stream is ready to use. Be sure to record the streamName and StreamARN for later use.

      Screenshot of AWS console page for creating a Kinesis stream

    2. Create a Rule

    3. Send a message

      • Send a message that matches your schema, using Python or the command line, as in the sketch below.
      • The streamName and Collection must match the kinesisArn+collection defined in the rule that you have created in Step 2.
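      A minimal sketch of publishing a CNM-style message with the AWS CLI is shown below. The stream name, collection, provider, and file details are placeholders, your CNM schema may require additional fields, and the --cli-binary-format flag applies to AWS CLI v2 (omit it for v1).

      aws kinesis put-record \
        --stream-name <your-stream-name> \
        --partition-key 1 \
        --cli-binary-format raw-in-base64-out \
        --data '{
          "collection": "MY_COLLECTION",
          "identifier": "GRANULE.A2017025",
          "provider": "MY_PROVIDER",
          "product": {
            "name": "GRANULE.A2017025",
            "files": [
              {
                "name": "GRANULE.A2017025.hdf",
                "type": "data",
                "uri": "s3://my-discovery-bucket/test-data/GRANULE.A2017025.hdf"
              }
            ]
          }
        }'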
    - + \ No newline at end of file diff --git a/docs/next/operator-docs/locating-access-logs/index.html b/docs/next/operator-docs/locating-access-logs/index.html index 1c2f60d5b8a..450120d48aa 100644 --- a/docs/next/operator-docs/locating-access-logs/index.html +++ b/docs/next/operator-docs/locating-access-logs/index.html @@ -5,13 +5,13 @@ Locating S3 Access Logs | Cumulus Documentation - +
    Version: Next

    Locating S3 Access Logs

    When enabling S3 Access Logs for EMS Reporting, you configured a TargetBucket and TargetPrefix. You will find the raw S3 access logs in the TargetBucket under the TargetPrefix.

    In a standard deployment, this will be your stack's <internal bucket name> and a key prefix of <stack>/ems-distribution/s3-server-access-logs/
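    For example, the raw logs could be listed with the AWS CLI using a command along these lines, where the bucket and stack names are placeholders for your own values:

    aws s3 ls s3://<internal bucket name>/<stack>/ems-distribution/s3-server-access-logs/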

    - + \ No newline at end of file diff --git a/docs/next/operator-docs/naming-executions/index.html b/docs/next/operator-docs/naming-executions/index.html index e00754c27fd..bbbbac279a9 100644 --- a/docs/next/operator-docs/naming-executions/index.html +++ b/docs/next/operator-docs/naming-executions/index.html @@ -5,7 +5,7 @@ Naming Executions | Cumulus Documentation - + @@ -21,7 +21,7 @@ QueuePdrs step.

    In the following excerpt, the QueueGranules config.executionNamePrefix property is set using the value configured in the workflow's meta.executionNamePrefix.

    info

    This meta.executionNamePrefix property should not be confused with the optional rule executionNamePrefix property from the previous section. Setting executionNamePrefix as a root property of the rule will set a prefix for the names of any workflows triggered by the rule. Setting meta.executionNamePrefix on the rule will set meta.executionNamePrefix in the workflow messages generated for this rule, allowing workflow steps like QueueGranules to read from the message meta.executionNamePrefix for their config. Then, workflows scheduled by QueueGranules would use the configured execution name prefix.

    Setting executionNamePrefix config for QueueGranules using rule.meta

    If you wanted to use a prefix of "my-prefix", you would create a rule with a meta property similar to the following Rule snippet:

    {
    ...other rule keys here...
    "meta":
    {
    "executionNamePrefix": "my-prefix"
    }
    }

    The value of meta.executionNamePrefix from the rule will be set as meta.executionNamePrefix in the workflow message.

    Then, the workflow could contain a "QueueGranules" step with the following state, which uses meta.executionNamePrefix from the message as the value for the executionNamePrefix config to the "QueueGranules" step:

    {
    "QueueGranules": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "queueUrl": "${start_sf_queue_url}",
    "provider": "{$.meta.provider}",
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "granuleIngestWorkflow": "${ingest_granule_workflow_name}",
    "executionNamePrefix": "{$.meta.executionNamePrefix}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${queue_granules_task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "End": true
    },
    }
    - + \ No newline at end of file diff --git a/docs/next/operator-docs/ops-common-use-cases/index.html b/docs/next/operator-docs/ops-common-use-cases/index.html index b4e8bfb2879..b4a954a3dc1 100644 --- a/docs/next/operator-docs/ops-common-use-cases/index.html +++ b/docs/next/operator-docs/ops-common-use-cases/index.html @@ -5,13 +5,13 @@ Operator Common Use Cases | Cumulus Documentation - +
    - + \ No newline at end of file diff --git a/docs/next/operator-docs/trigger-workflow/index.html b/docs/next/operator-docs/trigger-workflow/index.html index 4f68c224135..e039a2145ed 100644 --- a/docs/next/operator-docs/trigger-workflow/index.html +++ b/docs/next/operator-docs/trigger-workflow/index.html @@ -5,13 +5,13 @@ Trigger a Workflow Execution | Cumulus Documentation - +
    Version: Next

    Trigger a Workflow Execution

    To trigger a workflow, you need to create a rule. To trigger an ingest workflow, one that requires discovering and ingesting data, you will also need to configure the collection and provider and associate those to a rule.

    Trigger a HelloWorld Workflow

    To trigger a HelloWorld workflow that does not need to discover or archive data, you just need to create a rule.

    You can leave the provider and collection blank and do not need any additional metadata. If you create a onetime rule, the workflow execution will start momentarily and you can view its status on the Executions page unless it was created with a DISABLED state.

    Trigger an Ingest Workflow

    To ingest data, you will need a provider and collection configured to tell your workflow where to discover data and where to archive the data respectively.

    Follow the instructions to create a provider and create a collection and configure their fields for your data ingest.

    In the rule's additional metadata, you can specify a provider_path that tells the workflow where to look for the data on the provider.

    Example: Ingest data from S3

    Setup

    Assume there are 2 files to be ingested in an S3 bucket called discovery-bucket, located in the test-data folder:

    • GRANULE.A2017025.jpg
    • GRANULE.A2017025.hdf

    Archive buckets should already be created and mapped to public / private / protected in the Cumulus deployment.

    For example:

    buckets = {
    private = {
    name = "discovery-bucket"
    type = "private"
    },
    protected = {
    name = "archive-protected"
    type = "protected"
    }
    public = {
    name = "archive-public"
    type = "public"
    }
    }

    Create a provider

    Create a new provider. Set protocol to S3 and Host to discovery-bucket.

    Screenshot of adding a sample S3 provider
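    For reference, the resulting provider record (whether created through the dashboard form or the providers API) might look roughly like the sketch below; the id is an assumption.

    {
      "id": "s3_provider",
      "protocol": "s3",
      "host": "discovery-bucket"
    }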

    Create a collection

    Create a new collection. Configure the collection to extract the granule id from the filenames and configure where to store the granule files.

    The configuration below will store hdf files in the protected bucket and jpg files in the public bucket. The bucket types map to the bucket names defined in the buckets configuration of the Cumulus deployment, as shown in the example above.

    {
    "name": "test-collection",
    "version": "001",
    "granuleId": "^GRANULE\\.A[\\d]{7}$",
    "granuleIdExtraction": "(GRANULE\\..*)(\\.hdf|\\.jpg)",
    "reportToEms": false,
    "sampleFileName": "GRANULE.A2017025.hdf",
    "files": [
    {
    "bucket": "protected",
    "regex": "^GRANULE\\.A[\\d]{7}\\.hdf$",
    "sampleFileName": "GRANULE.A2017025.hdf"
    },
    {
    "bucket": "public",
    "regex": "^GRANULE\\.A[\\d]{7}\\.jpg$",
    "sampleFileName": "GRANULE.A2017025.jpg"
    }
    ]
    }

    Create a rule

    Create a rule to trigger the workflow to discover your granule data and ingest your granule.

    Select the previously created provider and collection. See the Cumulus Discover Granules workflow for a workflow example of using Cumulus tasks to discover and queue data for ingest.

    In the rule meta, set the provider_path to test-data, so the test-data folder will be used to discover new granules.

    Screenshot of adding a Discover Granules rule
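    For reference, a onetime rule for this example might look roughly like the sketch below. The rule name and workflow name are assumptions; the provider, collection, and provider_path values come from the steps above.

    {
      "name": "ingest_test_collection_rule",
      "workflow": "DiscoverGranules",
      "provider": "s3_provider",
      "collection": {
        "name": "test-collection",
        "version": "001"
      },
      "rule": {
        "type": "onetime"
      },
      "state": "ENABLED",
      "meta": {
        "provider_path": "test-data"
      }
    }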

    A onetime rule will run your workflow on-demand and you can view it on the dashboard Executions page unless it has a DISABLED state. In order to run a workflow with a onetime DISABLED rule, please change the rule state to ENABLED and re-run. The Cumulus Discover Granules workflow will trigger an ingest workflow and your ingested granules will be visible on the dashboard Granules page.

    - + \ No newline at end of file diff --git a/docs/next/tasks/index.html b/docs/next/tasks/index.html index b32824306a6..6d3d7339a77 100644 --- a/docs/next/tasks/index.html +++ b/docs/next/tasks/index.html @@ -5,13 +5,13 @@ Cumulus Tasks | Cumulus Documentation - +
    Version: Next

    Cumulus Tasks

    A list of reusable Cumulus tasks. Add your own.

    Tasks

    @cumulus/add-missing-file-checksums

    Add checksums to files in S3 which don't have one


    @cumulus/discover-granules

    Discover Granules in FTP/HTTP/HTTPS/SFTP/S3 endpoints


    @cumulus/discover-pdrs

    Discover PDRs in FTP and HTTP endpoints


    @cumulus/files-to-granules

    Converts array-of-files input into a granules object by extracting granuleId from filename


    @cumulus/hello-world

    Example task


    @cumulus/hyrax-metadata-updates

    Update granule metadata with hooks to OPeNDAP URL


    @cumulus/lzards-backup

    Run LZARDS backup


    @cumulus/move-granules

    Move granule files from staging to final location


    @cumulus/orca-copy-to-archive-adapter

    Adapter to invoke orca copy-to-archive lambda


    @cumulus/parse-pdr

    Download and Parse a given PDR


    @cumulus/pdr-status-check

    Checks execution status of granules in a PDR


    @cumulus/post-to-cmr

    Post a given granule to CMR


    @cumulus/queue-granules

    Add discovered granules to the queue


    @cumulus/queue-pdrs

    Add discovered PDRs to a queue


    @cumulus/queue-workflow

    Add workflow to the queue


    @cumulus/sf-sqs-report

    Sends an incoming Cumulus message to SQS


    @cumulus/sync-granule

    Download a given granule


    @cumulus/test-processing

    Fake processing task used for integration tests


    @cumulus/update-cmr-access-constraints

    Updates CMR metadata to set access constraints


    @cumulus/update-granules-cmr-metadata-file-links

    Update CMR metadata files with correct online access urls and etags and transfer etag info to granules' CMR files

    - + \ No newline at end of file diff --git a/docs/next/team/index.html b/docs/next/team/index.html index e8d6a836aee..7d67e14d7ed 100644 --- a/docs/next/team/index.html +++ b/docs/next/team/index.html @@ -5,13 +5,13 @@ Cumulus Team | Cumulus Documentation - +
    Version: Next

    Cumulus Team

    Cumulus Core Team

    Cumulus Emeritus Team

    - + \ No newline at end of file diff --git a/docs/next/troubleshooting/index.html b/docs/next/troubleshooting/index.html index 5b49cb44e4a..f6919b6b626 100644 --- a/docs/next/troubleshooting/index.html +++ b/docs/next/troubleshooting/index.html @@ -5,14 +5,14 @@ How to Troubleshoot and Fix Issues | Cumulus Documentation - +
    Version: Next

    How to Troubleshoot and Fix Issues

    While Cumulus is a complex system, there is a focus on maintaining the integrity and availability of the system and data. Should you encounter errors or issues while using this system, this section will help troubleshoot and solve those issues.

    Backup and Restore

    Cumulus has backup and restore functionality built-in to protect Cumulus data and allow recovery of a Cumulus stack. This is currently limited to Cumulus data and not full S3 archive data. Backup and restore is not enabled by default and must be enabled and configured to take advantage of this feature.

    For more information, read the Backup and Restore documentation.

    Elasticsearch reindexing

    If you run into issues with your Elasticsearch index, a reindex operation is available via the Cumulus API. See the Reindexing Guide.

    Information on how to reindex Elasticsearch is in the Cumulus API documentation.

    Troubleshooting Workflows

    Workflows are state machines comprised of tasks and services and each component logs to CloudWatch. The CloudWatch logs for all steps in the execution are displayed in the Cumulus dashboard or you can find them by going to CloudWatch and navigating to the logs for that particular task.

    Workflow Errors

    Visual representations of executed workflows can be found in the Cumulus dashboard or the AWS Step Functions console for that particular execution.

    If a workflow errors, the error will be handled according to the error handling configuration. The task that fails will have the exception field populated in the output, giving information about the error. Further information can be found in the CloudWatch logs for the task.

    Graph of AWS Step Function execution showing a failing workflow

    Workflow Did Not Start

    Generally, first check your rule configuration. If that is satisfactory, the answer will likely be in the CloudWatch logs for the schedule SF or SF starter lambda functions. See the workflow triggers page for more information on how workflows start.

    For Kinesis and SNS rules specifically, if an error occurs during the message consumer process, the fallback consumer lambda will be called and if the message continues to error, a message will be placed on the dead letter queue. Check the dead letter queue for a failure message. Errors can be traced back to the CloudWatch logs for the message consumer and the fallback consumer. Additionally, check that the name and version match those configured in your rule, as rules are filtered by the notification's collection name and version before scheduling executions.

    More information on kinesis error handling is here.

    Operator API Errors

    All operator API calls are funneled through the ApiEndpoints lambda. Each API call is logged to the ApiEndpoints CloudWatch log for your deployment.

    Lambda Errors

    KMS Exception: AccessDeniedException

    KMS Exception: AccessDeniedExceptionKMS Message: The ciphertext refers to a customer master key that does not exist, does not exist in this region, or you are not allowed to access.

    The above error was thrown by a Cumulus Lambda function invocation. The KMS key is the encryption key used to encrypt Lambda environment variables. The root cause of this error is unknown, but it is speculated to be caused by deleting and recreating, with the same name, the IAM role the Lambda uses.

    This error can be resolved by switching the lambda's execution role to a different one and then back through the Lambda management console. Unfortunately, this approach doesn't scale well.

    The other resolution (that scales but takes some time) that was found is as follows:

    1. Comment out all lambda definitions (and dependent resources) in your Terraform configuration.
    2. terraform apply to delete the lambdas.
    3. Un-comment the definitions.
    4. terraform apply to recreate the lambdas.

    If this problem occurs with Core lambdas and you are using the terraform-aws-cumulus.zip file source distributed in our release, we recommend using the non-scaling approach, as the number of lambdas we distribute is in the low teens and they are likely to be easier and faster to reconfigure one-by-one than by editing our configs.

    Error: Unable to import module 'index': Error

    This error is shown in the CloudWatch logs for a Lambda function.

    One possible cause is that the Lambda definition in the .tf file defining the lambda is not pointing to the correct packaged lambda source file. In order to resolve this issue, update the lambda definition to point directly to the packaged (e.g. .zip) lambda source file.

    resource "aws_lambda_function" "discover_granules_task" {
    function_name = "${var.prefix}-DiscoverGranules"
    filename = "${path.module}/../../tasks/discover-granules/dist/lambda.zip"
    handler = "index.handler"
    }

    If you are seeing this error when using the Lambda as a step in a Cumulus workflow, then inspect the output for this Lambda step in the AWS Step Function console. If you see the error Cannot find module 'node_modules/@cumulus/cumulus-message-adapter-js', then you need to ensure the lambda's packaged dependencies include cumulus-message-adapter-js.

    - + \ No newline at end of file diff --git a/docs/next/troubleshooting/reindex-elasticsearch/index.html b/docs/next/troubleshooting/reindex-elasticsearch/index.html index 7f6371b07c7..a92aa6c44f4 100644 --- a/docs/next/troubleshooting/reindex-elasticsearch/index.html +++ b/docs/next/troubleshooting/reindex-elasticsearch/index.html @@ -5,7 +5,7 @@ Reindexing Elasticsearch Guide | Cumulus Documentation - + @@ -14,7 +14,7 @@ current index, or the mappings for an index have been updated (they do not update automatically). Any reindexing that will be required when upgrading Cumulus will be in the Migration Steps section of the changelog.

    Switch to a new index and Reindex

    There are two operations needed: reindex and change-index to switch over to the new index. A Change Index/Reindex can be done in either order, but both have their trade-offs.

    If you decide to point Cumulus to a new (empty) index first (with a change index operation), and then Reindex the data to the new index, data ingested while reindexing will automatically be sent to the new index. As reindexing operations can take a while, not all the data will show up on the Cumulus Dashboard right away. The advantage is you do not have to turn off any ingest operations. This approach is recommended.

    If you decide to Reindex data to a new index first, and then point Cumulus to that new index, it is not guaranteed that data that is sent to the old index while reindexing will show up in the new index. If you prefer this way, it is recommended to turn off any ingest operations. This order will keep your dashboard data from seeing any interruption.

    Change Index

    This will point Cumulus to the index in Elasticsearch that will be used when retrieving data. Performing a change index operation to an index that does not exist yet will create the index for you. The change index operation can be found here.

    Reindex from the old index to the new index

    The reindex operation will take the data from one index and copy it into another index. Details on the reindex operation can be found here.

    Reindex status

    Reindexing is a long-running operation. The reindex-status endpoint can be used to monitor the progress of the operation.
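    As a sketch, these operations can be invoked and monitored via the Cumulus API with requests along the lines shown below. The index names are placeholders, and the exact request parameters for the reindex and change-index operations are described in the Cumulus API documentation.

    # Start a reindex from the current index to a new index
    $ curl --request POST https://example.com/elasticsearch/reindex \
    --header 'Authorization: Bearer ReplaceWithTheToken' \
    --header 'Content-Type: application/json' \
    --data '{ "sourceIndex": "cumulus-2020-11-3", "destIndex": "cumulus-2021-3-4" }'

    # Monitor the progress of the reindex
    $ curl --request GET https://example.com/elasticsearch/reindex-status \
    --header 'Authorization: Bearer ReplaceWithTheToken'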

    Index from database

    If you want to just grab the data straight from the database you can perform an Index from Database Operation. After the data is indexed from the database, a Change Index operation will need to be performed to ensure Cumulus is pointing to the right index. It is strongly recommended to turn off workflow rules when performing this operation so any data ingested to the database is not lost.

    Validate reindex

    To validate the reindex, use the reindex-status endpoint. The doc count can be used to verify that the reindex was successful. In the below example the reindex from cumulus-2020-11-3 to cumulus-2021-3-4 was not fully successful as they show different doc counts.

    "indices": {
    "cumulus-2020-11-3": {
    "primaries": {
    "docs": {
    "count": 21096512,
    "deleted": 176895
    }
    },
    "total": {
    "docs": {
    "count": 21096512,
    "deleted": 176895
    }
    }
    },
    "cumulus-2021-3-4": {
    "primaries": {
    "docs": {
    "count": 715949,
    "deleted": 140191
    }
    },
    "total": {
    "docs": {
    "count": 715949,
    "deleted": 140191
    }
    }
    }
    }

    To further drill down into what is missing, log in to the Kibana instance (found in the Elasticsearch section of the AWS console) and run the following command replacing <index> with your index name.

    GET <index>/_search
    {
    "aggs": {
    "count_by_type": {
    "terms": {
    "field": "_type"
    }
    }
    },
    "size": 0
    }

    which will produce a result like

    "aggregations": {
    "count_by_type": {
    "doc_count_error_upper_bound": 0,
    "sum_other_doc_count": 0,
    "buckets": [
    {
    "key": "logs",
    "doc_count": 483955
    },
    {
    "key": "execution",
    "doc_count": 4966
    },
    {
    "key": "deletedgranule",
    "doc_count": 4715
    },
    {
    "key": "pdr",
    "doc_count": 1822
    },
    {
    "key": "granule",
    "doc_count": 740
    },
    {
    "key": "asyncOperation",
    "doc_count": 616
    },
    {
    "key": "provider",
    "doc_count": 108
    },
    {
    "key": "collection",
    "doc_count": 87
    },
    {
    "key": "reconciliationReport",
    "doc_count": 48
    },
    {
    "key": "rule",
    "doc_count": 7
    }
    ]
    }
    }

    Resuming a reindex

    If a reindex operation did not fully complete it can be resumed using the following command run from the Kibana instance.

    POST _reindex?wait_for_completion=false
    {
    "conflicts": "proceed",
    "source": {
    "index": "cumulus-2020-11-3"
    },
    "dest": {
    "index": "cumulus-2021-3-4",
    "op_type": "create"
    }
    }

    The Cumulus API reindex-status endpoint can be used to monitor completion of this operation.

    - + \ No newline at end of file diff --git a/docs/next/troubleshooting/rerunning-workflow-executions/index.html b/docs/next/troubleshooting/rerunning-workflow-executions/index.html index 4beebd44f21..75784e94a25 100644 --- a/docs/next/troubleshooting/rerunning-workflow-executions/index.html +++ b/docs/next/troubleshooting/rerunning-workflow-executions/index.html @@ -5,13 +5,13 @@ Rerunning workflow executions | Cumulus Documentation - +
    Version: Next

    Rerunning workflow executions

    To rerun a Cumulus workflow execution from the AWS console:

    1. Visit the page for an individual workflow execution

    2. Click the "New execution" button at the top right of the screen

      Screenshot of the AWS console for a Step Function execution highlighting the "New execution" button at the top right of the screen

    3. In the "New execution" modal that appears, replace the cumulus_meta.execution_name value in the default input with the value of the new execution ID, as seen in the screenshot below and in the sketch after these steps

      Screenshot of the AWS console showing the modal window for entering input when running a new Step Function execution

    4. Click the "Start execution" button
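    For reference, the portion of the default input edited in step 3 looks roughly like the sketch below; only cumulus_meta.execution_name changes, and the values shown here are illustrative.

    {
      "cumulus_meta": {
        "execution_name": "7b2a4a24-6b0e-4bd5-9e4f-3f3093a2f5a2",
        "state_machine": "arn:aws:states:us-east-1:111111111111:stateMachine:example-IngestGranule"
      }
    }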

    - + \ No newline at end of file diff --git a/docs/next/troubleshooting/troubleshooting-deployment/index.html b/docs/next/troubleshooting/troubleshooting-deployment/index.html index dd12a75fd35..e9503f6d6aa 100644 --- a/docs/next/troubleshooting/troubleshooting-deployment/index.html +++ b/docs/next/troubleshooting/troubleshooting-deployment/index.html @@ -5,7 +5,7 @@ Troubleshooting Deployment | Cumulus Documentation - + @@ -16,7 +16,7 @@ data-persistence modules, but your config is only creating one Elasticsearch instance. To fix the issue, update the elasticsearch_config variable for your data-persistence module to increase the number of instances:

    {
    domain_name = "es"
    instance_count = 2
    instance_type = "t2.small.elasticsearch"
    version = "5.3"
    volume_size = 10
    }

    Install Dashboard

    Dashboard Configuration

    Issues

    Not Able To Clear Cache

    If you see an error like Problem clearing the cache: EACCES: permission denied, rmdir '/tmp/gulp-cache/default', this probably means the files at that location, and/or the folder, are owned by someone else (or some other factor prevents you from writing there).

    Workaround Option

    It's possible to workaround this by editing the file cumulus-dashboard/node_modules/gulp-cache/index.js and alter the value of the line var fileCache = new Cache({cacheDirName: 'gulp-cache'}); to something like var fileCache = new Cache({cacheDirName: '<prefix>-cache'});. Now gulp-cache will be able to write to /tmp/<prefix>-cache/default, and the error should resolve.

    Dashboard Deployment

    Issues

    Earthdata Login Error

    The dashboard sends you to an Earthdata Login page that has an error reading "Invalid request, please verify the client status or redirect_uri before resubmitting".

    Check your variables and values

    Check that you have set and correctly updated your EARTHDATA_CLIENT_ID and EARTHDATA_CLIENT_PASSWORD environment variables (from your app/.env file) and re-deployed Cumulus, that the values placed in them are correct, and that you have added both the "redirect" and "token" URLs to the Earthdata Application.

    Caching Issue

    There is odd caching behavior associated with the dashboard and Earthdata Login at this point in time that can cause the above error to reappear on the Earthdata Login page loaded by the dashboard even after fixing the cause of the error.

    browser solution

    If you experience this, attempt to access the dashboard in a new browser window, and it should work.

    - + \ No newline at end of file diff --git a/docs/next/upgrade-notes/cumulus_distribution_migration/index.html b/docs/next/upgrade-notes/cumulus_distribution_migration/index.html index 8abda409f61..05a07f3f121 100644 --- a/docs/next/upgrade-notes/cumulus_distribution_migration/index.html +++ b/docs/next/upgrade-notes/cumulus_distribution_migration/index.html @@ -5,14 +5,14 @@ Migrate from TEA deployment to Cumulus Distribution | Cumulus Documentation - +
    Version: Next

    Migrate from TEA deployment to Cumulus Distribution

    Background

    The Cumulus Distribution API is configured to use the AWS Cognito OAuth client. This API can be used instead of the Thin Egress App, which is the default distribution API if using the Deployment Template.

    Configuring a Cumulus Distribution deployment

    See these instructions for deploying the Cumulus Distribution API.

    Important note if migrating from TEA to Cumulus Distribution

    If you already have a deployment using the TEA distribution and want to switch to Cumulus Distribution, there will be an API Gateway change. This means that there will be downtime while you update your CloudFront endpoint to use the new API gateway.

    - + \ No newline at end of file diff --git a/docs/next/upgrade-notes/migrate_tea_standalone/index.html b/docs/next/upgrade-notes/migrate_tea_standalone/index.html index d0884bf5173..183f14f9c00 100644 --- a/docs/next/upgrade-notes/migrate_tea_standalone/index.html +++ b/docs/next/upgrade-notes/migrate_tea_standalone/index.html @@ -5,13 +5,13 @@ Migrate TEA deployment to standalone module | Cumulus Documentation - +
    Version: Next

    Migrate TEA deployment to standalone module

    Background

    info

    This document is only relevant for upgrades of Cumulus from versions < 3.x.x to versions > 3.x.x

    Previous versions of Cumulus included deployment of the Thin Egress App (TEA) by default in the distribution module. As a result, Cumulus users who wanted to deploy a new version of TEA had to wait on a new release of Cumulus that incorporated that release.

    In order to give Cumulus users the flexibility to deploy newer versions of TEA whenever they want, deployment of TEA has been removed from the distribution module and Cumulus users must now add the TEA module to their deployment. Guidance on integrating the TEA module to your deployment is provided, or you can refer to Cumulus core example deployment code for the thin_egress_app module.

    By default, when upgrading Cumulus and moving from TEA deployed via the distribution module to deployed as a separate module, your API gateway for TEA would be destroyed and re-created, which could cause outages for any Cloudfront endpoints pointing at that API gateway.

    These instructions outline how to modify your state to preserve your existing Thin Egress App (TEA) API gateway when upgrading Cumulus and moving deployment of TEA to a standalone module. If you do not care about preserving your API gateway for TEA when upgrading your Cumulus deployment, you can skip these instructions.

    Prerequisites

    Notes about state management

    These instructions will involve manipulating your Terraform state via terraform state mv commands. These operations are extremely dangerous, since a mistake in editing your Terraform state can leave your stack in a corrupted state where deployment may be impossible or may result in unanticipated resource deletion.

    Since bucket versioning preserves a separate version of your state file each time it is written, and the Terraform state modification commands overwrite the state file, we can mitigate the risk of these operations by downloading the most recent state file before starting the upgrade process. Then, if anything goes wrong during the upgrade, we can restore that previous state version. Guidance on how to perform both operations is provided below.

    Download your most recent state version

    Run this command to download the most recent cumulus deployment state file, replacing BUCKET and KEY with the correct values from cumulus-tf/terraform.tf:

     aws s3 cp s3://BUCKET/KEY /path/to/terraform.tfstate

    Restore a previous state version

    Upload the state file that was previously downloaded to the bucket/key for your state file, replacing BUCKET and KEY with the correct values from cumulus-tf/terraform.tf:

     aws s3 cp /path/to/terraform.tfstate s3://BUCKET/KEY

    Then run terraform plan, which will give an error because we manually overwrote the state file and it is now out of sync with the lock table Terraform uses to track your state file:

    Error: Error loading state: state data in S3 does not have the expected content.

    This may be caused by unusually long delays in S3 processing a previous state
    update. Please wait for a minute or two and try again. If this problem
    persists, and neither S3 nor DynamoDB are experiencing an outage, you may need
    to manually verify the remote state and update the Digest value stored in the
    DynamoDB table to the following value: <some-digest-value>

    To resolve this error, run this command and replace DYNAMO_LOCK_TABLE, BUCKET and KEY with the correct values from cumulus-tf/terraform.tf, and use the digest value from the previous error output:

     aws dynamodb put-item \
    --table-name DYNAMO_LOCK_TABLE \
    --item '{
    "LockID": {"S": "BUCKET/KEY-md5"},
    "Digest": {"S": "some-digest-value"}
    }'

    Now, if you re-run terraform plan, it should work as expected.

    Migration instructions

    note

    These instructions assume that you are deploying the thin_egress_app module as shown in the Cumulus core example deployment code.

    1. Ensure that you have downloaded the latest version of your state file for your cumulus deployment

    2. Find the URL for your <prefix>-thin-egress-app-EgressGateway API gateway. Confirm that you can access it in the browser and that it is functional.

    3. Run terraform plan. You should see output like (edited for readability):

      # module.thin_egress_app.aws_cloudformation_stack.thin_egress_app will be created
      + resource "aws_cloudformation_stack" "thin_egress_app" {

      # module.thin_egress_app.aws_s3_bucket.lambda_source will be created
      + resource "aws_s3_bucket" "lambda_source" {

      # module.thin_egress_app.aws_s3_bucket_object.cloudformation_template will be created
      + resource "aws_s3_bucket_object" "cloudformation_template" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive will be created
      + resource "aws_s3_bucket_object" "lambda_code_dependency_archive" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_source will be created
      + resource "aws_s3_bucket_object" "lambda_source" {

      # module.thin_egress_app.aws_security_group.egress_lambda[0] will be created
      + resource "aws_security_group" "egress_lambda" {

      ...

      # module.cumulus.module.distribution.module.thin_egress_app.aws_cloudformation_stack.thin_egress_app will be destroyed
      - resource "aws_cloudformation_stack" "thin_egress_app" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket.lambda_source will be destroyed
      - resource "aws_s3_bucket" "lambda_source" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.cloudformation_template will be destroyed
      - resource "aws_s3_bucket_object" "cloudformation_template" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive will be destroyed
      - resource "aws_s3_bucket_object" "lambda_code_dependency_archive" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_source will be destroyed
      - resource "aws_s3_bucket_object" "lambda_source" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_security_group.egress_lambda[0] will be destroyed
      - resource "aws_security_group" "egress_lambda" {
    4. Run the state modification commands. The commands must be run in exactly this order:

       # Move security group
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_security_group.egress_lambda module.thin_egress_app.aws_security_group.egress_lambda

      # Move TEA storage bucket
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket.lambda_source module.thin_egress_app.aws_s3_bucket.lambda_source

      # Move TEA lambda source code
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_source module.thin_egress_app.aws_s3_bucket_object.lambda_source

      # Move TEA lambda dependency code
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive

      # Move TEA Cloudformation template
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.cloudformation_template module.thin_egress_app.aws_s3_bucket_object.cloudformation_template

      # Move URS creds secret version
      terraform state mv module.cumulus.module.distribution.aws_secretsmanager_secret_version.thin_egress_urs_creds aws_secretsmanager_secret_version.thin_egress_urs_creds

      # Move URS creds secret
      terraform state mv module.cumulus.module.distribution.aws_secretsmanager_secret.thin_egress_urs_creds aws_secretsmanager_secret.thin_egress_urs_creds

      # Move TEA Cloudformation stack
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_cloudformation_stack.thin_egress_app module.thin_egress_app.aws_cloudformation_stack.thin_egress_app

      Depending on how you were supplying a bucket map to TEA, there may be an additional step. If you were specifying the bucket_map_key variable to the cumulus module to use a custom bucket map, then you can ignore this step and just ensure that the bucket_map_file variable to the TEA module uses that same S3 key. Otherwise, if you were letting Cumulus generate a bucket map for you, then you need to take this step to migrate that bucket map:

      # Move bucket map
      terraform state mv module.cumulus.module.distribution.aws_s3_bucket_object.bucket_map_yaml[0] aws_s3_bucket_object.bucket_map_yaml
    5. Run terraform plan again. You may still see a few additions/modifications pending like below, but you should not see any deletion of Thin Egress App resources pending:

      # module.thin_egress_app.aws_cloudformation_stack.thin_egress_app will be updated in-place
      ~ resource "aws_cloudformation_stack" "thin_egress_app" {

      # module.thin_egress_app.aws_s3_bucket_object.cloudformation_template will be updated in-place
      ~ resource "aws_s3_bucket_object" "cloudformation_template" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive will be updated in-place
      ~ resource "aws_s3_bucket_object" "lambda_code_dependency_archive" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_source will be updated in-place
      ~ resource "aws_s3_bucket_object" "lambda_source" {

      If you still see deletion of module.cumulus.module.distribution.module.thin_egress_app.aws_cloudformation_stack.thin_egress_app pending, then something went wrong and you should restore the previously downloaded state file version and start over from step 1. Otherwise, proceed to step 6.

    6. Once you have confirmed that everything looks as expected, run terraform apply.

    7. Visit the same API gateway from step 1 and confirm that it still works.

    Your TEA deployment has now been migrated to a standalone module, which gives you the ability to upgrade the deployed version of TEA independently of Cumulus releases.
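
    As an additional, optional sanity check (not part of the documented steps above), you can snapshot and inspect the Terraform state from the command line before and after running the state moves; a minimal sketch:

    # Save a local copy of the current state before running any `terraform state mv` commands
    terraform state pull > tea-migration-backup.tfstate

    # After the moves, confirm the TEA resources are now tracked at their new addresses
    terraform state list | grep thin_egress_app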

    - + \ No newline at end of file diff --git a/docs/next/upgrade-notes/update-cma-2.0.2/index.html b/docs/next/upgrade-notes/update-cma-2.0.2/index.html index 7e41684fd8e..87aeb98baa8 100644 --- a/docs/next/upgrade-notes/update-cma-2.0.2/index.html +++ b/docs/next/upgrade-notes/update-cma-2.0.2/index.html @@ -5,13 +5,13 @@ Upgrade to CMA 2.0.2 | Cumulus Documentation - +
    Version: Next

    Upgrade to CMA 2.0.2

    Updating a Cumulus Deployment to CMA 2.0.2

    Background

    As of release 2.0.2, the Cumulus Message Adapter no longer utilizes the AWS Step Functions API to look up the defined name of a step function task when populating meta.workflow_tasks; it instead keys each entry with an incrementing integer.

    Additionally, bugfix releases v2.0.1 and v2.0.2 followed the initial 2.0.0 release, so all users should update to release 2.0.2.

    The update is not tied to a particular version of Core; however, it should be applied across all task components in order to ensure consistent execution records.

    Changes

    Execution Record Update

    This update functionally means that Cumulus tasks/activities using the CMA will now record entries that look like the following in meta.workflow_tasks, and more importantly in the tasks column of an execution record:

    Original

          "DiscoverGranules": {
    "name": "jk-tf-DiscoverGranules",
    "version": "$LATEST",
    "arn": "arn:aws:lambda:us-east-1:xxxxx:function:jk-tf-DiscoverGranules"
    },
    "QueueGranules": {
    "name": "jk-tf-QueueGranules",
    "version": "$LATEST",
    "arn": "arn:aws:lambda:us-east-1:xxxx:function:jk-tf-QueueGranules"
    }

    New

          "0": {
    "name": "jk-tf-DiscoverGranules",
    "version": "$LATEST",
    "arn": "arn:aws:lambda:us-east-1:xxxxx:function:jk-tf-DiscoverGranules"
    },
    "1": {
    "name": "jk-tf-QueueGranules",
    "version": "$LATEST",
    "arn": "arn:aws:lambda:us-east-1:xxxx:function:jk-tf-QueueGranules"
    }

    Actions Required

    The following should be done as part of a Cumulus stack update to utilize cumulus-message-adapter version 2.0.2 or later:

    • Python tasks that utilize cumulus-message-adapter-python should be updated to use > 2.0.0, their lambdas rebuilt and Cumulus workflows reconfigured to use the updated version.

    • Python activities that utilize cumulus-process-py should be rebuilt using > 1.0.0 with updated dependencies, and have their images deployed/Cumulus configured to use the new version.

    • The cumulus-message-adapter v2.0.2 lambda layer should be made available in the deployment account, and the Cumulus deployment should be reconfigured to use it (via the cumulus_message_adapter_lambda_layer_version_arn variable in the cumulus module). This should address all Core node.js tasks that utilize the CMA, and many contributed node.js/JAVA components.

    Once the above have been done, redeploy Cumulus to apply the configuration and the updates should be live.
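
    For reference, one way to publish the CMA layer with the AWS CLI is sketched below. The layer name and zip file name are assumptions; use whatever naming matches the CMA release artifact you downloaded, and supply the returned LayerVersionArn to the cumulus_message_adapter_lambda_layer_version_arn variable.

    # Publish the cumulus-message-adapter v2.0.2 layer (zip file name is illustrative)
    aws lambda publish-layer-version \
      --layer-name cumulus-message-adapter \
      --zip-file fileb://cumulus-message-adapter.zip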

    - + \ No newline at end of file diff --git a/docs/next/upgrade-notes/update-task-file-schemas/index.html b/docs/next/upgrade-notes/update-task-file-schemas/index.html index 0f5b829be7c..6f87fe4abf1 100644 --- a/docs/next/upgrade-notes/update-task-file-schemas/index.html +++ b/docs/next/upgrade-notes/update-task-file-schemas/index.html @@ -5,13 +5,13 @@ Updates to task granule file schemas | Cumulus Documentation - +
    Version: Next

    Updates to task granule file schemas

    Background

    Most Cumulus workflow tasks expect as input a payload of granule(s) which contain the files for each granule. Most tasks also return this same granule structure as output.

    However, up to this point, there was inconsistency in the schemas for the granule files objects expected by each task. Furthermore, there was no guarantee of consistency between granule files objects as stored in the database and the expectations of any given workflow task.

    Thus, when performing bulk granule operations which pass granules from the database into a Cumulus workflow, it was possible for there to be schema validation failures depending on which task was used to start the workflow and its particular schema.

    In order to rectify this situation, CUMULUS-2388 was filed and addressed to create a common granule files schema between nearly all of the Cumulus tasks (exceptions discussed below) and the Cumulus database. The following documentation explains the manual changes you need to make to your deployment in order to be compatible with the updated files schema.

    Updated files schema

    The updated granule files schema can be found here.

    These former properties were deprecated (with notes about how to derive the same information from the updated schema, if possible):

    • filename - concatenate the bucket and key values with a directory separator (/)
    • name - use fileName property
    • etag - ETags are no longer provided as an individual file property. Instead, a separate etags object mapping S3 URIs to ETag values is provided as output from the following workflow tasks (guidance on how to integrate this output with your workflows is provided in the Upgrading your workflows section below):
      • update-granules-cmr-metadata-file-links
      • hyrax-metadata-updates
    • fileStagingDir - no longer supported
    • url_path - no longer supported
    • duplicate_found - This property is no longer supported, however sync-granule and move-granules now produce a separate granuleDuplicates object as part of their output. The granuleDuplicates object is a map of granules by granule ID which includes the files that encountered duplicates during processing. Guidance on how to integrate granuleDuplicates information into your workflow configuration is provided below.

    Exceptions

    These workflow tasks did not have their schema for granule files updated:

    • discover-granules - no updates
    • queue-granules - no updates
    • parse-pdr - no updates
    • sync-granule - input schema not updated, output schema was updated

    The reason these task schemas were not updated is that all of these tasks start before the files have been ingested to S3; thus, much of the information required in the updated files schema, such as bucket, key, or checksum, is not yet known.

    Bulk granule operations

    Since the input schema for the above tasks was not updated, you cannot run bulk granule operations against workflows that start with any of those tasks. Bulk granule operations work by loading the specified granules from the database and sending them as input to a specified workflow, so if the specified workflow begins with a task whose input schema does not conform to what is coming out of the database, there will be schema errors.

    Upgrading your deployment

    Upgrading your workflows

    For any workflows using the update-granules-cmr-metadata-file-links task before the hyrax-metadata-updates and/or post-to-cmr tasks, update the step definition for update-granules-cmr-metadata-file-links as follows:

        "UpdateGranulesCmrMetadataFileLinksStep": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "buckets": "{$.meta.buckets}",
    "distribution_endpoint": "{$.meta.distribution_endpoint}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.etags}",
    "destination": "{$.meta.file_etags}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    ...more configuration...

    hyrax-metadata-updates

    For any workflows using the hyrax-metadata-updates task before a post-to-cmr task, update the definition of the hyrax-metadata-updates step as follows:

        "HyraxMetadataUpdatesTask": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "bucket": "{$.meta.buckets.internal.name}",
    "stack": "{$.meta.stack}",
    "cmr": "{$.meta.cmr}",
    "launchpad": "{$.meta.launchpad}",
    "etags": "{$.meta.file_etags}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.etags}",
    "destination": "{$.meta.file_etags}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    ...more configuration...

    post-to-cmr

    For any workflows using the post-to-cmr task after the update-granules-cmr-metadata-file-links or hyrax-metadata-updates tasks, update the post-to-cmr step definition as follows:

        "CmrStep": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "bucket": "{$.meta.buckets.internal.name}",
    "stack": "{$.meta.stack}",
    "cmr": "{$.meta.cmr}",
    "launchpad": "{$.meta.launchpad}",
    "etags": "{$.meta.file_etags}"
    }
    }
    },
    ...more configuration...

    Example workflow

    For an example workflow integrating all of these changes, please see our example ingest and publish workflow.

    Optional - Integrate granuleDuplicates information
    View Details
    note

    The granuleDuplicates output is purely informational and does not have any bearing on the separate configuration for how duplicates should be handled.

    You can include granuleDuplicates output from the sync-granule or move-granules tasks in your workflow messages like so:

        "SyncGranule": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    ...other config...
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.granuleDuplicates}",
    "destination": "{$.meta.sync_granule.granule_duplicates}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    }
    ...more configuration...

    The result of this configuration is that the granuleDuplicates output from sync-granule would be placed in meta.sync_granule.granule_duplicates on the workflow message and remain there throughout the rest of the workflow. The same configuration could be replicated for the move-granules task, but be sure to use a different destination in the workflow message for the granuleDuplicates output.

    Updating collection URL path templates

    Collections can specify url_path templates to dynamically generate the final location of files. As part of url_path templates, file object properties can be interpolated to generate the file path. Thus, these url_path templates need to be updated to ensure that they are compatible with the updated files schema and the properties that will actually be available on file objects.

    See the notes on the updated files schema to know which properties are available and which previously existing properties were deprecated.

    As an example, you will want to update any url_path properties in your collections to remove references to file.name and replace them with references to file.fileName like so:

    - "url_path": "{cmrMetadata.CollectionReference.ShortName}___{cmrMetadata.CollectionReference.Version}/{substring(file.name, 0, 3)}",
    + "url_path": "{cmrMetadata.CollectionReference.ShortName}___{cmrMetadata.CollectionReference.Version}/{substring(file.fileName, 0, 3)}",
    - + \ No newline at end of file diff --git a/docs/next/upgrade-notes/upgrade-rds-phase-3-release/index.html b/docs/next/upgrade-notes/upgrade-rds-phase-3-release/index.html index 2d0a63a4814..bb20c15862f 100644 --- a/docs/next/upgrade-notes/upgrade-rds-phase-3-release/index.html +++ b/docs/next/upgrade-notes/upgrade-rds-phase-3-release/index.html @@ -5,14 +5,14 @@ Upgrade RDS Phase 3 Release | Cumulus Documentation - +
    Version: Next

    Upgrade RDS Phase 3 Release

    Background

    Release v16 of Cumulus Core includes an update that removes the now-unneeded AWS DynamoDB tables for the primary archive. This datastore was fully migrated to PostgreSQL in prior releases and should have been operating in a parallel-write mode to allow for repair/remediation of prior issues.

    Requirements

    To update to this release (and beyond), users must:

    • Have deployed a release of at least version 11.0.0 (preferably at least the latest supported minor version in the 11.1.x release series), having successfully completed the transition to using PostgreSQL as the primary datastore in release 11
    • Have completed evaluation of the primary datastore for data irregularities that might be resolved by re-migration of data from the DynamoDB datastores
    • Review the CHANGELOG for any migration instructions/changes between (and including) this release and the release you're upgrading from. Complete migration instructions from the previous release series should be included in the release notes/CHANGELOG for this release; this document notes migration instructions specifically for release 16.0.0+ and is not all-inclusive if you are upgrading across multiple prior release versions
    • Configure your deployment terraform environment to utilize the new release, noting all migration instructions
    • Update the PostgreSQL database cluster to the supported version (Aurora PostgreSQL 11.13+ compatible)

    Suggested Prerequisites

    In addition to the above requirements, we suggest users:

    • Retain a backup of the primary DynamoDB datastore in case recovery/integrity concerns arise between DynamoDB and PostgreSQL.

      This should only be considered if remediation/re-migration from DynamoDB has recently occurred, specifically due to the issues reported in the following tickets:

      • CUMULUS-3019
      • CUMULUS-3024
      • CUMULUS-3017

      and other efforts included in the outcome from CUMULUS-3035/CUMULUS-3071.

    • Halt all ingest prior to performing the version upgrade.

    • Run load testing/functional testing

      While the majority of the modifications for release 16 are related to DynamoDB removal, we always encourage user engineering teams to verify compatibility at scale with their deployment's configuration prior to promotion to a production environment in order to ensure a smooth upgrade.

    Upgrade procedure

    1. (Optional) Halt ingest

    If ingest is not halted, then during the window when the data-persistence module has been deployed but the main Core module has not, existing database writes will fail. This results in in-flight workflow messages failing to the message Dead Letter Archive and all API write-related calls failing.

    While this is optional, it is highly encouraged, as cleanup could be significant.

    2. Deploy the data persistence module

    Ensure your source for the data-persistence module is set to the release version (substituting v16.0.0 for the latest v16 release):

      source = "https://github.com/nasa/cumulus/releases/download/v16.0.0/terraform-aws-cumulus.zip//tf-modules/data-persistence"

    Run terraform init to pull in all updated source modules, then run terraform apply and evaluate the changeset before proceeding. The changeset should include blocks like the following for each table removed:

    # module.data_persistence.aws_dynamodb_table.collections_table will be destroyed
    # module.data_persistence.aws_dynamodb_table.executions_table will be destroyed
    # module.data_persistence.aws_dynamodb_table.files_table will be destroyed
    # module.data_persistence.aws_dynamodb_table.granules_table will be destroyed
    # module.data_persistence.aws_dynamodb_table.pdrs_table will be destroyed

    In addition, you should expect to see the outputs from the module remove the references to the DynamoDB tables:

    Changes to Outputs:
      ~ dynamo_tables = {
            access_tokens = {
                arn  = "arn:aws:dynamodb:us-east-1:XXXXXX:table/prefix-AccessTokensTable"
                name = "prefix-AccessTokensTable"
            }
            async_operations = {
                arn  = "arn:aws:dynamodb:us-east-1:XXXXXX:table/prefix-AsyncOperationsTable"
                name = "prefix-AsyncOperationsTable"
            }
          - collections = {
              - arn  = "arn:aws:dynamodb:us-east-1:XXXXXX:table/prefix-CollectionsTable"
              - name = "prefix-CollectionsTable"
            } -> null
          - executions = {
              - arn  = "arn:aws:dynamodb:us-east-1:XXXXXX:table/prefix-ExecutionsTable"
              - name = "prefix-ExecutionsTable"
            } -> null
          - files = {
              - arn  = "arn:aws:dynamodb:us-east-1:XXXXXX:table/prefix-FilesTable"
              - name = "prefix-FilesTable"
            } -> null
          - granules = {
              - arn  = "arn:aws:dynamodb:us-east-1:XXXXXX:table/prefix-GranulesTable"
              - name = "prefix-GranulesTable"
            } -> null
          - pdrs = {
              - arn  = "arn:aws:dynamodb:us-east-1:XXXXXX:table/prefix-PdrsTable"
              - name = "prefix-PdrsTable"
            } -> null

    Once this completes successfully, proceed to the next step.

    3. Deploy the cumulus-tf module

    Ensure your source for the cumulus module is set to the release version (substituting v16.0.0 for the latest v16 release):

    source = "https://github.com/nasa/cumulus/releases/download/v16.0.0/terraform-aws-cumulus.zip//tf-modules/cumulus"

    You should expect to see a significant changeset in Core provided resources, in addition to the following resources being destroyed from the RDS Phase 3 update set:

    # module.cumulus.module.archive.aws_cloudwatch_log_group.granule_files_cache_updater_logs will be destroyed
    # module.cumulus.module.archive.aws_iam_role.granule_files_cache_updater_lambda_role will be destroyed
    # module.cumulus.module.archive.aws_iam_role.migration_processing will be destroyed
    # module.cumulus.module.archive.aws_iam_role_policy.granule_files_cache_updater_lambda_role_policy will be destroyed
    # module.cumulus.module.archive.aws_iam_role_policy.migration_processing will be destroyed
    # module.cumulus.module.archive.aws_iam_role_policy.process_dead_letter_archive_role_policy will be destroyed
    # module.cumulus.module.archive.aws_iam_role_policy.publish_collections_lambda_role_policy will be destroyed
    # module.cumulus.module.archive.aws_iam_role_policy.publish_executions_lambda_role_policy will be destroyed
    # module.cumulus.module.archive.aws_iam_role_policy.publish_granules_lambda_role_policy will be destroyed
    # module.cumulus.module.archive.aws_lambda_event_source_mapping.granule_files_cache_updater will be destroyed
    # module.cumulus.module.archive.aws_lambda_event_source_mapping.publish_pdrs will be destroyed
    # module.cumulus.module.archive.aws_lambda_function.execute_migrations will be destroyed
    # module.cumulus.module.archive.aws_lambda_function.granule_files_cache_updater will be destroyed
    # module.cumulus.module.data_migration2.aws_iam_role.data_migration2 will be destroyed
    # module.cumulus.module.data_migration2.aws_iam_role_policy.data_migration2 will be destroyed
    # module.cumulus.module.data_migration2.aws_lambda_function.data_migration2 will be destroyed
    # module.cumulus.module.data_migration2.aws_security_group.data_migration2[0] will be destroyed
    # module.cumulus.module.postgres_migration_async_operation.aws_iam_role.postgres_migration_async_operation_role will be destroyed
    # module.cumulus.module.postgres_migration_async_operation.aws_iam_role_policy.postgres_migration_async_operation will be destroyed
    # module.cumulus.module.postgres_migration_async_operation.aws_lambda_function.postgres-migration-async-operation will be destroyed
    # module.cumulus.module.postgres_migration_async_operation.aws_security_group.postgres_migration_async_operation[0] will be destroyed
    # module.cumulus.module.postgres_migration_count_tool.aws_iam_role.postgres_migration_count_role will be destroyed
    # module.cumulus.module.postgres_migration_count_tool.aws_iam_role_policy.postgres_migration_count will be destroyed
    # module.cumulus.module.postgres_migration_count_tool.aws_lambda_function.postgres_migration_count_tool will be destroyed
    # module.cumulus.module.postgres_migration_count_tool.aws_security_group.postgres_migration_count[0] will be destroyed

    Possible deployment issues

    Security group deletion

    The following security group resources will be deleted as part of this update:

    module.cumulus.module.data_migration2.aws_security_group.data_migration2[0]
    module.cumulus.module.postgres_migration_count_tool.aws_security_group.postgres_migration_count[0]
    module.cumulus.module.postgres_migration_async_operation.aws_security_group.postgres_migration_async_operation[0]

    Because the AWS resources associated with these security groups can take some time to be properly updated (in testing this was 20-35 minutes), these deletions may cause the deployment to take longer than usual. If for some unexpected reason this delay causes the update to time out, you should be able to continue the deployment by re-running terraform to completion.

    If this situation occurs and the deployment time is not acceptable, users may also opt to reassign the affected network interfaces away from the security group and/or delete the security group manually.
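
    If you need to see what is still attached to one of these security groups while a deletion is pending, the AWS CLI can list the associated network interfaces; a minimal sketch (the security group ID below is a placeholder):

    aws ec2 describe-network-interfaces \
      --filters Name=group-id,Values=sg-0123456789abcdef0 \
      --query 'NetworkInterfaces[].{Id:NetworkInterfaceId,Status:Status,Description:Description}'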

    - + \ No newline at end of file diff --git a/docs/next/upgrade-notes/upgrade-rds/index.html b/docs/next/upgrade-notes/upgrade-rds/index.html index 01710857d34..e525c51552e 100644 --- a/docs/next/upgrade-notes/upgrade-rds/index.html +++ b/docs/next/upgrade-notes/upgrade-rds/index.html @@ -5,7 +5,7 @@ Upgrade to RDS release | Cumulus Documentation - + @@ -21,7 +21,7 @@ | cutoffSeconds | number | Number of seconds prior to this execution to 'cutoff' reconciliation queries. This allows in-progress/other in-flight operations time to complete and propagate to Elasticsearch/postgres. | 3600 | | dbConcurrency | number | Sets max number of parallel collections reports the script will run at a time. | 20 | | dbMaxPool | number | Sets the maximum number of connections the database pool has available. Modifying this may result in unexpected failures. | 20 |

    - + \ No newline at end of file diff --git a/docs/next/upgrade-notes/upgrade_tf_version_0.13.6/index.html b/docs/next/upgrade-notes/upgrade_tf_version_0.13.6/index.html index 7f285680331..ff99859a471 100644 --- a/docs/next/upgrade-notes/upgrade_tf_version_0.13.6/index.html +++ b/docs/next/upgrade-notes/upgrade_tf_version_0.13.6/index.html @@ -5,13 +5,13 @@ Upgrade to TF version 0.13.6 | Cumulus Documentation - +
    Version: Next

    Upgrade to TF version 0.13.6

    Background

    Cumulus pins its support to a specific version of Terraform (see the deployment documentation). The reason for only supporting one specific Terraform version at a time is to avoid deployment errors that can be caused by deploying to the same target with different Terraform versions.

    Cumulus is upgrading its supported version of Terraform from 0.12.12 to 0.13.6. This document contains instructions on how to perform the upgrade for your deployments.

    Prerequisites

    • Follow the Terraform guidance for what to do before upgrading, notably ensuring that you have no pending changes to your Cumulus deployments before proceeding.
      • You should do a terraform plan to see if you have any pending changes for your deployment (for both the data-persistence-tf and cumulus-tf modules), and if so, run a terraform apply before doing the upgrade to Terraform 0.13.6
    • Review the Terraform v0.13 release notes to prepare for any breaking changes that may affect your custom deployment code. Cumulus' deployment code has already been updated for compatibility with version 0.13.
    • Install Terraform version 0.13.6. We recommend using the Terraform Version Manager tfenv to manage your installed versions of Terraform, but this is not required.

    Upgrade your deployment code

    Terraform 0.13 does not support some of the syntax from previous Terraform versions, so you need to upgrade your deployment code for compatibility.

    Terraform provides a 0.13upgrade command as part of version 0.13 to handle automatically upgrading your code. Make sure to check out the documentation on batch usage of 0.13upgrade, which will allow you to upgrade all of your Terraform code with one command.

    Run the 0.13upgrade command until you have no more necessary updates to your deployment code.
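
    For reference, a possible sequence using tfenv and the per-directory form of the command might look like the following (the directory names match the modules referenced in this guide; adjust to your layout):

    tfenv install 0.13.6
    tfenv use 0.13.6

    # Run the upgrade helper in each deployment directory
    (cd data-persistence-tf && terraform 0.13upgrade -yes .)
    (cd cumulus-tf && terraform 0.13upgrade -yes .)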

    Upgrade your deployment

    1. Ensure that you are running Terraform 0.13.6 by running terraform --version. If you are using tfenv, you can switch versions by running tfenv use 0.13.6.

    2. For the data-persistence-tf and cumulus-tf directories, take the following steps:

      1. Run terraform init --reconfigure. The --reconfigure flag is required, otherwise you might see an error like:

        Error: Failed to decode current backend config

        The backend configuration created by the most recent run of "terraform init"
        could not be decoded: unsupported attribute "lock_table". The configuration
        may have been initialized by an earlier version that used an incompatible
        configuration structure. Run "terraform init -reconfigure" to force
        re-initialization of the backend.
      2. Run terraform apply to perform a deployment.

        caution

        Even if Terraform says that no resource changes are pending, running the apply using Terraform version 0.13.6 will modify your backend state from version 0.12.12 to version 0.13.6 without requiring approval. Updating the backend state is a necessary part of the version 0.13.6 upgrade, but it is not completely transparent.

    - + \ No newline at end of file diff --git a/docs/next/workflow_tasks/discover_granules/index.html b/docs/next/workflow_tasks/discover_granules/index.html index 51709419122..036fa646dcb 100644 --- a/docs/next/workflow_tasks/discover_granules/index.html +++ b/docs/next/workflow_tasks/discover_granules/index.html @@ -5,7 +5,7 @@ Discover Granules | Cumulus Documentation - + @@ -21,7 +21,7 @@ included in a granule's file list. That is, no such filtering based on filename occurs as described above.

    When set on the task configuration, the value applies to all collections during discovery. Otherwise, this property may be set on individual collections.

    Concurrency

    A number property that determines the level of concurrency with which granule duplicate checks are performed when duplicateGranuleHandling is skip or error.

    Limiting concurrency helps to avoid throttling by the AWS Lambda API and helps to avoid encountering account Lambda concurrency limitations.

    We do not recommend increasing this value unless you are seeing Lambda.Timeout errors when discover-granules discovers a large number of granules with skip or error duplicate handling. However, as increasing the concurrency may lead to Lambda API or Lambda concurrency throttling errors, you may wish to consider converting the discover-granules task to an ECS activity, which does not face similar runtime constraints.

    The default value is 3.

    Task Outputs

    This task outputs an assembled array of Cumulus granule objects as the payload for the next task, and returns only the expected payload for the next task.

    - + \ No newline at end of file diff --git a/docs/next/workflow_tasks/files_to_granules/index.html b/docs/next/workflow_tasks/files_to_granules/index.html index 03507e7b96e..e31679b910f 100644 --- a/docs/next/workflow_tasks/files_to_granules/index.html +++ b/docs/next/workflow_tasks/files_to_granules/index.html @@ -5,13 +5,13 @@ Files To Granules | Cumulus Documentation - +
    Version: Next

    Files To Granules

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions and more can be found on the auto-generated Cumulus Tasks page.

    Summary

    This task utilizes the incoming config.inputGranules and the task input list of s3 URIs along with the rest of the configuration objects to take the list of incoming files and sort them into a list of granule objects.

    Please note: files passed in without metadata previously defined in config.inputGranules will have the following keys added:

    • size
    • bucket
    • key
    • fileName

    It is primarily intended to support compatibility with the standard output of a processing task, and convert that output into a granule object accepted as input by the majority of other Cumulus tasks.

    Task Inputs

    Input

    This task expects an incoming input that contains an array of 'staged' S3 URIs to move to their final archive location.

    For the specifics, see the Cumulus Tasks page entry for the schema.

    Configuration

    This task does expect values to be set in the workflow_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Below are expanded descriptions of selected config keys:

    inputGranules

    An array of Cumulus granule objects.

    This object will be used to define metadata values for the move granules task, and is the basis for the updated object that will be added to the output.

    Task Outputs

    This task outputs an assembled array of Cumulus granule objects as the payload for the next task, and returns only the expected payload for the next task.

    - + \ No newline at end of file diff --git a/docs/next/workflow_tasks/lzards_backup/index.html b/docs/next/workflow_tasks/lzards_backup/index.html index f6298baf27f..d1815b95b93 100644 --- a/docs/next/workflow_tasks/lzards_backup/index.html +++ b/docs/next/workflow_tasks/lzards_backup/index.html @@ -5,13 +5,13 @@ LZARDS Backup | Cumulus Documentation - +
    Version: Next

    LZARDS Backup

    The LZARDS backup task takes an array of granules and initiates backup requests to the LZARDS API, which will be handled asynchronously by LZARDS.

    info

    For more information about LZARDS and the backup process go to the LZARDS Overview.

    Deployment

    The LZARDS backup task is not automatically deployed with Cumulus. To deploy the task through the Cumulus module, first you must specify a lzards_launchpad_passphrase in your terraform variables (e.g. variables.tf) like so:

    variable "lzards_launchpad_passphrase" {
      type    = string
      default = ""
    }

    Then you can specify a value for your lzards_launchpad_passphrase in terraform.tfvars like so:

    lzards_launchpad_passphrase = "your-passphrase"

    Lastly, you need to make sure that the lzards_launchpad_passphrase is passed into the Cumulus module (in main.tf) like so:

    lzards_launchpad_passphrase  = var.lzards_launchpad_passphrase

    In short, deploying the LZARDS task requires configuring a passphrase variable and ensuring that your TF configuration passes that variable into the Cumulus module.

    Additional terraform configuration for the LZARDS task can be found in the cumulus module's variables.tf file, where the relevant variables are prefixed with lzards_. You can add these variables to your deployment using the same process outlined above for lzards_launchpad_passphrase.
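
    If you prefer not to store the passphrase in terraform.tfvars, Terraform's standard TF_VAR_ environment variable convention is one alternative (a sketch only; integrate with your own secrets-management practices):

    # Supplies var.lzards_launchpad_passphrase for this shell session only
    export TF_VAR_lzards_launchpad_passphrase="<your-passphrase>"
    terraform apply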

    Task Inputs

    Input

    This task expects an array of granules as input.

    For the specifics, see the Cumulus Tasks page entry for the schema.

    Configuration

    This task does expect values to be set in the workflow_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Task Outputs

    Output

    The LZARDS task outputs a composite object containing:

    • the input granules array, and
    • a backupResults object that describes the results of LZARDS backup attempts.

    For the specifics, see the Cumulus Tasks page entry for the schema.

    - + \ No newline at end of file diff --git a/docs/next/workflow_tasks/move_granules/index.html b/docs/next/workflow_tasks/move_granules/index.html index 835febe005f..a2729721b18 100644 --- a/docs/next/workflow_tasks/move_granules/index.html +++ b/docs/next/workflow_tasks/move_granules/index.html @@ -5,13 +5,13 @@ Move Granules | Cumulus Documentation - +
    Version: Next

    Move Granules

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions and more can be found on the auto-generated Cumulus Tasks page.

    Summary

    This task utilizes the incoming event.input array of Cumulus granule objects to do the following:

    • Move granules from their 'staging' location to the final location (as configured in the Sync Granules task)

    • Update the event.input object with the new file locations.

    • If the granule has an ECHO10/UMM CMR file (.cmr.xml or .cmr.json) included in the event.input:

      • Update that file's access locations
      • Add it to the appropriate access URL category for the CMR filetype as defined by granule CNM filetype.
      • Set the CMR file to 'metadata' in the output granules object and add it to the granule files if it's not already present.
    invalid CNM type

    Granules without a valid CNM type set in the granule file type field in event.input will be treated as "data" in the updated CMR metadata file.

    • The task then outputs an updated list of granule objects.

    Task Inputs

    Input

    This task expects an incoming input that contains a list of 'staged' S3 URIs to move to their final archive location. If CMR metadata is to be updated for a granule, it must also be included in the input.

    For the specifics, see the Cumulus Tasks page entry for the schema.

    Configuration

    This task does expect values to be set in the workflow_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Input

    This task expects event.input to provide an array of Cumulus granule objects. The files listed for each granule represent the files to be acted upon as described in summary.

    Task Outputs

    This task outputs an assembled array of Cumulus granule objects with post-move file locations as the payload for the next task, and returns only the expected payload for the next task. If a CMR file has been specified for a granule object, the CMR resources related to the granule files will be updated according to the updated granule file metadata.

    Examples

    See the SIPS workflow cookbook for an example of this task in a workflow.

    - + \ No newline at end of file diff --git a/docs/next/workflow_tasks/parse_pdr/index.html b/docs/next/workflow_tasks/parse_pdr/index.html index fcfe095e0d1..8e55a5034b6 100644 --- a/docs/next/workflow_tasks/parse_pdr/index.html +++ b/docs/next/workflow_tasks/parse_pdr/index.html @@ -5,13 +5,13 @@ Parse PDR | Cumulus Documentation - +
    Version: Next

    Parse PDR

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions and more can be found on the auto-generated Cumulus Tasks page.

    Summary

    The purpose of this task is to do the following with the incoming PDR object:

    • Stage it to an internal S3 bucket

    • Parse the PDR

    • Archive the PDR and remove the staged file if successful

    • Outputs a payload object containing metadata about the parsed PDR (e.g. total size of all files, files counts, etc) and a granules object

    The constructed granules object is created using PDR metadata to determine values like data type and version, and collection definitions to determine a file storage location based on the extracted data type and version number.

    Granule file types are converted from the PDR spec types to CNM types according to the following translation table:

      HDF: 'data',
    HDF-EOS: 'data',
    SCIENCE: 'data',
    BROWSE: 'browse',
    METADATA: 'metadata',
    BROWSE_METADATA: 'metadata',
    QA_METADATA: 'metadata',
    PRODHIST: 'qa',
    QA: 'metadata',
    TGZ: 'data',
    LINKAGE: 'data'

    Files missing file types will have none assigned; files with invalid types will result in a PDR parse failure.

    Task Inputs

    Input

    This task expects an incoming input that contains name and path information about the PDR to be parsed. For the specifics, see the Cumulus Tasks page entry for the schema.

    Configuration

    This task does expect values to be set in the workflow_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Below are expanded descriptions of selected config keys:

    Provider

    A Cumulus provider object. Used to define connection information for retrieving the PDR.

    Bucket

    Defines the bucket where the 'pdrs' folder for parsed PDRs will be stored.

    Collection

    A Cumulus collection object. Used to define granule file groupings and granule metadata for discovered files.

    Task Outputs

    This task outputs a single payload output object containing metadata about the parsed PDR (e.g. filesCount, totalSize, etc), a pdr object with information for later steps, and the generated array of granule objects.

    Examples

    See the SIPS workflow cookbook for an example of this task in a workflow.

    - + \ No newline at end of file diff --git a/docs/next/workflow_tasks/queue_granules/index.html b/docs/next/workflow_tasks/queue_granules/index.html index b4c636bab95..b673fca2985 100644 --- a/docs/next/workflow_tasks/queue_granules/index.html +++ b/docs/next/workflow_tasks/queue_granules/index.html @@ -5,14 +5,14 @@ Queue Granules | Cumulus Documentation - +
    Version: Next

    Queue Granules

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions, and more can be found on the auto-generated Cumulus Tasks page.

    Summary

    The purpose of this task is to schedule ingest of granules that were discovered on a remote host, whether via the DiscoverGranules task or the ParsePDR task.

    The task utilizes a defined collection in concert with a defined provider (either set on each granule or passed in via config) to queue up ingest executions for each granule, or for batches of granules.

    The constructed granules object is defined by the collection passed in the configuration, and has impacts on other provided core Cumulus Tasks.

    Users of this task in a workflow are encouraged to carefully consider their configuration in context of downstream tasks and workflows.

    Task Inputs

    Each of the following sections are a high-level discussion of the intent of the various input/output/config values.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Input

    This task expects an incoming input that contains granules and information about them and their files. For the specifics, see the Cumulus Tasks page entry for the schema.

    This input is most commonly the output from a preceding DiscoverGranules or ParsePDR task.

    Cumulus Configuration

    This task does expect values to be set in the task_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Below are expanded descriptions of selected config keys:

    provider

    A Cumulus provider object for the originating provider. Will be passed along to the ingest workflow. This will be overruled by more specific provider information that may exist on a granule.

    internalBucket

    The Cumulus internal system bucket.

    granuleIngestWorkflow

    A string property that denotes the name of the ingest workflow into which granules should be queued.

    queueUrl

    A string property that denotes the URL of the queue to which scheduled execution messages are sent.

    preferredQueueBatchSize

    A number property that sets an upper bound on the size of each batch of granules queued into the payload of an ingest execution. Setting this property to a value higher than 1 allows queueing of multiple granules per ingest workflow.

    As ingest executions typically expect granules in the payload to have a common collection and common provider, this property only sets an upper bound within which batches will be created based on common collection and provider information.

    This means batches may be smaller than the preferred size if collection or provider information diverge, but never larger.

    The default value if none is specified is 1, which will queue one ingest execution per granule.

    concurrency

    A number property that determines the level of concurrency with which ingest executions are scheduled. Granules or batches of granules will be queued up into executions at this level of concurrency.

    This property is also used to limit concurrency when updating granule status to queued.

    Limiting concurrency helps to avoid throttling by the AWS Lambda API and helps to avoid encountering account Lambda concurrency limitations.

    We do not recommend increasing this value unless you are seeing Lambda.Timeout errors when queue-granules receives a large number of granules as input. However, as increasing the concurrency may lead to Lambda API or Lambda concurrency throttling errors, you may wish to consider converting the queue-granules task to an ECS activity, which does not face similar runtime constraints.

    The default value is 3.

    executionNamePrefix

    A string property that will prefix the names of scheduled executions.

    childWorkflowMeta

    An object property that will be merged into the scheduled execution input's meta field.

    Task Outputs

    This task outputs an assembled array of workflow execution ARNs for all scheduled workflow executions within the payload's running object.

    - + \ No newline at end of file diff --git a/docs/next/workflows/cumulus-task-message-flow/index.html b/docs/next/workflows/cumulus-task-message-flow/index.html index d3f327d0122..93c520cfece 100644 --- a/docs/next/workflows/cumulus-task-message-flow/index.html +++ b/docs/next/workflows/cumulus-task-message-flow/index.html @@ -5,14 +5,14 @@ Cumulus Tasks: Message Flow | Cumulus Documentation - +
    Version: Next

    Cumulus Tasks: Message Flow

    Cumulus Tasks comprise Cumulus Workflows and are either AWS Lambda tasks or AWS Elastic Container Service (ECS) activities. Cumulus Tasks permit a payload as input to the main task application code. The task payload is additionally wrapped by the Cumulus Message Adapter. The Cumulus Message Adapter supplies additional information supporting message templating and metadata management of these workflows.

    Diagram showing how incoming and outgoing Cumulus messages for workflow steps are handled by the Cumulus Message Adapter

    The steps in this flow are detailed in sections below.

    Cumulus Message Format

    A full Cumulus Message has the following keys:

    • cumulus_meta: System runtime information that should generally not be touched outside of Cumulus library code or the Cumulus Message Adapter. Stores meta information about the workflow such as the state machine name and the current workflow execution's name. This information is used to look up the current active task. The name of the current active task is used to look up the corresponding task's config in task_config.
    • meta: Runtime information captured by the workflow operators. Stores execution-agnostic variables.
    • payload: Payload is runtime information for the tasks.

    In addition to the above keys, it may contain the following keys:

    • replace: A key generated in conjunction with the Cumulus Message Adapter. It contains the S3 location of a message payload and a target JSON path in the message where that payload should be extracted.
    • exception: A key used to track workflow exceptions, should not be modified outside of Cumulus library code.

    Here's a simple example of a Cumulus Message:

    {
      "task_config": {
        "inlinestr": "prefix{meta.foo}suffix",
        "array": "{[$.meta.foo]}",
        "object": "{$.meta}"
      },
      "cumulus_meta": {
        "message_source": "sfn",
        "state_machine": "arn:aws:states:us-east-1:1234:stateMachine:MySfn",
        "execution_name": "MyExecution__id-1234",
        "id": "id-1234"
      },
      "meta": {
        "foo": "bar"
      },
      "payload": {
        "anykey": "anyvalue"
      }
    }

    A message utilizing the Cumulus Remote message functionality must have at least the keys replace and cumulus_meta. Depending on configuration, other portions of the message may be present; however, the cumulus_meta, meta, and payload keys must be present once extraction is complete.

    {
      "replace": {
        "Bucket": "cumulus-bucket",
        "Key": "my-large-event.json",
        "TargetPath": "$"
      },
      "cumulus_meta": {}
    }

    Cumulus Message Preparation

    The event coming into a Cumulus Task is assumed to be a Cumulus Message and should first be handled by the functions described below before being passed to the task application code.

    Preparation Step 1: Fetch remote event

    Fetch remote event will fetch the full event from S3 if the cumulus message includes a replace key.

    Once "my-large-event.json" is fetched from S3, it's returned from the fetch remote event function. If no "replace" key is present, the event passed to the fetch remote event function is assumed to be a complete Cumulus Message and returned as-is.

    Preparation Step 2: Parse step function config from CMA configuration parameters

    This step determines which task is currently being executed. Note this is different from which lambda or activity is being executed, because the same lambda or activity can be used for different tasks. The current task name is used to load the appropriate configuration from the Cumulus Message's 'task_config' configuration parameter.

    Preparation Step 3: Load nested event

    Using the config returned from the previous step, load nested event resolves templates for the final config and input to send to the task's application code.

    Task Application Code

    After message prep, the message passed to the task application code is of the form:

    {
      "input": {},
      "config": {}
    }

    Create Next Message functions

    Whatever comes out of the task application code is used to construct an outgoing Cumulus Message.

    Create Next Message Step 1: Assign outputs

    The config loaded from the Fetch step function config step may have a cumulus_message key. This can be used to "dispatch" fields from the task's application output to a destination in the final event output (via URL templating). Here's an example where the value of input.anykey would be dispatched as the value of payload.out in the final cumulus message:

    {
      "task_config": {
        "bar": "baz",
        "cumulus_message": {
          "input": "{$.payload.input}",
          "outputs": [
            {
              "source": "{$.input.anykey}",
              "destination": "{$.payload.out}"
            }
          ]
        }
      },
      "cumulus_meta": {
        "task": "Example",
        "message_source": "local",
        "id": "id-1234"
      },
      "meta": {
        "foo": "bar"
      },
      "payload": {
        "input": {
          "anykey": "anyvalue"
        }
      }
    }

    Create Next Message Step 2: Store remote event

    If the ReplaceConfiguration parameter is set, the configured key's value will be stored in S3 and the final output of the task will include a replace key that contains configuration for a future step to extract the payload on S3 back into the Cumulus Message. The replace key identifies where the large event node has been stored in S3.
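
    If you ever need to inspect a stored remote payload manually, the replace key contains everything required to retrieve it from S3. Using the bucket and key from the earlier example (illustrative values only):

    # Stream the stored Cumulus message payload to stdout
    aws s3 cp s3://cumulus-bucket/my-large-event.json -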

    - + \ No newline at end of file diff --git a/docs/next/workflows/developing-a-cumulus-workflow/index.html b/docs/next/workflows/developing-a-cumulus-workflow/index.html index d6020b3a16c..6b26bbe68af 100644 --- a/docs/next/workflows/developing-a-cumulus-workflow/index.html +++ b/docs/next/workflows/developing-a-cumulus-workflow/index.html @@ -5,13 +5,13 @@ Creating a Cumulus Workflow | Cumulus Documentation - +
    Version: Next

    Creating a Cumulus Workflow

    The Cumulus workflow module

    To facilitate adding workflows to your deployment, Cumulus provides a workflow module.

    In combination with the Cumulus message, the workflow module provides a way to easily turn a Step Function definition into a Cumulus workflow, complete with:

    Using the module also ensures that your workflows will continue to be compatible with future versions of Cumulus.

    For more on the full set of current available options for the module, please consult the module README.

    Adding a new Cumulus workflow to your deployment

    To add a new Cumulus workflow to your deployment that is using the cumulus module, add a new workflow resource to your deployment directory, either in a new .tf file, or to an existing file.

    The workflow should follow a syntax similar to:

    module "my_workflow" {
      source = "https://github.com/nasa/cumulus/releases/download/vx.x.x/terraform-aws-cumulus-workflow.zip"

      prefix        = "my-prefix"
      name          = "MyWorkflowName"
      system_bucket = "my-internal-bucket"

      workflow_config = module.cumulus.workflow_config

      tags = { Deployment = var.prefix }

      state_machine_definition = <<JSON
    {}
    JSON
    }

    In the above example, you would add your state_machine_definition using the Amazon States Language, using tasks you've developed and Cumulus core tasks that are made available as part of the cumulus terraform module.

    note

    Cumulus follows the convention of tagging resources with the prefix variable { Deployment = var.prefix } that you pass to the cumulus module. For resources defined outside of Core, it's recommended that you adopt this convention as it makes resources and/or deployment recovery scenarios much easier to manage.

    Examples

    For a functional example of a basic workflow, please take a look at the hello_world_workflow.

    For more complete/advanced examples, please read the following cookbook entries/topics:

    - + \ No newline at end of file diff --git a/docs/next/workflows/developing-workflow-tasks/index.html b/docs/next/workflows/developing-workflow-tasks/index.html index 94775187776..98085d141b6 100644 --- a/docs/next/workflows/developing-workflow-tasks/index.html +++ b/docs/next/workflows/developing-workflow-tasks/index.html @@ -5,13 +5,13 @@ Developing Workflow Tasks | Cumulus Documentation - +
    Version: Next

    Developing Workflow Tasks

    Workflow tasks can be either AWS Lambda Functions or ECS Activities.

    Lambda functions

    The full set of available core Lambda functions can be found in the deployed cumulus module zipfile at /tasks, as well as reference documentation here. These Lambdas can be referenced in workflows via the outputs from that module (see the cumulus-template-deploy repo for an example).

    The tasks source is located in the Cumulus repository at cumulus/tasks.

    You can also develop your own Lambda function. See the Lambda Functions page to learn more.

    ECS Activities

    ECS activities are supported via the cumulus_ecs_module available from the Cumulus release page.

    Please read the module README for configuration details.

    For assistance in creating a task definition within the module read the AWS Task Definition Docs.

    For a step-by-step example of using the cumulus_ecs_module, please see the related cookbook entry.

    Cumulus Docker Image

    ECS activities require a Docker image. Cumulus provides a Docker image (source) for Node.js 12.x+ lambdas on Docker Hub: cumuluss/cumulus-ecs-task.

    Alternate Docker Images

    Custom docker images/runtimes are supported as are private registries. For details on configuring a private registry/image see the AWS documentation on Private Registry Authentication for Tasks.

    - + \ No newline at end of file diff --git a/docs/next/workflows/docker/index.html b/docs/next/workflows/docker/index.html index db8864e3ec3..a42f808e9a8 100644 --- a/docs/next/workflows/docker/index.html +++ b/docs/next/workflows/docker/index.html @@ -5,13 +5,13 @@ Dockerizing Data Processing | Cumulus Documentation - +
    Version: Next

    Dockerizing Data Processing

    The software used for processing data amongst DAACs is developed in a variety of languages, and with different sets of dependencies and build environments. To standardize processing, Docker allows us to provide an environment (called an image) to meet the needs of any processing software, while running on the kernel of the host server (in this case, an EC2 instance). This lightweight virtualization does not carry the overhead of an additional VM, providing near-instant startup and the ability to run any dockerized process as a command-line call.

    Using Docker

    The docker command is used to build a Docker image from a Dockerfile, fetch an existing image from a remote repository, or run an existing image. In Cumulus, docker-compose is used to help developers by making it easy to build images locally and test them.

    To run a command using docker-compose use:

    docker-compose run *command*

    where command is one of

    • build: Build and tag the image using the Dockerfile
    • bash: Run the Dockerfile interactively (via a bash shell)
    • test: Processes data in the directory data/input and saves the output to the data/test-output directory. These directories must exist.
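
    For example, a typical local build-and-test loop (assuming the data/input and data/test-output directories exist) might be:

    # Build and tag the image from the Dockerfile
    docker-compose build

    # Process data/input and write results to data/test-output
    docker-compose run test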

    The Docker Registry

    Docker images that are built can be stored in the cloud in a Docker registry. Currently we are using the AWS Docker Registry, called ECR. To access these images, you must first log in using your AWS credentials, and use AWS CLI to get the proper login string:

    # install awscli
    pip install awscli

    # login to the AWS Docker registry
    aws ecr get-login --region us-east-1 | source /dev/stdin

    As long as you have permissions to access the NASA Cumulus AWS account, this will allow you to pull images from AWS ECR, and push rebuilt or new images there as well. Docker-compose may also be used to push images.

    docker-compose push

    This will push the built image to AWS ECR. Note that the image built by docker-compose will have the :latest tag and will overwrite the :latest tagged Docker image in the registry. This file should be updated to push to a different tag if overwriting is not desired.

    In normal use cases for most production images on either repository, CircleCI takes care of this building and deploying process.
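
    One additional note on registry login: aws ecr get-login was removed in AWS CLI v2. If you are using v2, the equivalent login flow is (account ID and region are placeholders):

    aws ecr get-login-password --region us-east-1 \
      | docker login --username AWS --password-stdin 000000000000.dkr.ecr.us-east-1.amazonaws.com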

    Source Control and Versions

    All the code necessary for processing a data collection, and the code used to create a Docker image for it, is contained within a single GitHub repository, following the naming convention docker-${dataname}, where dataname is the collection's short name. The git develop branch is the current development version, master is the latest release version, and a git tag exists for each tagged version (e.g., v0.1.3).

Docker images can have multiple tagged versions. The Docker images in the registry follow this same convention. A Docker image tagged as 'develop' is an image of the development branch. 'latest' is the master branch, and thus the latest tagged version, with an additional tagged image for each version tagged in the git repository.

Released tagged images are created and deployed automatically with CircleCI, the continuous integration system used by Cumulus. When new commits are merged into a branch, the appropriate Docker image is built, tested, and deployed to the Docker registry. More on testing below.

    Docker Images

    docker-base

Docker images are built in layers, allowing common dependencies to be shared to child Docker images. A base Docker image is provided that includes some dependencies shared among the current HS3 data processing codes. This includes NetCDF libraries, the AWS CLI, Python, Git, as well as py-cumulus, a collection of Python utilities that are used in the processing scripts. The docker-base repository is used to generate new images that are then stored in AWS ECR.

The docker-base image can be run in interactive mode (i.e., docker run -it docker-base), since the default "entrypoint" to the image is a bash shell.

    docker-data example: docker-hs3-avaps

    To create a new processing stream for a data collection, a Dockerfile is used to specify what additional dependencies may be required, and to build them in that environment, if necessary. An example Dockerfile is shown here, for the hs3avaps collection.

    # cumulus processing Dockerfile: docker-hs3-avaps

    FROM 000000000000.dkr.ecr.us-east-1.amazonaws.com/cumulus-base:latest

    # copy needed files
    WORKDIR /work
    COPY . /work

    RUN apt-get install -y nco libhdf5-dev

    # compile code
    RUN gcc convert/hs3cpl2nc.c -o _convert -I/usr/include/hdf5/serial -L/usr/include/x86_64-linux-gnu -lnetcdf -lhdf5_serial

    # input and output directories will be Data Pipeline staging dir env vars
    ENTRYPOINT ["/work/process.py"]
    CMD ["input", "output"]

    When this Dockerfile is built, docker will first use the latest cumulus-base image. It will then copy the entire GitHub repository (the processing required for a single data collection is a repository) to the /work directory which will now contain all the code necessary to process this data. In this case, a C file is compiled to convert the supplied hdf5 files to NetCDF files. Note that this also requires installing the system libraries nco and libhdf5-dev via apt-get. Lastly, the Dockerfile sets the entrypoint to the processing handler, so that this command is run when the image is run. It expects two arguments to be handed to it: 'input' and 'output' meaning the input and output directories.

    Process Handler

    All of the processing is managed through a handler, which is called when the docker image is run. Currently, Python is used for the process handler, which provides a simple interface to perform validation, run shell commands, test the output generated, and log the output for us. The handler function takes two arguments: input directory and output directory. Any other needed parameters are set via environment variables. The handler function will process the input directory, and put any output to be saved in the output directory.

    Py-cumulus

    The py-cumulus library provides some helper functions that can be used for logging, writing metadata, and testing. Py-cumulus is installed in the docker-base image. Currently, there are three modules:

    import cumulus.logutils
    import cumulus.metadata
    import cumulus.process

    Example process handler

    An example process handler is given here, in this case a shortened version of the hs3-cpl data collection. The main function at the bottom passes the provided input and output directory arguments to the process() function. The first thing process() does is to get the Cumulus logger. The Cumulus logger will send output to both stdout and Splunk, to be used in the Cumulus pipeline. Log strings are made using the make_log_string() function which properly formats a message to be handled by Splunk.

    #!/usr/bin/env python

    import os
    import sys
    import glob
    import re
    import datetime
    import subprocess
    from cumulus.logutils import get_logger, make_log_string
    from cumulus.metadata import write_metadata
    from cumulus.process import check_output

    # the main process handler
def process(indir, outdir):
    """ Process this directory """
    log = get_logger()
    log.info(
        make_log_string(process='processing', message="Processing %s into %s" % (indir, outdir))
    )

    dataname = 'cpl'
    dataid = os.getenv('SHORT_NAME', 'hs3cpl')

    for f in glob.glob(os.path.join(indir, '*.hdf5')):
        bname = os.path.basename(f)
        log.info(
            make_log_string(granule_id=bname, process='processing', message="Processing started for %s" % bname)
        )

        # convert file to netcdf
        cmd = ['/work/_convert', f, outdir]
        out = subprocess.check_output(cmd)
        fout = glob.glob(os.path.join(outdir, 'HS3_%s*.nc' % bname[0:7]))
        fout = '' if len(fout) == 0 else fout[0]
        check_output(fout)
        cmd = ['ncatted -h -a Conventions,global,c,c,"CF-1.6" %s' % fout]
        out = subprocess.check_output(cmd, shell=True)
        log.debug(out)

        # write metadata output
        write_metadata(fout, dataname=dataname, dataid=dataid, outdir=outdir)

    # remove the generated metadata files
    for f in glob.glob(os.path.join(outdir, '*.met')):
        os.remove(f)


if __name__ == "__main__":
    indir = sys.argv[1]
    outdir = sys.argv[2]
    process(indir, outdir)

After setting up logging, the code has a for-loop for processing any matching hdf5 file in the input directory:

    1. Convert to NetCDF with a C script
    2. Validate the output (in this case just check for existence)
    3. Use 'ncatted' to update the resulting file to be CF-compliant
    4. Write out metadata generated for this file

    Process Testing

    It is important to have tests for data processing, however in many cases datafiles can be large so it is not practical to store the test data in the repository. Instead, test data is currently stored on AWS S3, and can be retrieved using the AWS CLI.

    aws s3 sync s3://cumulus-ghrc-logs/sample-data/collection-name data

    Where collection-name is the name of the data collection, such as 'avaps', or 'cpl'. For example, an abridged version of the data for CPL includes:

    ├── cpl
    │   ├── input
    │   │   ├── HS3_CPL_ATB_12203a_20120906.hdf5
    │   │   ├── HS3_CPL_OP_12203a_20120906.hdf5
│   └── output
│       ├── HS3_CPL_ATB_12203a_20120906.nc
│       ├── HS3_CPL_ATB_12203a_20120906.nc.meta.xml
│       ├── HS3_CPL_OP_12203a_20120906.nc
│       └── HS3_CPL_OP_12203a_20120906.nc.meta.xml

    Contained in the input directory are all possible sets of data files, while the output directory is the expected result of processing. In this case the hdf5 files are converted to NetCDF files and XML metadata files are generated.

    The docker image for a process can be used on the retrieved test data. First create a test-output directory in the newly created data directory.

    mkdir data/test-output

    Then run the docker image using docker-compose.

    docker-compose run test

This will process the data in the data/input directory and put the output into data/test-output. Repositories also include Python-based tests which validate this newly created output against the contents of data/output. Use Python's Nose tool to run the included tests.

    nosetests

If the data/test-output directory validates against the contents of data/output, the tests will be successful; otherwise an error will be reported.
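As a rough sketch of what such a comparison test might look like (the directory layout follows the example above; the specific assertions in each repository's tests will differ), a minimal nose-discoverable test module could be:

import os
import glob

EXPECTED_DIR = 'data/output'
ACTUAL_DIR = 'data/test-output'


def test_output_files_exist():
    """ Every expected output file should have been generated by the processing run """
    for expected in glob.glob(os.path.join(EXPECTED_DIR, '*')):
        generated = os.path.join(ACTUAL_DIR, os.path.basename(expected))
        assert os.path.exists(generated), "Missing output file: %s" % generated


def test_output_files_not_empty():
    """ Generated files should not be empty """
    for generated in glob.glob(os.path.join(ACTUAL_DIR, '*')):
        assert os.path.getsize(generated) > 0, "Empty output file: %s" % generated

Running nosetests from the repository root will discover and run these test functions.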

    - + \ No newline at end of file diff --git a/docs/next/workflows/index.html b/docs/next/workflows/index.html index 21ab6616f63..e43e52a6774 100644 --- a/docs/next/workflows/index.html +++ b/docs/next/workflows/index.html @@ -5,13 +5,13 @@ Workflows Overview | Cumulus Documentation - +
    Version: Next

    Workflows Overview

Workflows are composed of one or more AWS Lambda Functions and ECS Activities to discover, ingest, process, manage and archive data.

    Provider data ingest and GIBS have a set of common needs in getting data from a source system and into the cloud where they can be distributed to end users. These common needs are:

    • Data Discovery - Crawling, polling, or detecting changes from a variety of sources.
    • Data Transformation - Taking data files in their original format and extracting and transforming them into another desired format such as visible browse images.
    • Archival - Storage of the files in a location that's accessible to end users.

The high-level view of the architecture and many of the individual steps are the same, but the details of ingesting each type of collection differ. Different collection types and different providers have different needs. Not only are the individual boxes of a workflow different; the branching, error handling, and multiplicity of the arrows connecting the boxes also differ. Some need visible images rendered from component data files from multiple collections. Some need to contact the CMR with updated metadata. Some will have different retry strategies to handle availability issues with source data systems.

    AWS and other cloud vendors provide an ideal solution for parts of these problems but there needs to be a higher level solution to allow the composition of AWS components into a full featured solution. The Ingest Workflow Architecture is designed to meet the needs for Earth Science data ingest and transformation.

    Goals

    Flexibility and Composability

The steps to ingest and process data are different for each collection within a provider. Ingest should be as flexible as possible in the rearranging of steps and configuration.

    We want to use lego-like individual steps that can be composed by an operator.

    Individual steps should ...

    • Be as ignorant as possible of the overall flow. They should not be aware of previous steps.
    • Be runnable on their own.
    • Define their input and output in simple data structures.
    • Be domain agnostic.
• Not make assumptions about specifics, such as what goes into a granule.

    Scalable

The ingest architecture needs to be scalable, both to handle ingesting hundreds of millions of granules and to interpret dozens of different workflows.

    Data Provenance

    • We should have traceability for how data was produced and where it comes from.
    • Use immutable representations of data. Data once received is not overwritten. Data can be removed for cleanup.
    • All software is versioned. We can trace transformation of data by tracking the immutable source data and the versioned software applied to it.

    Operator Visibility and Control

    • Operators should be able to see and understand everything that is happening in the system.
    • It should be obvious why things are happening and straightforward to diagnose problems.
• We generally assume that the operators know best in terms of the limits on a provider's infrastructure, how often things need to be done, and details of a collection. The architecture should defer to their decisions and knowledge while providing safety nets to prevent problems.

    A Reconfigurable Workflow Architecture

    The Ingest Workflow Architecture is defined by two entity types, Workflows and Tasks. A Workflow is a set of composed Tasks to complete an objective such as ingesting a granule. Tasks are the individual steps of a Workflow that perform one job. The workflow is responsible for executing the right task based on the current state and response from the last task executed. Tasks are completely decoupled in that they don't call each other or even need to know about the presence of other tasks.

    Workflows and tasks are configured as Terraform resources, which are triggered via configured rules within Cumulus.

    Diagram showing the Step Function execution path through workflow tasks for a collection ingest

    See the Example GIBS Ingest Architecture showing how workflows and tasks are used to define the GIBS Ingest Architecture.

    Workflows

    A workflow is a provider-configured set of steps that describe the process to ingest data. Workflows are defined using AWS Step Functions.

    Benefits of AWS Step Functions

AWS Step Functions are described in detail in the AWS documentation, but they provide several benefits which are applicable to this architecture.

    • Prebuilt solution
    • Operations Visibility
      • Visual diagram
      • Every execution is recorded with both inputs and output for every step.
    • Composability
      • Allow composing AWS Lambdas and code running in other steps. Code can be run in EC2 to interface with it or even on premise if desired.
      • Step functions allow specifying when steps run in parallel or choices between steps based on data from the previous step.
    • Flexibility
  • Step Functions are designed to make it easy to build new applications and reconfigure them. We're exposing that flexibility directly to the provider.
    • Reliability and Error Handling
      • Step functions allow configuration of retries and adding handling of error conditions.
    • Described via data
      • This makes it easy to save the step function in configuration management solutions.
      • We can build simple interfaces on top of the flexibility provided.

    Workflow Scheduler

    The scheduler is responsible for initiating a step function and passing in the relevant data for a collection. This is currently configured as an interval for each collection. The scheduler service creates the initial event by combining the collection configuration with the AWS execution context defined via the cumulus terraform module.

    Tasks

    A workflow is composed of tasks. Each task is responsible for performing a discrete step of the ingest process. These can be activities like:

    • Crawling a provider website for new data.
    • Uploading data from a provider to S3.
    • Executing a process to transform data.

    AWS Step Functions permit tasks to be code running anywhere, even on premise. We expect most tasks will be written as Lambda functions in order to take advantage of the easy deployment, scalability, and cost benefits provided by AWS Lambda.

    • Leverages Existing Work
      • The design leverages the existing work of Amazon by defining workflows using the AWS Step Function State Language. This is the language that was created for describing the state machines used in AWS Step Functions.
    • Open for Extension
  • Both meta and task_config, which are used for configuration at the collection and task levels, do not dictate the fields and structure of the configuration. Additional task-specific JSON schemas can be used for extending the validation of individual steps.
    • Data-centric Configuration
      • The use of a single JSON configuration file allows this to be added to a workflow. We build additional support on top of the configuration file for simpler domain specific configuration or interactive GUIs.

    For more details on Task Messages and Configuration, visit Cumulus configuration and message protocol documentation.

    Ingest Deploy

    To view deployment documentation, please see the Cumulus deployment documentation.

Tradeoffs and Benefits

    This section documents various tradeoffs and benefits of the Ingest Workflow Architecture.

    Tradeoffs

    Workflow execution is handled completely by AWS

This means we can't add our own code into the orchestration of the workflow. We can't add new features not supported by Step Functions. We can't do things like enforce that the responses from tasks always conform to a schema or extract the configuration for a task ahead of its execution.

If we implemented our own orchestration we'd be able to add all of these. We save significant amounts of development effort and gain all the features of Step Functions for this trade off. One workaround is providing a library of common task capabilities. These would optionally be available to tasks that can be implemented with Node.js and are able to include the library.

    Workflow Configuration is specified in AWS Step Function States Language

The current design combines the states language defined by AWS with Ingest-specific configuration. This means our representation has a tight coupling with their standard. If they make backwards-incompatible changes in the future, we will have to deal with existing projects written against that.

We avoid having to develop our own standard and code to process it. The design can support new features in AWS Step Functions without needing changes to the Ingest library code. It is unlikely they will make a backwards-incompatible change at this point. One mitigation for this is writing data transformations to a new format if that were to happen.

    Collection Configuration Flexibility vs Complexity

The Collections Configuration File is very flexible but requires more knowledge of AWS Step Functions to configure. A person modifying this file directly would need to be comfortable editing a JSON file and configuring AWS Step Functions state transitions which address AWS resources.

The configuration file itself is not necessarily meant to be edited by a human directly. Since we are developing a reconfigurable, composable architecture that is specified entirely in data, additional tools can be developed on top of it. The existing recipes.json files can be mapped to this format. Operational tools like a GUI can be built that provide a usable interface for customizing workflows, but it will take time to develop these tools.

    Benefits

    This section describes benefits of the Ingest Workflow Architecture.

    Simplicity

    The concepts of Workflows and Tasks are simple ones that should make sense to providers. Additionally, the implementation will only consist of a few components because the design leverages existing services and capabilities of AWS. The Ingest implementation will only consist of some reusable task code to make task implementation easier, Ingest deployment, and the Workflow Scheduler.

    Composability

The design aims to satisfy the needs of ingest by integrating different workflows for providers. It's flexible in terms of the ability to arrange tasks to meet the needs of a collection. Providers have developed and incorporated open source tools over the years. All of these are easily integrable into the workflows as tasks.

    There is low coupling between task steps. Failures of one component don't bring the whole system down. Individual tasks can be deployed separately.

    Scalability

AWS Step Functions scale up as needed and aren't limited by a set number of servers. They also easily allow you to leverage the inherent scalability of serverless functions.

    Monitoring and Auditing

    • Every execution is captured.
    • Every task run has captured input and outputs.
• CloudWatch Metrics can be used for monitoring many of the events within Step Functions. It can also generate alarms for the whole process.
    • Visual report of the entire configuration.
      • Errors and success states are highlighted visually in the flow.

    Data Provenance

    • Monitoring and auditing ensures we know the data that was given to a task.
    • Workflows are versioned and the state machines stored in AWS Step Functions are immutable. Once created they cannot change.
    • Versioning of data in S3 or using immutable records in S3 will mean we always know what data was created as the result of a step or fed into a step.

    Appendix

    Example GIBS Ingest Architecture

    This shows the GIBS Ingest Architecture as an example of the use of the Ingest Workflow Architecture.

    • The GIBS Ingest Architecture consists of two workflows per collection type. There is one for discovery and one for ingest. The final stage of discovery triggers multiple ingest workflows for each MRF granule that needs to be generated.
    • It demonstrates both lambdas as tasks and a container used for MRF generation.

    GIBS Ingest Workflows

    Diagram showing the AWS Step Function execution path for a GIBS ingest workflow

    GIBS Ingest Granules Workflow

This shows a visualization of an execution of the ingest granules workflow in Step Functions. The steps highlighted in green are the ones that executed and completed successfully.

    Diagram showing the AWS Step Function execution path for a GIBS ingest granules workflow

    - + \ No newline at end of file diff --git a/docs/next/workflows/input_output/index.html b/docs/next/workflows/input_output/index.html index a3c20b0262d..4658a1f4446 100644 --- a/docs/next/workflows/input_output/index.html +++ b/docs/next/workflows/input_output/index.html @@ -5,14 +5,14 @@ Workflow Inputs & Outputs | Cumulus Documentation - +
    Version: Next

    Workflow Inputs & Outputs

    General Structure

    Cumulus uses a common format for all inputs and outputs to workflows. The same format is used for input and output from workflow steps. The common format consists of a JSON object which holds all necessary information about the task execution and AWS environment. Tasks return objects identical in format to their input with the exception of a task-specific payload field. Tasks may also augment their execution metadata.

    Cumulus Message Adapter

    The Cumulus Message Adapter and Cumulus Message Adapter libraries help task developers integrate their tasks into a Cumulus workflow. These libraries adapt input and outputs from tasks into the Cumulus Message format. The Scheduler service creates the initial event message by combining the collection configuration, external resource configuration, workflow configuration, and deployment environment settings. The subsequent workflow messages between tasks must conform to the message schema. By using the Cumulus Message Adapter, individual task Lambda functions only receive the input and output specifically configured for the task, and not non-task-related message fields.

    The Cumulus Message Adapter libraries are called by the tasks with a callback function containing the business logic of the task as a parameter. They first adapt the incoming message to a format more easily consumable by Cumulus tasks, then invoke the task, and then adapt the task response back to the Cumulus message protocol to be sent to the next task.
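As an illustration, with the Python client library (cumulus-message-adapter-python) this callback pattern looks roughly like the sketch below; the task body and the hello/goodbye field names are purely illustrative:

from run_cumulus_task import run_cumulus_task


def task(event, context):
    """ Business logic only: receives the adapted 'input' and 'config' keys """
    greeting = event['input'].get('hello', 'world')  # illustrative field
    return {'goodbye': greeting}                     # becomes the next payload


def handler(event, context):
    """ Lambda entry point: the adapter reformats the message, runs the task, and adapts the response """
    return run_cumulus_task(task, event, context)

The wrapper handles message adaptation on both sides, so the task function only ever sees the resolved input and config.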

    A task's Lambda function can be configured to include a Cumulus Message Adapter library which constructs input/output messages and resolves task configurations. The CMA can then be included in one of several ways:

    Lambda Layer

    In order to make use of this configuration, a Lambda layer must be uploaded to your account. Due to platform restrictions, Core cannot currently support sharable public layers, however you can deploy the appropriate version from the release page in two ways:

    Once you've deployed the layer, integrate the CMA layer with your Lambdas:

    • If using the cumulus module, set the cumulus_message_adapter_lambda_layer_version_arn in your .tfvars file to integrate the CMA layer with all core Cumulus lambdas.
    • If including your own Lambda or ECS task Terraform modules, specify the CMA layer ARN in the Terraform resource definitions. Also, make sure to set the CUMULUS_MESSAGE_ADAPTER_DIR environment variable for the task to /opt for the CMA integration to work properly.

    In the future if you wish to update/change the CMA version you will need to update the deployed CMA, and update the layer configuration for the impacted Lambdas as needed.

    note

    Updating/removing a layer does not change a deployed Lambda, so to update the CMA you should deploy a new version of the CMA layer, update the associated Lambda configuration to reference the new CMA version, and re-deploy your Lambdas.

    Manual Addition

You can include the CMA package in the Lambda code, in the cumulus-message-adapter sub-directory of your Lambda .zip, for any Lambda runtime that includes a Python runtime. Python 2 is included in Lambda runtimes that use Amazon Linux; however, Amazon Linux 2 will not support this directly.

    python runtime

    It is expected that upcoming Cumulus releases will update the CMA layer to include a python runtime.

    If you are manually adding the message adapter to your source and utilizing the CMA, you should set the Lambda's CUMULUS_MESSAGE_ADAPTER_DIR environment variable to target the installation path for the CMA.

    CMA Input/Output

Input to the task application code is a JSON object with the following keys:

    • input: By default, the incoming payload is the payload output from the previous task, or it can be a portion of the payload as configured for the task in the corresponding .tf workflow definition file.
    • config: Task-specific configuration object with URL templates resolved.

Output from the task application code is returned and placed in the payload key by default, but the config key can also be used to return just a portion of the task output.

    CMA configuration

    As of Cumulus > 1.15 and CMA > v1.1.1, configuration of the CMA is expected to be driven by AWS Step Function Parameters.

    Using the CMA package with the Lambda by any of the above mentioned methods (Lambda Layers, manual) requires configuration for its various features via a specific Step Function Parameters configuration format (see sample workflows in the examples cumulus-tf source for more examples):

{
  "cma": {
    "event.$": "$",
    "ReplaceConfig": "{some config}",
    "task_config": "{some config}"
  }
}

    The "event.$": "$" parameter is required as it passes the entire incoming message to the CMA client library for parsing, and the CMA itself to convert the incoming message into a Cumulus message for use in the function.

    The following are the CMA's current configuration settings:

    ReplaceConfig (Cumulus Remote Message)

Because of the potential size of a Cumulus message, mainly the payload field, a task can be set via configuration to store a portion of its output on S3, with a message key (Remote Message) that defines how to retrieve it and an empty JSON object {} in its place. If the portion of the message targeted exceeds the configured MaxSize (defaults to 0 bytes) it will be written to S3.

    The CMA remote message functionality can be configured using parameters in several ways:

    Partial Message

    Setting the Path/Target path in the ReplaceConfig parameter (and optionally a non-default MaxSize)

{
  "DiscoverGranules": {
    "Parameters": {
      "cma": {
        "event.$": "$",
        "ReplaceConfig": {
          "MaxSize": 1,
          "Path": "$.payload",
          "TargetPath": "$.payload"
        }
      }
    }
  }
}

will result in any payload output larger than the MaxSize (in bytes) being written to S3. The CMA will then mark that the key has been replaced via a replace key on the event. When the CMA picks up the replace key in future steps, it will attempt to retrieve the output from S3 and write it back to payload.

Note that you can optionally use a different TargetPath than Path; however, as the target is a JSON path, there must be a key to target for replacement in the output of that step. Also note that the JSON path specified must target one node, otherwise the CMA will error, as it does not support multiple replacement targets.

    If TargetPath is omitted, it will default to the value for Path.
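Conceptually, the Partial Message case above behaves like the following sketch (this is not the CMA's actual implementation; the bucket and key layout are placeholders, and Path/TargetPath are fixed to $.payload for brevity):

import json
import uuid

import boto3


def maybe_store_remote(message, max_size, bucket):
    """ If the payload exceeds max_size bytes, write it to S3 and leave an
        empty object plus a 'replace' key behind for later retrieval. """
    serialized = json.dumps(message.get('payload', {}))
    if len(serialized.encode('utf-8')) <= max_size:
        return message  # under the limit, leave the message untouched

    key = 'events/%s' % uuid.uuid4()
    boto3.client('s3').put_object(Bucket=bucket, Key=key, Body=serialized)
    message['payload'] = {}
    message['replace'] = {'Bucket': bucket, 'Key': key, 'TargetPath': '$.payload'}
    return message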

    Full Message

    Setting the following parameters for a lambda:

DiscoverGranules:
  Parameters:
    cma:
      event.$: '$'
      ReplaceConfig:
        FullMessage: true

    will result in the CMA assuming the entire inbound message should be stored to S3 if it exceeds the default max size.

    This is effectively the same as doing:

{
  "DiscoverGranules": {
    "Parameters": {
      "cma": {
        "event.$": "$",
        "ReplaceConfig": {
          "MaxSize": 0,
          "Path": "$",
          "TargetPath": "$"
        }
      }
    }
  }
}

    Cumulus Message example

{
  "task_config": {
    "inlinestr": "prefix{meta.foo}suffix",
    "array": "{[$.meta.foo]}",
    "object": "{$.meta}"
  },
  "cumulus_meta": {
    "message_source": "sfn",
    "state_machine": "arn:aws:states:us-east-1:1234:stateMachine:MySfn",
    "execution_name": "MyExecution__id-1234",
    "id": "id-1234"
  },
  "meta": {
    "foo": "bar"
  },
  "payload": {
    "anykey": "anyvalue"
  }
}

    Cumulus Remote Message example

    The message may contain a reference to an S3 Bucket, Key and TargetPath as follows:

{
  "replace": {
    "Bucket": "cumulus-bucket",
    "Key": "my-large-event.json",
    "TargetPath": "$"
  },
  "cumulus_meta": {}
}

    task_config

This configuration key contains the input/output configuration values for definition of inputs/outputs via URL paths. Important: These values are all relative to the JSON object configured for event.$.

    This configuration's behavior is outlined in the CMA step description below.

    The configuration should follow the format:

{
  "FunctionName": {
    "Parameters": {
      "cma": {
        "event.$": "$",
        "other_cma_configuration": "<config object>",
        "task_config": "<task config>"
      }
    }
  }
}

    Example:

{
  "StepFunction": {
    "Parameters": {
      "cma": {
        "event.$": "$",
        "task_config": {
          "sfnEnd": true,
          "stack": "{$.meta.stack}",
          "bucket": "{$.meta.buckets.internal.name}",
          "stateMachine": "{$.cumulus_meta.state_machine}",
          "executionName": "{$.cumulus_meta.execution_name}",
          "cumulus_message": {
            "input": "{$}"
          }
        }
      }
    }
  }
}

    Cumulus Message Adapter Steps

    1. Reformat AWS Step Function message into Cumulus Message

    Due to the way AWS handles Parameterized messages, when Parameters are used the CMA takes an inbound message:

{
  "resource": "arn:aws:lambda:us-east-1:<lambda arn values>",
  "input": {
    "Other Parameter": {},
    "cma": {
      "ConfigKey": {
        "config values": "some config values"
      },
      "event": {
        "cumulus_meta": {},
        "payload": {},
        "meta": {},
        "exception": {}
      }
    }
  }
}

    and takes the following actions:

    • Takes the object at input.cma.event and makes it the full input
    • Merges all of the keys except event under input.cma into the parent input object

This results in the incoming message (presumably a Cumulus message), with any cma configuration parameters merged in, being passed to the CMA. All other parameterized values defined outside of the cma key are ignored.
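A rough sketch of that unwrapping step (illustrative only, not the CMA source):

def unwrap_cma_parameters(step_input):
    """ Promote input.cma.event to be the full message and merge the remaining
        cma keys (e.g. ReplaceConfig, task_config) into it; everything outside
        the 'cma' key is ignored. """
    cma = step_input.get('cma', {})
    message = dict(cma.get('event', {}))
    for key, value in cma.items():
        if key != 'event':
            message[key] = value
    return message


# e.g. unwrap_cma_parameters({'cma': {'event': {'payload': {}}, 'task_config': {'foo': 'bar'}}})
# -> {'payload': {}, 'task_config': {'foo': 'bar'}}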

    2. Resolve Remote Messages

If the incoming Cumulus message has a replace key value, the CMA will attempt to pull the payload from S3.

For example, if the incoming message contains the following:

      "meta": {
    "foo": {}
    },
    "replace": {
    "TargetPath": "$.meta.foo",
    "Bucket": "some_bucket",
    "Key": "events/some-event-id"
    }

    The CMA will attempt to pull the file stored at Bucket/Key and replace the value at TargetPath, then remove the replace object entirely and continue.
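A simplified sketch of that retrieval step (supporting only simple $.a.b style TargetPaths; the real CMA supports full JSONPath):

import json

import boto3


def resolve_remote_message(message):
    """ If a 'replace' key is present, pull the stored object from S3,
        write it back at TargetPath, and drop the 'replace' key. """
    replace = message.pop('replace', None)
    if replace is None:
        return message

    body = boto3.client('s3').get_object(
        Bucket=replace['Bucket'], Key=replace['Key'])['Body'].read()
    value = json.loads(body)

    parts = [p for p in replace['TargetPath'].lstrip('$').split('.') if p]
    if not parts:
        return value  # a TargetPath of '$' replaces the whole message

    target = message
    for part in parts[:-1]:
        target = target.setdefault(part, {})
    target[parts[-1]] = value
    return message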

    3. Resolve URL templates in the task configuration

In the workflow configuration (defined under the task_config key), each task has its own configuration, and it can use a URL template as a value to achieve simplicity or for values only available at execution time. The Cumulus Message Adapter resolves the URL templates (relative to the event configuration key) and then passes the message to the next task. For example, given a task which has the following configuration:

{
  "Parameters": {
    "cma": {
      "event.$": "$",
      "task_config": {
        "provider": "{$.meta.provider}",
        "inlinestr": "prefix{meta.foo}suffix",
        "array": "{[$.meta.foo]}",
        "object": "{$.meta}"
      }
    }
  }
}

and an incoming message that contains:

{
  "meta": {
    "foo": "bar",
    "provider": {
      "id": "FOO_DAAC",
      "anykey": "anyvalue"
    }
  }
}

    The corresponding Cumulus Message would contain:

    "meta": {
    "foo": "bar",
    "provider": {
    "id": "FOO_DAAC",
    "anykey": "anyvalue"
    }
    },
    "task_config": {
    "provider": "{$.meta.provider}",
    "inlinestr": "prefix{meta.foo}suffix",
    "array": "{[$.meta.foo]}",
    "object": "{$.meta}"
    }

    The message sent to the task would be:

    "config" : {
    "provider": {
    "id": "FOO_DAAC",
    "anykey": "anyvalue"
    },
    "inlinestr": "prefixbarsuffix",
    "array": ["bar"],
    "object": {
    "foo": "bar",
    "provider": {
    "id": "FOO_DAAC",
    "anykey": "anyvalue"
    }
    },
    },
    "input": "{...}"

    URL template variables replace dotted paths inside curly brackets with their corresponding value. If the Cumulus Message Adapter cannot resolve a value, it will ignore the template, leaving it verbatim in the string. While seemingly complex, this allows significant decoupling of Tasks from one another and the data that drives them. Tasks are able to easily receive runtime configuration produced by previously run tasks and domain data.
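A simplified illustration of that substitution behavior (handling only the plain {a.b} and {$.a.b} string forms, not the {[...]} array or whole-object forms):

import re


def resolve_url_templates(config, event):
    """ Replace {dotted.path} templates with values from the event;
        unresolvable templates are left verbatim. """
    def lookup(path):
        node = event
        for part in path.lstrip('$').lstrip('.').split('.'):
            if not isinstance(node, dict) or part not in node:
                return None
            node = node[part]
        return node

    def substitute(match):
        value = lookup(match.group(1))
        return str(value) if isinstance(value, (str, int, float)) else match.group(0)

    return {key: re.sub(r'\{([^{}\[\]]+)\}', substitute, value) if isinstance(value, str) else value
            for key, value in config.items()}


# e.g. resolve_url_templates({'inlinestr': 'prefix{meta.foo}suffix'}, {'meta': {'foo': 'bar'}})
# -> {'inlinestr': 'prefixbarsuffix'}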

    4. Resolve task input

By default, the incoming payload is the payload from the previous task. The task can also be configured to use a portion of the payload as its input message. For example, given that a task specifies cma.task_config.cumulus_message.input:

ExampleTask:
  Parameters:
    cma:
      event.$: '$'
      task_config:
        cumulus_message:
          input: '{$.payload.foo}'

    The task configuration in the message would be:

{
  "task_config": {
    "cumulus_message": {
      "input": "{$.payload.foo}"
    }
  },
  "payload": {
    "foo": {
      "anykey": "anyvalue"
    }
  }
}

The Cumulus Message Adapter will resolve the task input; instead of sending the whole payload as task input, the task input would be:

{
  "input": {
    "anykey": "anyvalue"
  },
  "config": {...}
}

    5. Resolve task output

By default, the task's return value is the next payload. However, the workflow task configuration can specify a portion of the return value as the next payload, and can also augment values to other fields. Based on the task configuration under cma.task_config.cumulus_message.outputs, the Message Adapter uses a task's return value to output a message as configured by the task-specific config defined under cma.task_config. The Message Adapter dispatches a "source" to a "destination" as defined by URL templates stored in the task-specific cumulus_message.outputs. The value of the task's return value at the "source" URL is used to create or replace the value of the task's return value at the "destination" URL. For example, given that a task specifies cumulus_message.outputs in its workflow configuration as follows:

{
  "ExampleTask": {
    "Parameters": {
      "cma": {
        "event.$": "$",
        "task_config": {
          "cumulus_message": {
            "outputs": [
              {
                "source": "{$}",
                "destination": "{$.payload}"
              },
              {
                "source": "{$.output.anykey}",
                "destination": "{$.meta.baz}"
              }
            ]
          }
        }
      }
    }
  }
}

    The corresponding Cumulus Message would be:

{
  "task_config": {
    "cumulus_message": {
      "outputs": [
        {
          "source": "{$}",
          "destination": "{$.payload}"
        },
        {
          "source": "{$.output.anykey}",
          "destination": "{$.meta.baz}"
        }
      ]
    }
  },
  "meta": {
    "foo": "bar"
  },
  "payload": {
    "anykey": "anyvalue"
  }
}

    Given the response from the task is:

{
  "output": {
    "anykey": "boo"
  }
}

    The Cumulus Message Adapter would output the following Cumulus Message:

{
  "task_config": {
    "cumulus_message": {
      "outputs": [
        {
          "source": "{$}",
          "destination": "{$.payload}"
        },
        {
          "source": "{$.output.anykey}",
          "destination": "{$.meta.baz}"
        }
      ]
    }
  },
  "meta": {
    "foo": "bar",
    "baz": "boo"
  },
  "payload": {
    "output": {
      "anykey": "boo"
    }
  }
}

    6. Apply Remote Message Configuration

    If the ReplaceConfig configuration parameter is defined, the CMA will evaluate the configuration options provided, and if required write a portion of the Cumulus Message to S3, and add a replace key to the message for future steps to utilize.

    note

The non-user-modifiable field cumulus_meta will always be retained, regardless of the configuration.

For example, if the output message (post output configuration) of a Cumulus task looks like:

{
  "cumulus_meta": {
    "some_key": "some_value"
  },
  "ReplaceConfig": {
    "FullMessage": true
  },
  "task_config": {
    "cumulus_message": {
      "outputs": [
        {
          "source": "{$}",
          "destination": "{$.payload}"
        },
        {
          "source": "{$.output.anykey}",
          "destination": "{$.meta.baz}"
        }
      ]
    }
  },
  "meta": {
    "foo": "bar",
    "baz": "boo"
  },
  "payload": {
    "output": {
      "anykey": "boo"
    }
  }
}

    the resultant output would look like:

{
  "cumulus_meta": {
    "some_key": "some_value"
  },
  "replace": {
    "TargetPath": "$",
    "Bucket": "some-internal-bucket",
    "Key": "events/some-event-id"
  }
}

    Additional features

    Validate task input, output and configuration messages against the schemas provided

    The Cumulus Message Adapter has the capability to validate task input, output and configuration messages against their schemas. The default location of the schemas is the schemas folder in the top level of the task and the default filenames are input.json, output.json, and config.json. The task can also configure a different schema location. If no schema can be found, the Cumulus Message Adapter will not validate the messages.
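For example, a task could perform an equivalent check itself with the jsonschema package; the helper below assumes the default schemas folder and file names described above:

import json
import os

from jsonschema import validate


def validate_against_schema(message, schema_name, schema_dir='schemas'):
    """ Validate a task input/output/config message against its schema,
        skipping validation when no schema file is present. """
    schema_path = os.path.join(schema_dir, '%s.json' % schema_name)
    if not os.path.exists(schema_path):
        return  # no schema found, nothing to validate
    with open(schema_path) as schema_file:
        validate(instance=message, schema=json.load(schema_file))


# e.g. validate_against_schema(event['input'], 'input')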

    - + \ No newline at end of file diff --git a/docs/next/workflows/lambda/index.html b/docs/next/workflows/lambda/index.html index 42ac70609bc..92e22a387ec 100644 --- a/docs/next/workflows/lambda/index.html +++ b/docs/next/workflows/lambda/index.html @@ -5,13 +5,13 @@ Develop Lambda Functions | Cumulus Documentation - +
    Version: Next

    Develop Lambda Functions

    Develop a new Cumulus Lambda

AWS provides a great getting started guide for building Lambdas in the developer guide.

    Cumulus currently supports the following environments for Cumulus Message Adapter enabled functions:

Additionally, you may choose to include any of the other languages AWS supports as a resource, with reduced feature support.

    Deploy a Lambda

    Node.js Lambda

For a new Node.js Lambda, create a new function and add an aws_lambda_function resource to your Cumulus deployment (for examples, see example/lambdas.tf and ingest/lambda-functions.tf in the source), either as a new .tf file or added to an existing .tf file:

    resource "aws_lambda_function" "myfunction" {
    function_name = "${var.prefix}-function"
    filename = "/path/to/zip/lambda.zip"
    source_code_hash = filebase64sha256("/path/to/zip/lambda.zip")
    handler = "index.handler"
    role = module.cumulus.lambda_processing_role_arn
    runtime = "nodejs10.x"

    vpc_config {
    subnet_ids = var.subnet_ids
    security_group_ids = var.security_group_ids
    }
    }
    configuration example

    This example contains the minimum set of required configuration.

    Make sure to include a vpc_config that matches the information you've provided the cumulus module if intending to integrate the lambda with a Cumulus deployment.

    Java Lambda

    Java Lambdas are created in much the same way as the Node.js example above.

    The source points to a folder with the compiled .class files and dependency libraries in the Lambda Java zip folder structure (details here), not an uber-jar.

    The deploy folder referenced here would contain a folder 'test_task/task/' which contains Task.class and TaskLogic.class as well as a lib folder containing dependency jars.

    Python Lambda

    Python Lambdas are created the same way as the Node.js example above.

    Cumulus Message Adapter

For Lambdas wishing to utilize the Cumulus Message Adapter (CMA), you should define a layers key on your Lambda resource with the CMA you wish to include. See the input_output docs for more on how to create/use the CMA.

    Other Lambda Options

    Cumulus supports all of the options available to you via the aws_lambda_function Terraform resource. For more information on what's available, check out the Terraform resource docs.

    Cloudwatch log groups

If you want to enable Cloudwatch logging for your Lambda resource, you'll need to add an aws_cloudwatch_log_group resource to your Lambda definition:

    resource "aws_cloudwatch_log_group" "myfunction_log_group" {
    name = "/aws/lambda/${aws_lambda_function.myfunction.function_name}"
    retention_in_days = 30
    tags = { Deployment = var.prefix }
    }
    - + \ No newline at end of file diff --git a/docs/next/workflows/message_granule_writes/index.html b/docs/next/workflows/message_granule_writes/index.html index 5f828aefb65..c8b81bbc2c7 100644 --- a/docs/next/workflows/message_granule_writes/index.html +++ b/docs/next/workflows/message_granule_writes/index.html @@ -5,13 +5,13 @@ Workflow Message Granule Writes | Cumulus Documentation - +
    Version: Next

    Workflow Message Granule Writes

    Overview

When an AWS Step Function event occurs for a Cumulus workflow, or a write is attempted via the sf-sqs-report task, a message is dispatched to the sfEventSqsToDbRecordsInputQueue for processing.

Messages on the sfEventSqsToDbRecordsInputQueue (which correspond to lambda invocations or workflow events) are processed in batches of 10, and the sfEventSqsToDbRecords Lambda is triggered for each batch. A write is attempted for the corresponding execution/PDR, then writes are attempted for the granule records associated with the message.

    For each granule in the batch of granules one of the following occurs:

    • The granule is written successfully.
    • The granule write is dropped, due to asynchronous write constraints.
• The lambda fails to write the granule in an unexpected way (e.g. lambda failure, AWS outage, etc.). In this case, the granule will become visible again after the sfEventSqsToDbRecordsInputQueue visibility timeout, currently set as a function of the rds_connection_timing_configuration terraform variable: (var.rds_connection_timing_configuration.acquireTimeoutMillis / 1000) + 60 seconds (a worked example follows this list).
    • The granule fails to write due to a schema violation, database connection issue or other expected/caught error. The message is immediately written to the Dead Letter Archive for manual intervention/investigation.
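As a worked example of the visibility timeout calculation referenced above (the acquireTimeoutMillis value below is illustrative, not a default):

# example: acquireTimeoutMillis = 90000 (90 seconds) in rds_connection_timing_configuration
acquire_timeout_millis = 90000

visibility_timeout_seconds = (acquire_timeout_millis / 1000) + 60
print(visibility_timeout_seconds)  # 150.0 seconds before the message becomes visible again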

    Caveats

• Non-bulk Cumulus API granule operations are not constrained by this logic and do not utilize the SQS update queue. They are instead invoked synchronously and follow expected RESTful logic, without any asynchronous write constraints or default message values.
    • This information is correct as of release v16 of Cumulus Core. Please review the CHANGELOG and migration instructions for updated features/changes/bugfixes.

    Granule Write Constraints

    For each granule to be written, the following constraints apply:

    • granuleId must be unique.

  A granule write will not be allowed if the granuleId already exists in the database for another collection; granules in this state will have their writes rejected and will wind up in the Dead Letter Archive.

    • Message granule must match the API Granule schema.

  If not, the write will be rejected, the granule status will be updated to failed, and the message will wind up in the Dead Letter Archive.

    • If the granule is being updated to a running/queued status:

      • Only status, timestamp, updated_at and created_at are updated. All other values are retained as they currently exist in the database.
      • The write will only be allowed if the following are true, else the write request will be ignored as out-of-order/stale:
        • The granule createdAt value is newer or the same as the existing record.
        • If the granule is being updated to running, the execution the granule is being associated with doesn’t already exist in the following states: completed, failed.
        • If the granule is being updated to queued, the execution the granule is being associated with does not exist in any state in the database.
    • If the granule is being updated to a failed/completed state:

      • All fields provided will override existing values in the database, if any.
      • The write will only be allowed if the following are true, else the write request will be ignored as out-of-order/stale:
        • The granule createdAt value is newer or the same as the existing record.

    Message Granule Write Behavior

    The granule object values are set based on the incoming Cumulus Message values (unless otherwise specified the message values overwrite the granule payload values):

• collection: Derived from meta.collection.name and meta.collection.version
• createdAt: Defaults to cumulus_meta.workflow_start_time, else payload.granule.createdAt
• duration: Calculated based on the delta between cumulus_meta.workflow_start_time and when the database message writes
• error: Object taken directly from the message.error object
• execution: Derived from cumulus_meta.state_machine and cumulus_meta.execution_name
• files: Taken directly from payload.granule.files. If files is null, set it to an empty list []
• pdrName: Taken directly from payload.pdr.name
• processingEndDateTime: Derived from AWS API interrogation (sfn().describeExecution) based on the execution value
• processingStartDateTime: Derived from AWS API interrogation (sfn().describeExecution) based on the execution value
• productVolume: Sums the values of the passed-in payload.granules.files.size. Does not validate against S3
• provider: Inferred from the meta.provider value in the Cumulus message
• published: Taken directly from granule.published; if not specified or null is specified, defaults to false
• queryFields: Object taken directly from meta.granule.queryFields
• status: Uses meta.status if provided, else payload.granule.status
• timeStamp: Set to the date-time value for the sfEventSqsToDbRecords invocation
• timeToArchive: Taken from payload.granule.post_to_cmr_duration/1000, provided by a Core task or user task. Value will be set to zero if no value is set
• timeToPreprocess: payload.granule.sync_granule_duration, provided by a Core or user task. Will be set to 0 if the value is not set
• updatedAt: Set to the date-time value for the sfEventSqsToDbRecords invocation
• beginningDateTime: See: CMR Temporal Values section below
• endingDateTime: See: CMR Temporal Values section below
• productionDateTime: See: CMR Temporal Values section below
• lastUpdateDateTime: See: CMR Temporal Values section below

    CMR Temporal Values

The following fields are generated based on values in the associated granule CMR file, if available (a minimal extraction sketch follows this list):

    • beginningDateTime

      • If there is a beginning and end DateTime:

        • UMMG: TemporalExtent.RangeDateTime.BeginningDateTime
        • ISO: gmd:MD_DataIdentification.gmd:extent.gmd:EX_Extent.gmd:temporalElement.gmd:EX_TemporalExtent.gmd:extent.gml:TimePeriod:gml:beginPosition
      • If not:

        • UMMG: TemporalExtent.SingleDateTime
        • ISO: gmd:MD_DataIdentification.gmd:extent.gmd:EX_Extent.gmd:temporalElement.gmd:EX_TemporalExtent.gmd:extent.gml:TimeInstant.gml:timePosition
    • endingDateTime

      • If there is a beginning and end DateTime:

    • UMMG: TemporalExtent.RangeDateTime.EndingDateTime
    • ISO: gmd:MD_DataIdentification.gmd:extent.gmd:EX_Extent.gmd:temporalElement.gmd:EX_TemporalExtent.gmd:extent.gml:TimePeriod:gml:endPosition
      • If not:

        • UMMG: TemporalExtent.SingleDateTime
        • ISO: gmd:MD_DataIdentification.gmd:extent.gmd:EX_Extent.gmd:temporalElement.gmd:EX_TemporalExtent.gmd:extent.gml:TimeInstant.gml:timePosition
    • productionDateTime

      • UMMG: DataGranule.ProductionDateTime
      • ISO: gmd:identificationInfo:gmd:dataQualityInfo.gmd:DQ_DataQuality.gmd:lineage.gmd:LI_Lineage.gmd:processStep.gmi:LE_ProcessStep.gmd:dateTime.gco:DateTime
    • lastUpdateDateTime

      • UMMG:

  Given DataGranule.ProductionDateTime values where Type is in Update, Insert, Create, select the most recent value.

      • ISO: Given a node matching gmd:MD_DataIdentification.gmd:citation.gmd:CI_Citation.gmd:title.gco:CharacterString === UpdateTime, use gmd:identificationInfo:gmd:MD_DataIdentification.gmd:citation.gmd:CI_Citation.gmd:date.gmd:CI_Date.gmd:date.gco:DateTime
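As a rough sketch of the UMM-G fallback logic for beginningDateTime described above (dictionary access only; ISO/XML handling is not shown and the function name is hypothetical):

def ummg_beginning_date_time(umm):
    """ Prefer TemporalExtent.RangeDateTime.BeginningDateTime,
        fall back to TemporalExtent.SingleDateTime if no range is present. """
    temporal = umm.get('TemporalExtent', {})
    range_date_time = temporal.get('RangeDateTime')
    if range_date_time and range_date_time.get('BeginningDateTime'):
        return range_date_time['BeginningDateTime']
    return temporal.get('SingleDateTime')


# e.g. ummg_beginning_date_time({'TemporalExtent': {'SingleDateTime': '2003-02-19T00:00:00Z'}})
# -> '2003-02-19T00:00:00Z'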
    - + \ No newline at end of file diff --git a/docs/next/workflows/protocol/index.html b/docs/next/workflows/protocol/index.html index f1685c3d8b9..78920198e91 100644 --- a/docs/next/workflows/protocol/index.html +++ b/docs/next/workflows/protocol/index.html @@ -5,13 +5,13 @@ Workflow Protocol | Cumulus Documentation - +
    Version: Next

    Workflow Protocol

    Configuration and Message Use Diagram

    A diagram showing at which point in a workflow the Cumulus message is checked for conformity with the message schema and where the configuration is checked for conformity with the configuration schema

    • Configuration - The Cumulus workflow configuration defines everything needed to describe an instance of Cumulus.
    • Scheduler - This starts ingest of a collection on configured intervals.
    • Input to Step Functions - The Scheduler uses the Configuration as source data to construct the input to the Workflow.
    • AWS Step Functions - Run the workflows as kicked off by the scheduler or other processes.
    • Input to Task - The input for each task is a JSON document that conforms to the message schema.
    • Output from Task - The output of each task must conform to the message schemas as well and is used as the input for the subsequent task.
    - + \ No newline at end of file diff --git a/docs/next/workflows/workflow-configuration-how-to/index.html b/docs/next/workflows/workflow-configuration-how-to/index.html index b1d5cfb96fd..c4c919f9e5b 100644 --- a/docs/next/workflows/workflow-configuration-how-to/index.html +++ b/docs/next/workflows/workflow-configuration-how-to/index.html @@ -5,7 +5,7 @@ Workflow Configuration How To's | Cumulus Documentation - + @@ -24,7 +24,7 @@ To take a subset of any given metadata, use the option substring.

    "url_path": "{cmrMetadata.Granule.Collection.ShortName}/{substring(file.fileName, 0, 3)}"

    This example will populate to "MOD09GQ/MOD"

    In addition to substring, several datetime-specific functions are available, which can parse a datetime string in the metadata and extract a certain part of it:

    "url_path": "{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}"

    or

     "url_path": "{dateFormat(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime, YYYY-MM-DD[T]HH[:]mm[:]ss)}"

    The following functions are implemented:

    • extractYear - returns the year, formatted as YYYY
    • extractMonth - returns the month, formatted as MM
    • extractDate - returns the day of the month, formatted as DD
    • extractHour - returns the hour in 24-hour format, with no leading zero
    • dateFormat - takes a second argument describing how to format the date, and passes the metadata date string and the format argument to moment().format()
    note

    The 'move-granules' step needs to be in the workflow for this template to be populated and the file moved. This cmrMetadata or CMR granule XML needs to have been generated and stored on S3. From there any field could be retrieved and used for a url_path.

    Adding Metadata dates and times to the URL Path

    There are a number of options to pull dates from the CMR file metadata. With this metadata:

    <Granule>
    <Temporal>
    <RangeDateTime>
    <BeginningDateTime>2003-02-19T00:00:00Z</BeginningDateTime>
    <EndingDateTime>2003-02-19T23:59:59Z</EndingDateTime>
    </RangeDateTime>
    </Temporal>
    </Granule>

    The following examples of url_path could be used.

    {extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the year from the full date: 2003.

    {extractMonth(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the month: 2.

    {extractDate(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the day: 19.

    {extractHour(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the hour: 0.

Different values can be combined to create the url_path. For example:

{
  "bucket": "sample-protected-bucket",
  "name": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
  "url_path": "{cmrMetadata.Granule.Collection.ShortName}/{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}/{extractDate(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}"
}

    The final file location for the above would be s3://sample-protected-bucket/MOD09GQ/2003/19/MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.

    - + \ No newline at end of file diff --git a/docs/next/workflows/workflow-triggers/index.html b/docs/next/workflows/workflow-triggers/index.html index b43db0ebf56..3954704f847 100644 --- a/docs/next/workflows/workflow-triggers/index.html +++ b/docs/next/workflows/workflow-triggers/index.html @@ -5,13 +5,13 @@ Workflow Triggers | Cumulus Documentation - +
    Version: Next

    Workflow Triggers

    For a workflow to run, it needs to be associated with a rule (see rule configuration). The rule configuration determines how and when a workflow execution is triggered. Rules can be triggered one time, on a schedule, or by new data written to a kinesis stream.

    There are three lambda functions in the API package responsible for scheduling and starting workflows: SF scheduler, message consumer, and SF starter. Each Cumulus instance comes with a Start SF SQS queue.

The SF scheduler lambda puts a message onto the Start SF queue. This message is picked up by the Start SF lambda, and an execution is started with the body of the message as the input.

    When a one time rule is created, the schedule SF lambda is triggered. Rules that are not one time are associated with a CloudWatch event which will manage the trigger of the lambdas that trigger the workflows.

    For a scheduled rule, the Cloudwatch event is triggered on the given schedule which calls directly to the schedule SF lambda.

    For a kinesis rule, when data is added to the kinesis stream, the Cloudwatch event is triggered, which calls the message consumer lambda. The message consumer lambda parses the kinesis message and finds all of the rules associated with that message. For each rule (which corresponds to one workflow), the schedule SF lambda is triggered to queue a message to start the workflow.

    For an sns rule, when a message is published to the SNS topic, the message consumer receives the SNS message (JSON expected), parses it into an object, starts a new execution of the workflow associated with the rule and passes the object in the payload field of the Cumulus message.

    Diagram showing how workflows are scheduled via rules

    - + \ No newline at end of file diff --git a/docs/operator-docs/about-operator-docs/index.html b/docs/operator-docs/about-operator-docs/index.html index 71069e697a7..57dcab2bad1 100644 --- a/docs/operator-docs/about-operator-docs/index.html +++ b/docs/operator-docs/about-operator-docs/index.html @@ -5,13 +5,13 @@ About Operator Docs | Cumulus Documentation - +
    Version: v15.0.2

    About Operator Docs

    Purpose

    Operator Docs are an augmentation to Cumulus documentation and Data Cookbooks. These documents will walk step-by-step through common Cumulus activities (that aren't necessarily as use-case directed as what you'd see in Data Cookbooks).

    What Is A Cumulus Operator

    Cumulus operators are those who work within Cumulus to ingest/archive data and manage collections. They may perform the following functions via the operator dashboard or API:

    • Configure providers and collections
    • Configure rules and monitor workflow executions
    • Monitor granule ingestion
    • Monitor system metrics
    Version: v15.0.2

    Bulk Operations

    Cumulus implements bulk operations through the use of AsyncOperations, which are long-running processes executed on an AWS ECS cluster.

    Submitting a bulk API request

    Bulk operations are generally submitted via the endpoint for the relevant data type, e.g. granules. For a list of supported API requests, refer to the Cumulus API documentation. Bulk operations are denoted with the keyword 'bulk'.

    Starting bulk operations from the Cumulus dashboard

    Using a Kibana query

    Note: You must have configured your dashboard build with a KIBANAROOT environment variable in order for the Kibana link to render in the bulk granules modal

    1. From the Granules dashboard page, click on the "Run Bulk Granules" button, then select what type of action you would like to perform

      • Note: the rest of the process is the same regardless of what type of bulk action you perform
    2. From the bulk granules modal, click the "Open Kibana" link:

      Screenshot of Cumulus dashboard showing modal window for triggering bulk granule operations

    3. Once you have accessed Kibana, navigate to the "Discover" page. If this is your first time using Kibana, you may see a message like this at the top of the page:

      In order to visualize and explore data in Kibana, you'll need to create an index pattern to retrieve data from Elasticsearch.

      In that case, see the docs for creating an index pattern for Kibana

  Screenshot of Kibana user interface showing the "Discover" page for running queries

    4. Enter a query that returns the granule records that you want to use for bulk operations:

      Screenshot of Kibana user interface showing an example Kibana query and results

    5. Once the Kibana query is returning the results you want, click the "Inspect" link near the top of the page. A slide out tab with request details will appear on the right side of the page:

      Screenshot of Kibana user interface showing details of an example request

    6. In the slide out tab that appears on the right side of the page, click the "Request" link near the top and scroll down until you see the query property:

      Screenshot of Kibana user interface showing the Elasticsearch data request made for a given Kibana query

    7. Highlight and copy the query contents from Kibana. Go back to the Cumulus dashboard and paste the query contents from Kibana inside of the query property in the bulk granules request payload. It is expected that you should have a property of query nested inside of the existing query property:

      Screenshot of Cumulus dashboard showing modal window for triggering bulk granule operations with query information populated

8. Add values for the index and workflowName to the bulk granules request payload (see the example payload after this list). The value for index will vary based on your Elasticsearch setup, but it is good to target an index specifically for granule data if possible:

      Screenshot of Cumulus dashboard showing modal window for triggering bulk granule operations with query, index, and workflow information populated

    9. Click the "Run Bulk Operations" button. You should see a confirmation message, including an ID for the async operation that was started to handle your bulk action. You can track the status of this async operation on the Operations dashboard page, which can be visited by clicking the "Go To Operations" button:

      Screenshot of Cumulus dashboard showing confirmation message with async operation ID for bulk granules request
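For reference, after completing steps 7 and 8 the bulk granules request payload would look roughly like the sketch below. The index name, workflow name, and the Elasticsearch query itself are illustrative placeholders; your copied Kibana query is what gets nested inside the query property:

{
  "index": "example-granule-index",
  "workflowName": "ExampleBulkWorkflow",
  "query": {
    "query": {
      "match": {
        "collectionId": "MOD09GQ___006"
      }
    }
  }
}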

    Creating an index pattern for Kibana

    1. Define the index pattern for the indices that your Kibana queries should use. A wildcard character, *, will match across multiple indices. Once you are satisfied with your index pattern, click the "Next step" button:

      Screenshot of Kibana user interface for defining an index pattern

    2. Choose whether to use a Time Filter for your data, which is not required. Then click the "Create index pattern" button:

      Screenshot of Kibana user interface for configuring the settings of an index pattern

    Status Tracking

    All bulk operations return an AsyncOperationId which can be submitted to the /asyncOperations endpoint.

    The /asyncOperations endpoint allows listing of AsyncOperation records as well as record retrieval for individual records, which will contain the status. The Cumulus API documentation shows sample requests for these actions.
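For example, an individual record could be retrieved with a request along these lines (the host and the operation ID are placeholders for your deployment's values):

$ curl --request GET https://example.com/asyncOperations/0eb8e809-8790-4409-9239-bcd9e8d28b8e \
  --header 'Authorization: Bearer ReplaceWithTheToken'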

    The Cumulus Dashboard also includes an Operations monitoring page, where operations and their status are visible:

    Screenshot of Cumulus Dashboard Operations Page showing 5 operations and their status, ID, description, type and creation timestamp

    CMR Operations

    …UpdateCmrAccessConstraints will update CMR metadata file contents on S3, and PostToCmr will push the updates to CMR. The rest of this section will assume you have created this workflow under the name UpdateCmrAccessConstraints.

    Once created and deployed, the workflow is available in the Cumulus dashboard's Execute workflow selector. However, note that additional configuration is required for this request, to supply an access constraint integer value and optional description to the UpdateCmrAccessConstraints workflow, by clicking the Add Custom Workflow Meta option in the Execute popup, as shown below:

Screenshot showing granule execute popup with 'updateCmrAccessConstraints' selected and configuration values shown in a collapsible JSON field

    An example invocation of the API to perform this action is:

$ curl --request PUT https://example.com/granules/MOD11A1.A2017137.h19v16.006.2017138085750 \
  --header 'Authorization: Bearer ReplaceWithTheToken' \
  --header 'Content-Type: application/json' \
  --data '{
    "action": "applyWorkflow",
    "workflow": "updateCmrAccessConstraints",
    "meta": {
      "accessConstraints": {
        "value": 5,
        "description": "sample access constraint"
      }
    }
  }'

    Supported CMR metadata formats for the above operation are Echo10XML and UMMG-JSON, which will populate the RestrictionFlag and RestrictionComment fields in Echo10XML, or the AccessConstraints values in UMMG-JSON.

    Additional Operations

    At this time Cumulus does not, out of the box, support additional operations on CMR metadata. However, given the examples shown above, we recommend working with your integrators to develop additional workflows that perform any required operations.

    Bulk CMR operations

    In order to perform the above operations in bulk, Cumulus supports the use of ApplyWorkflow in an AsyncOperation. These are accessed via the Bulk Operation button on the dashboard, or the /granules/bulk endpoint on the Cumulus API.

More information on bulk operations is in the bulk operations operator doc.
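As a sketch, a request to the /granules/bulk endpoint to apply the UpdateCmrAccessConstraints workflow to an explicit list of granules could look like the following; the host and granule ID are placeholders, and you should consult the Cumulus API documentation for the full set of supported payload fields:

$ curl --request POST https://example.com/granules/bulk \
  --header 'Authorization: Bearer ReplaceWithTheToken' \
  --header 'Content-Type: application/json' \
  --data '{
    "workflowName": "UpdateCmrAccessConstraints",
    "ids": ["MOD11A1.A2017137.h19v16.006.2017138085750"]
  }'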

    Version: v15.0.2

    Create Rule In Cumulus

    Once the above files are in place and the entries created in CMR and Cumulus, we are ready to begin ingesting data. Depending on the type of ingestion (FTP/Kinesis, etc) the values below will change, but for the most part they are all similar. Rules tell Cumulus how to associate providers and collections, and when/how to start processing a workflow.

    Steps

    1. Go To Rules Page
    • Go to the Cumulus dashboard, click on Rules in the navigation.
    • Click Add Rule.

    Screenshot of Rules page

2. Complete Form
    • Fill out the template form.

    Screenshot of a Rules template for adding a new rule

    For more details regarding the field definitions and required information go to Data Cookbooks.

    Note: If the state field is left blank, it defaults to false.

    Examples

    • A rule form with completed required fields:

    Screenshot of a completed rule form

    • A successfully added Rule:

    Screenshot of created rule

    Discovery Filtering

    …directly list the provider_path. If the path contains regular expression components, this may fail.

    It is recommended that operators diagnose any failures by checking error logs and ensuring that permissions on the remote file system allow reading of the default directory and any subdirectories that match the filter.
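As an illustrative sketch only (the rule name, workflow, provider, collection, and path here are assumptions, not defaults), a provider_path containing regular expression components might be configured on a rule like this, so that discovery only descends into matching subdirectories:

{
  "name": "example_discovery_rule",
  "workflow": "DiscoverGranules",
  "provider": "example-provider",
  "collection": { "name": "EXAMPLE_COLLECTION", "version": "001" },
  "rule": { "type": "onetime" },
  "state": "ENABLED",
  "meta": {
    "provider_path": "data/(GRANULE|BROWSE)/.*"
  }
}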

    Supported protocols

    Currently support for this feature is limited to the following protocols:

    • ftp
    • sftp
    Version: v15.0.2

    Granule Workflows

    Failed Granule

    Delete and Ingest

    1. Delete Granule

    Note: Granules published to CMR will need to be removed from CMR via the dashboard prior to deletion

2. Ingest Granule via Ingest Rule
• Re-triggering a one-time, Kinesis, SQS, or SNS rule, or running a scheduled rule, will re-discover and re-ingest the deleted granule.

    Reingest

    1. Select Failed Granule
    • In the Cumulus dashboard, go to the Collections page.
    • Use search field to find the granule.
2. Re-ingest Granule
    • Go to the Collections page.
    • Click on Reingest and a modal will pop up for your confirmation.

    Screenshot of the Reingest modal workflow

    Delete and Ingest

    1. Bulk Delete Granules
    • Go to the Granules page.
    • Use the Bulk Delete button to bulk delete selected granules or select via a Kibana query

    Note: You can optionally force deletion from CMR

2. Ingest Granules via Ingest Rule
• Re-triggering one-time, Kinesis, SQS, or SNS rules, or running scheduled rules, will re-discover and re-ingest the deleted granules.

    Multiple Failed Granules

    1. Select Failed Granules
    • In the Cumulus dashboard, go to the Collections page.
    • Click on Failed Granules.
    • Select multiple granules.

    Screenshot of selected multiple granules

2. Bulk Re-ingest Granules
    • Click on Reingest and a modal will pop up for your confirmation.

    Screenshot of Bulk Reingest modal workflow

    Version: v15.0.2

    Setup Kinesis Stream & CNM Message

Note: Keep in mind that you should only have to set this up once per ingest stream. Kinesis pricing is based on the shard value (number of shards), not on the amount of Kinesis usage.

    1. Create a Kinesis Stream

      • In your AWS console, go to the Kinesis service and click Create Data Stream.
      • Assign a name to the stream.
      • Apply a shard value of 1.
      • Click on Create Kinesis Stream.
  • A status page with stream details will display. Once the status is active, the stream is ready to use. Remember to record the streamName and StreamARN for later use.

      Screenshot of AWS console page for creating a Kinesis stream

    2. Create a Rule

    3. Send a message

  • Send a message that matches your schema, using Python or the command line (a sketch is shown after this list).
  • The streamName and Collection must match the kinesisArn+collection defined in the rule that you created in Step 2.
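For example, from the command line you could publish a message to the stream with the AWS CLI as sketched below. The message body here is purely illustrative and must be replaced with a message that conforms to your CNM schema and to the collection configured in your rule; the stream name and partition key are placeholders.

# Publish an illustrative CNM-style message to the stream created in Step 1.
# With AWS CLI v2, --cli-binary-format raw-in-base64-out lets you pass raw JSON to --data.
aws kinesis put-record \
  --stream-name your-ingest-stream-name \
  --partition-key example-partition-key \
  --cli-binary-format raw-in-base64-out \
  --data '{"collection": "EXAMPLE_COLLECTION", "identifier": "example-granule-id", "provider": "example-provider"}'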
    Version: v15.0.2

    Locating S3 Access Logs

    When enabling S3 Access Logs for EMS Reporting you configured a TargetBucket and TargetPrefix. Inside the TargetBucket at the TargetPrefix is where you will find the raw S3 access logs.

    In a standard deployment, this will be your stack's <internal bucket name> and a key prefix of <stack>/ems-distribution/s3-server-access-logs/
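For example, you could list the raw logs with the AWS CLI; the bucket, stack, and prefix below are placeholders for your deployment's configured values:

aws s3 ls s3://<internal-bucket-name>/<stack>/ems-distribution/s3-server-access-logs/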

    Naming Executions

    …QueuePdrs step.

    In the following excerpt, the QueueGranules config.executionNamePrefix property is set using the value configured in the workflow's meta.executionNamePrefix.

    Please note: This meta.executionNamePrefix property should not be confused with the optional rule executionNamePrefix property from the previous section. Setting executionNamePrefix as a root property of the rule will set a prefix for the names of any workflows triggered by the rule. Setting meta.executionNamePrefix on the rule will set meta.executionNamePrefix in the workflow messages generated for this rule, allowing workflow steps like QueueGranules to read from the message meta.executionNamePrefix for their config. Then, workflows scheduled by QueueGranules would use the configured execution name prefix.

    Setting executionNamePrefix config for QueueGranules using rule.meta

    If you wanted to use a prefix of "my-prefix", you would create a rule with a meta property similar to the following Rule snippet:

{
  ...other rule keys here...
  "meta": {
    "executionNamePrefix": "my-prefix"
  }
}

    The value of meta.executionNamePrefix from the rule will be set as meta.executionNamePrefix in the workflow message.

    Then, the workflow could contain a "QueueGranules" step with the following state, which uses meta.executionNamePrefix from the message as the value for the executionNamePrefix config to the "QueueGranules" step:

{
  "QueueGranules": {
    "Parameters": {
      "cma": {
        "event.$": "$",
        "ReplaceConfig": {
          "FullMessage": true
        },
        "task_config": {
          "queueUrl": "${start_sf_queue_url}",
          "provider": "{$.meta.provider}",
          "internalBucket": "{$.meta.buckets.internal.name}",
          "stackName": "{$.meta.stack}",
          "granuleIngestWorkflow": "${ingest_granule_workflow_name}",
          "executionNamePrefix": "{$.meta.executionNamePrefix}"
        }
      }
    },
    "Type": "Task",
    "Resource": "${queue_granules_task_arn}",
    "Retry": [
      {
        "ErrorEquals": [
          "Lambda.ServiceException",
          "Lambda.AWSLambdaException",
          "Lambda.SdkClientException"
        ],
        "IntervalSeconds": 2,
        "MaxAttempts": 6,
        "BackoffRate": 2
      }
    ],
    "Catch": [
      {
        "ErrorEquals": [
          "States.ALL"
        ],
        "ResultPath": "$.exception",
        "Next": "WorkflowFailed"
      }
    ],
    "End": true
  },
}
    Version: v15.0.2

    Trigger a Workflow Execution

    To trigger a workflow, you need to create a rule. To trigger an ingest workflow, one that requires discovering and ingesting data, you will also need to configure the collection and provider and associate those to a rule.

    Trigger a HelloWorld Workflow

    To trigger a HelloWorld workflow that does not need to discover or archive data, you just need to create a rule.

    You can leave the provider and collection blank and do not need any additional metadata. If you create a onetime rule, the workflow execution will start momentarily and you can view its status on the Executions page.

    Trigger an Ingest Workflow

    To ingest data, you will need a provider and collection configured to tell your workflow where to discover data and where to archive the data respectively.

    Follow the instructions to create a provider and create a collection and configure their fields for your data ingest.

In the rule's additional metadata you can specify a provider_path that tells the workflow where on the provider to discover the data.

    Example: Ingest data from S3

    Setup

    Assume there are 2 files to be ingested in an S3 bucket called discovery-bucket, located in the test-data folder:

    • GRANULE.A2017025.jpg
    • GRANULE.A2017025.hdf

    Archive buckets should already be created and mapped to public / private / protected in the Cumulus deployment.

    For example:

buckets = {
  private = {
    name = "discovery-bucket"
    type = "private"
  },
  protected = {
    name = "archive-protected"
    type = "protected"
  }
  public = {
    name = "archive-public"
    type = "public"
  }
}

    Create a provider

    Create a new provider. Set protocol to S3 and Host to discovery-bucket.
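If you prefer to create the provider via the Cumulus API rather than the dashboard form, the equivalent provider record would look roughly like this sketch (the id value is an assumed name, not a required one):

{
  "id": "s3_provider",
  "protocol": "s3",
  "host": "discovery-bucket"
}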

    Screenshot of adding a sample S3 provider

    Create a collection

    Create a new collection. Configure the collection to extract the granule id from the filenames and configure where to store the granule files.

The configuration below will store hdf files in the protected bucket and jpg files in the public bucket. The bucket types map to the archive buckets configured in the Cumulus deployment, as shown above:

{
  "name": "test-collection",
  "version": "001",
  "granuleId": "^GRANULE\\.A[\\d]{7}$",
  "granuleIdExtraction": "(GRANULE\\..*)(\\.hdf|\\.jpg)",
  "reportToEms": false,
  "sampleFileName": "GRANULE.A2017025.hdf",
  "files": [
    {
      "bucket": "protected",
      "regex": "^GRANULE\\.A[\\d]{7}\\.hdf$",
      "sampleFileName": "GRANULE.A2017025.hdf"
    },
    {
      "bucket": "public",
      "regex": "^GRANULE\\.A[\\d]{7}\\.jpg$",
      "sampleFileName": "GRANULE.A2017025.jpg"
    }
  ]
}

    Create a rule

    Create a rule to trigger the workflow to discover your granule data and ingest your granule.

    Select the previously created provider and collection. See the Cumulus Discover Granules workflow for a workflow example of using Cumulus tasks to discover and queue data for ingest.

    In the rule meta, set the provider_path to test-data, so the test-data folder will be used to discover new granules.
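Putting the pieces together, a onetime rule for this example could look roughly like the following sketch. The rule name and workflow name are assumptions (use the workflow name as it appears in your deployment), and the provider id should match whatever you named your provider:

{
  "name": "test_collection_discover_rule",
  "workflow": "DiscoverGranules",
  "provider": "s3_provider",
  "collection": {
    "name": "test-collection",
    "version": "001"
  },
  "rule": {
    "type": "onetime"
  },
  "state": "ENABLED",
  "meta": {
    "provider_path": "test-data"
  }
}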

    Screenshot of adding a Discover Granules rule

    A onetime rule will run your workflow on-demand and you can view it on the dashboard Executions page. The Cumulus Discover Granules workflow will trigger an ingest workflow and your ingested granules will be visible on the dashboard Granules page.

    Version: v15.0.2

    Cumulus Tasks

    A list of reusable Cumulus tasks. Add your own.

    Tasks

    @cumulus/add-missing-file-checksums

    Add checksums to files in S3 which don't have one


    @cumulus/discover-granules

    Discover Granules in FTP/HTTP/HTTPS/SFTP/S3 endpoints


    @cumulus/discover-pdrs

    Discover PDRs in FTP and HTTP endpoints


    @cumulus/files-to-granules

    Converts array-of-files input into a granules object by extracting granuleId from filename


    @cumulus/hello-world

    Example task


    @cumulus/hyrax-metadata-updates

    Update granule metadata with hooks to OPeNDAP URL


    @cumulus/lzards-backup

    Run LZARDS backup


    @cumulus/move-granules

    Move granule files from staging to final location


    @cumulus/parse-pdr

    Download and Parse a given PDR


    @cumulus/pdr-status-check

    Checks execution status of granules in a PDR


    @cumulus/post-to-cmr

    Post a given granule to CMR


    @cumulus/queue-granules

    Add discovered granules to the queue


    @cumulus/queue-pdrs

    Add discovered PDRs to a queue


    @cumulus/queue-workflow

    Add workflow to the queue


    @cumulus/sf-sqs-report

    Sends an incoming Cumulus message to SQS


    @cumulus/sync-granule

    Download a given granule


    @cumulus/test-processing

    Fake processing task used for integration tests


    @cumulus/update-cmr-access-constraints

    Updates CMR metadata to set access constraints


@cumulus/update-granules-cmr-metadata-file-links

Update CMR metadata files with correct online access urls and etags and transfer etag info to granules' CMR files

    Version: v15.0.2

    Cumulus Team

    Cumulus Core Team

    Cumulus Emeritus Team

    Version: v15.0.2

    How to Troubleshoot and Fix Issues

    While Cumulus is a complex system, there is a focus on maintaining the integrity and availability of the system and data. Should you encounter errors or issues while using this system, this section will help troubleshoot and solve those issues.

    Backup and Restore

    Cumulus has backup and restore functionality built-in to protect Cumulus data and allow recovery of a Cumulus stack. This is currently limited to Cumulus data and not full S3 archive data. Backup and restore is not enabled by default and must be enabled and configured to take advantage of this feature.

    For more information, read the Backup and Restore documentation.

    Elasticsearch reindexing

    If you run into issues with your Elasticsearch index, a reindex operation is available via the Cumulus API. See the Reindexing Guide.

    Information on how to reindex Elasticsearch is in the Cumulus API documentation.

    Troubleshooting Workflows

Workflows are state machines composed of tasks and services, and each component logs to CloudWatch. The CloudWatch logs for all steps in the execution are displayed in the Cumulus dashboard, or you can find them by going to CloudWatch and navigating to the logs for that particular task.

    Workflow Errors

    Visual representations of executed workflows can be found in the Cumulus dashboard or the AWS Step Functions console for that particular execution.

    If a workflow errors, the error will be handled according to the error handling configuration. The task that fails will have the exception field populated in the output, giving information about the error. Further information can be found in the CloudWatch logs for the task.

    Graph of AWS Step Function execution showing a failing workflow

    Workflow Did Not Start

    Generally, first check your rule configuration. If that is satisfactory, the answer will likely be in the CloudWatch logs for the schedule SF or SF starter lambda functions. See the workflow triggers page for more information on how workflows start.

    For Kinesis and SNS rules specifically, if an error occurs during the message consumer process, the fallback consumer lambda will be called and if the message continues to error, a message will be placed on the dead letter queue. Check the dead letter queue for a failure message. Errors can be traced back to the CloudWatch logs for the message consumer and the fallback consumer. Additionally, check that the name and version match those configured in your rule, as rules are filtered by the notification's collection name and version before scheduling executions.

    More information on kinesis error handling is here.

    Operator API Errors

    All operator API calls are funneled through the ApiEndpoints lambda. Each API call is logged to the ApiEndpoints CloudWatch log for your deployment.

    Lambda Errors

    KMS Exception: AccessDeniedException

    KMS Exception: AccessDeniedExceptionKMS Message: The ciphertext refers to a customer master key that does not exist, does not exist in this region, or you are not allowed to access.

The above error was thrown by a Cumulus Lambda function invocation. The KMS key is the encryption key used to encrypt Lambda environment variables. The root cause of this error is unknown, but it is speculated to be caused by deleting and recreating, with the same name, the IAM role the Lambda uses.

    This error can be resolved by switching the lambda's execution role to a different one and then back through the Lambda management console. Unfortunately, this approach doesn't scale well.

    The other resolution (that scales but takes some time) that was found is as follows:

    1. Comment out all lambda definitions (and dependent resources) in your Terraform configuration.
    2. terraform apply to delete the lambdas.
    3. Un-comment the definitions.
    4. terraform apply to recreate the lambdas.

If this problem occurs with Core lambdas and you are using the terraform-aws-cumulus.zip file source distributed in our release, we recommend using the non-scaling approach, as the number of lambdas we distribute is in the low teens and they are likely to be easier and faster to reconfigure one-by-one than by editing our configs.

    Error: Unable to import module 'index': Error

    This error is shown in the CloudWatch logs for a Lambda function.

    One possible cause is that the Lambda definition in the .tf file defining the lambda is not pointing to the correct packaged lambda source file. In order to resolve this issue, update the lambda definition to point directly to the packaged (e.g. .zip) lambda source file.

    resource "aws_lambda_function" "discover_granules_task" {
    function_name = "${var.prefix}-DiscoverGranules"
    filename = "${path.module}/../../tasks/discover-granules/dist/lambda.zip"
    handler = "index.handler"
    }

    If you are seeing this error when using the Lambda as a step in a Cumulus workflow, then inspect the output for this Lambda step in the AWS Step Function console. If you see the error Cannot find module 'node_modules/@cumulus/cumulus-message-adapter-js', then you need to ensure the lambda's packaged dependencies include cumulus-message-adapter-js.

    Reindexing Elasticsearch Guide

    …current index, or the mappings for an index have been updated (they do not update automatically). Any reindexing that will be required when upgrading Cumulus will be in the Migration Steps section of the changelog.

    Switch to a new index and Reindex

    There are two operations needed: reindex and change-index to switch over to the new index. A Change Index/Reindex can be done in either order, but both have their trade-offs.

If you decide to point Cumulus to a new (empty) index first (with a change index operation), and then reindex the data to the new index, data ingested while reindexing will automatically be sent to the new index. As reindexing operations can take a while, not all the data will show up on the Cumulus Dashboard right away. The advantage is you do not have to turn off any ingest operations. This way is recommended.

    If you decide to Reindex data to a new index first, and then point Cumulus to that new index, it is not guaranteed that data that is sent to the old index while reindexing will show up in the new index. If you prefer this way, it is recommended to turn off any ingest operations. This order will keep your dashboard data from seeing any interruption.

    Change Index

    This will point Cumulus to the index in Elasticsearch that will be used when retrieving data. Performing a change index operation to an index that does not exist yet will create the index for you. The change index operation can be found here.

    Reindex from the old index to the new index

    The reindex operation will take the data from one index and copy it into another index. The reindex operation can be found here

    Reindex status

    Reindexing is a long-running operation. The reindex-status endpoint can be used to monitor the progress of the operation.
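For example, assuming the endpoint is exposed at /elasticsearch/reindex-status as in the Cumulus API documentation (the host below is a placeholder):

$ curl --request GET https://example.com/elasticsearch/reindex-status \
  --header 'Authorization: Bearer ReplaceWithTheToken'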

    Index from database

    If you want to just grab the data straight from the database you can perform an Index from Database Operation. After the data is indexed from the database, a Change Index operation will need to be performed to ensure Cumulus is pointing to the right index. It is strongly recommended to turn off workflow rules when performing this operation so any data ingested to the database is not lost.

    Validate reindex

    To validate the reindex, use the reindex-status endpoint. The doc count can be used to verify that the reindex was successful. In the below example the reindex from cumulus-2020-11-3 to cumulus-2021-3-4 was not fully successful as they show different doc counts.

    "indices": {
    "cumulus-2020-11-3": {
    "primaries": {
    "docs": {
    "count": 21096512,
    "deleted": 176895
    }
    },
    "total": {
    "docs": {
    "count": 21096512,
    "deleted": 176895
    }
    }
    },
    "cumulus-2021-3-4": {
    "primaries": {
    "docs": {
    "count": 715949,
    "deleted": 140191
    }
    },
    "total": {
    "docs": {
    "count": 715949,
    "deleted": 140191
    }
    }
    }
    }

    To further drill down into what is missing, log in to the Kibana instance (found in the Elasticsearch section of the AWS console) and run the following command replacing <index> with your index name.

GET <index>/_search
{
  "aggs": {
    "count_by_type": {
      "terms": {
        "field": "_type"
      }
    }
  },
  "size": 0
}

    which will produce a result like

    "aggregations": {
    "count_by_type": {
    "doc_count_error_upper_bound": 0,
    "sum_other_doc_count": 0,
    "buckets": [
    {
    "key": "logs",
    "doc_count": 483955
    },
    {
    "key": "execution",
    "doc_count": 4966
    },
    {
    "key": "deletedgranule",
    "doc_count": 4715
    },
    {
    "key": "pdr",
    "doc_count": 1822
    },
    {
    "key": "granule",
    "doc_count": 740
    },
    {
    "key": "asyncOperation",
    "doc_count": 616
    },
    {
    "key": "provider",
    "doc_count": 108
    },
    {
    "key": "collection",
    "doc_count": 87
    },
    {
    "key": "reconciliationReport",
    "doc_count": 48
    },
    {
    "key": "rule",
    "doc_count": 7
    }
    ]
    }
    }

    Resuming a reindex

    If a reindex operation did not fully complete it can be resumed using the following command run from the Kibana instance.

POST _reindex?wait_for_completion=false
{
  "conflicts": "proceed",
  "source": {
    "index": "cumulus-2020-11-3"
  },
  "dest": {
    "index": "cumulus-2021-3-4",
    "op_type": "create"
  }
}

    The Cumulus API reindex-status endpoint can be used to monitor completion of this operation.

    Version: v15.0.2

    Re-running workflow executions

    To re-run a Cumulus workflow execution from the AWS console:

    1. Visit the page for an individual workflow execution

    2. Click the "New execution" button at the top right of the screen

  Screenshot of the AWS console for a Step Function execution highlighting the "New execution" button at the top right of the screen

    3. In the "New execution" modal that appears, replace the cumulus_meta.execution_name value in the default input with the value of the new execution ID as seen in the screenshot below

      Screenshot of the AWS console showing the modal window for entering input when running a new Step Function execution

    4. Click the "Start execution" button

    Troubleshooting Deployment

    …data-persistence modules, but your config is only creating one Elasticsearch instance. To fix the issue, update the elasticsearch_config variable for your data-persistence module to increase the number of instances:

{
  domain_name    = "es"
  instance_count = 2
  instance_type  = "t2.small.elasticsearch"
  version        = "5.3"
  volume_size    = 10
}

    Install dashboard

    Dashboard configuration

    Issues:

• Problem clearing the cache: EACCES: permission denied, rmdir '/tmp/gulp-cache/default'. This probably means the files at that location, and/or the folder, are owned by someone else (or some other factor prevents you from writing there).

    It's possible to workaround this by editing the file cumulus-dashboard/node_modules/gulp-cache/index.js and alter the value of the line var fileCache = new Cache({cacheDirName: 'gulp-cache'}); to something like var fileCache = new Cache({cacheDirName: '<prefix>-cache'});. Now gulp-cache will be able to write to /tmp/<prefix>-cache/default, and the error should resolve.

    Dashboard deployment

    Issues:

    • If the dashboard sends you to an Earthdata Login page that has an error reading "Invalid request, please verify the client status or redirect_uri before resubmitting", this means you've either forgotten to update one or more of your EARTHDATA_CLIENT_ID, EARTHDATA_CLIENT_PASSWORD environment variables (from your app/.env file) and re-deploy Cumulus, or you haven't placed the correct values in them, or you've forgotten to add both the "redirect" and "token" URL to the Earthdata Application.
    • There is odd caching behavior associated with the dashboard and Earthdata Login at this point in time that can cause the above error to reappear on the Earthdata Login page loaded by the dashboard even after fixing the cause of the error. If you experience this, attempt to access the dashboard in a new browser window, and it should work.
    Version: v15.0.2

    Migrate from TEA deployment to Cumulus Distribution

    Background

    The Cumulus Distribution API is configured to use the AWS Cognito OAuth client. This API can be used instead of the Thin Egress App, which is the default distribution API if using the Deployment Template.

    Configuring a Cumulus Distribution deployment

    See these instructions for deploying the Cumulus Distribution API.

    Important note if migrating from TEA to Cumulus Distribution

    If you already have a deployment using the TEA distribution and want to switch to Cumulus Distribution, there will be an API Gateway change. This means that there will be downtime while you update your CloudFront endpoint to use the new API gateway.

    Version: v15.0.2

    Migrate TEA deployment to standalone module

    Background

    This document is only relevant for upgrades of Cumulus from versions < 3.x.x to versions > 3.x.x

Previous versions of Cumulus included deployment of the Thin Egress App (TEA) by default in the distribution module. As a result, Cumulus users who wanted to deploy a new version of TEA had to wait for a new release of Cumulus that incorporated that release.

    In order to give Cumulus users the flexibility to deploy newer versions of TEA whenever they want, deployment of TEA has been removed from the distribution module and Cumulus users must now add the TEA module to their deployment. Guidance on integrating the TEA module to your deployment is provided, or you can refer to Cumulus core example deployment code for the thin_egress_app module.

    By default, when upgrading Cumulus and moving from TEA deployed via the distribution module to deployed as a separate module, your API gateway for TEA would be destroyed and re-created, which could cause outages for any Cloudfront endpoints pointing at that API gateway.

    These instructions outline how to modify your state to preserve your existing Thin Egress App (TEA) API gateway when upgrading Cumulus and moving deployment of TEA to a standalone module. If you do not care about preserving your API gateway for TEA when upgrading your Cumulus deployment, you can skip these instructions.

    Prerequisites

    Notes about state management

    These instructions will involve manipulating your Terraform state via terraform state mv commands. These operations are extremely dangerous, since a mistake in editing your Terraform state can leave your stack in a corrupted state where deployment may be impossible or may result in unanticipated resource deletion.

    Since bucket versioning preserves a separate version of your state file each time it is written, and the Terraform state modification commands overwrite the state file, we can mitigate the risk of these operations by downloading the most recent state file before starting the upgrade process. Then, if anything goes wrong during the upgrade, we can restore that previous state version. Guidance on how to perform both operations is provided below.

    Download your most recent state version

    Run this command to download the most recent cumulus deployment state file, replacing BUCKET and KEY with the correct values from cumulus-tf/terraform.tf:

     aws s3 cp s3://BUCKET/KEY /path/to/terraform.tfstate

    Restore a previous state version

    Upload the state file that was previously downloaded to the bucket/key for your state file, replacing BUCKET and KEY with the correct values from cumulus-tf/terraform.tf:

     aws s3 cp /path/to/terraform.tfstate s3://BUCKET/KEY

    Then run terraform plan, which will give an error because we manually overwrote the state file and it is now out of sync with the lock table Terraform uses to track your state file:

    Error: Error loading state: state data in S3 does not have the expected content.

    This may be caused by unusually long delays in S3 processing a previous state
    update. Please wait for a minute or two and try again. If this problem
    persists, and neither S3 nor DynamoDB are experiencing an outage, you may need
    to manually verify the remote state and update the Digest value stored in the
    DynamoDB table to the following value: <some-digest-value>

    To resolve this error, run this command and replace DYNAMO_LOCK_TABLE, BUCKET and KEY with the correct values from cumulus-tf/terraform.tf, and use the digest value from the previous error output:

 aws dynamodb put-item \
   --table-name DYNAMO_LOCK_TABLE \
   --item '{
     "LockID": {"S": "BUCKET/KEY-md5"},
     "Digest": {"S": "some-digest-value"}
   }'

    Now, if you re-run terraform plan, it should work as expected.

    Migration instructions

    Please note: These instructions assume that you are deploying the thin_egress_app module as shown in the Cumulus core example deployment code

    1. Ensure that you have downloaded the latest version of your state file for your cumulus deployment

    2. Find the URL for your <prefix>-thin-egress-app-EgressGateway API gateway. Confirm that you can access it in the browser and that it is functional.

    3. Run terraform plan. You should see output like (edited for readability):

      # module.thin_egress_app.aws_cloudformation_stack.thin_egress_app will be created
      + resource "aws_cloudformation_stack" "thin_egress_app" {

      # module.thin_egress_app.aws_s3_bucket.lambda_source will be created
      + resource "aws_s3_bucket" "lambda_source" {

      # module.thin_egress_app.aws_s3_bucket_object.cloudformation_template will be created
      + resource "aws_s3_bucket_object" "cloudformation_template" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive will be created
      + resource "aws_s3_bucket_object" "lambda_code_dependency_archive" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_source will be created
      + resource "aws_s3_bucket_object" "lambda_source" {

      # module.thin_egress_app.aws_security_group.egress_lambda[0] will be created
      + resource "aws_security_group" "egress_lambda" {

      ...

      # module.cumulus.module.distribution.module.thin_egress_app.aws_cloudformation_stack.thin_egress_app will be destroyed
      - resource "aws_cloudformation_stack" "thin_egress_app" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket.lambda_source will be destroyed
      - resource "aws_s3_bucket" "lambda_source" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.cloudformation_template will be destroyed
      - resource "aws_s3_bucket_object" "cloudformation_template" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive will be destroyed
      - resource "aws_s3_bucket_object" "lambda_code_dependency_archive" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_source will be destroyed
      - resource "aws_s3_bucket_object" "lambda_source" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_security_group.egress_lambda[0] will be destroyed
      - resource "aws_security_group" "egress_lambda" {
    4. Run the state modification commands. The commands must be run in exactly this order:

       # Move security group
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_security_group.egress_lambda module.thin_egress_app.aws_security_group.egress_lambda

      # Move TEA storage bucket
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket.lambda_source module.thin_egress_app.aws_s3_bucket.lambda_source

      # Move TEA lambda source code
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_source module.thin_egress_app.aws_s3_bucket_object.lambda_source

      # Move TEA lambda dependency code
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive

      # Move TEA Cloudformation template
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.cloudformation_template module.thin_egress_app.aws_s3_bucket_object.cloudformation_template

      # Move URS creds secret version
      terraform state mv module.cumulus.module.distribution.aws_secretsmanager_secret_version.thin_egress_urs_creds aws_secretsmanager_secret_version.thin_egress_urs_creds

      # Move URS creds secret
      terraform state mv module.cumulus.module.distribution.aws_secretsmanager_secret.thin_egress_urs_creds aws_secretsmanager_secret.thin_egress_urs_creds

      # Move TEA Cloudformation stack
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_cloudformation_stack.thin_egress_app module.thin_egress_app.aws_cloudformation_stack.thin_egress_app

      Depending on how you were supplying a bucket map to TEA, there may be an additional step. If you were specifying the bucket_map_key variable to the cumulus module to use a custom bucket map, then you can ignore this step and just ensure that the bucket_map_file variable to the TEA module uses that same S3 key. Otherwise, if you were letting Cumulus generate a bucket map for you, then you need to take this step to migrate that bucket map:

      # Move bucket map
      terraform state mv module.cumulus.module.distribution.aws_s3_bucket_object.bucket_map_yaml[0] aws_s3_bucket_object.bucket_map_yaml
    5. Run terraform plan again. You may still see a few additions/modifications pending like below, but you should not see any deletion of Thin Egress App resources pending:

      # module.thin_egress_app.aws_cloudformation_stack.thin_egress_app will be updated in-place
      ~ resource "aws_cloudformation_stack" "thin_egress_app" {

      # module.thin_egress_app.aws_s3_bucket_object.cloudformation_template will be updated in-place
      ~ resource "aws_s3_bucket_object" "cloudformation_template" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive will be updated in-place
      ~ resource "aws_s3_bucket_object" "lambda_code_dependency_archive" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_source will be updated in-place
      ~ resource "aws_s3_bucket_object" "lambda_source" {

      If you still see deletion of module.cumulus.module.distribution.module.thin_egress_app.aws_cloudformation_stack.thin_egress_app pending, then something went wrong and you should restore the previously downloaded state file version and start over from step 1. Otherwise, proceed to step 6.

    6. Once you have confirmed that everything looks as expected, run terraform apply.

    7. Visit the same API gateway from step 1 and confirm that it still works.

    Your TEA deployment has now been migrated to a standalone module, which gives you the ability to upgrade the deployed version of TEA independently of Cumulus releases.

    Version: v15.0.2

    Upgrade to CMA 2.0.2

    Updating a Cumulus Deployment to CMA 2.0.2

    Background

    The Cumulus Message Adapter has been updated in release 2.0.2 to no longer utilize the AWS step function API to look up the defined name of a step function task for population in meta.workflow_tasks, but instead use an incrementing integer field.

Additionally, a bugfix was released in the form of v2.0.1/v2.0.2 following the initial 2.0.0 release, so all users should update to release 2.0.2.

The update is not tied to a particular version of Core; however, the update should be done across all task components in order to ensure consistent execution records.

    Changes

    Execution Record Update

This update functionally means that Cumulus tasks/activities using the CMA will now record an entry that looks like the following in meta.workflow_tasks, and more importantly in the tasks column for an execution record:

    Original

          "DiscoverGranules": {
    "name": "jk-tf-DiscoverGranules",
    "version": "$LATEST",
    "arn": "arn:aws:lambda:us-east-1:xxxxx:function:jk-tf-DiscoverGranules"
    },
    "QueueGranules": {
    "name": "jk-tf-QueueGranules",
    "version": "$LATEST",
    "arn": "arn:aws:lambda:us-east-1:xxxx:function:jk-tf-QueueGranules"
    }

    New

          "0": {
    "name": "jk-tf-DiscoverGranules",
    "version": "$LATEST",
    "arn": "arn:aws:lambda:us-east-1:xxxxx:function:jk-tf-DiscoverGranules"
    },
    "1": {
    "name": "jk-tf-QueueGranules",
    "version": "$LATEST",
    "arn": "arn:aws:lambda:us-east-1:xxxx:function:jk-tf-QueueGranules"
    }

    Actions Required

    The following should be done as part of a Cumulus stack update to utilize cumulus message adapter > 2.0.2:

    • Python tasks that utilize cumulus-message-adapter-python should be updated to use > 2.0.0, their lambdas rebuilt and Cumulus workflows reconfigured to use the updated version.

    • Python activities that utilize cumulus-process-py should be rebuilt using > 1.0.0 with updated dependencies, and have their images deployed/Cumulus configured to use the new version.

    • The cumulus-message-adapter v2.0.2 lambda layer should be made available in the deployment account, and the Cumulus deployment should be reconfigured to use it (via the cumulus_message_adapter_lambda_layer_version_arn variable in the cumulus module). This should address all Core node.js tasks that utilize the CMA, and many contributed node.js/JAVA components.

    Once the above have been done, redeploy Cumulus to apply the configuration and the updates should be live.

    Version: v15.0.2

    Updates to task granule file schemas

    Background

    Most Cumulus workflow tasks expect as input a payload of granule(s) which contain the files for each granule. Most tasks also return this same granule structure as output.

    However, up to this point, there was inconsistency in the schemas for the granule files objects expected by each task. Furthermore, there was no guarantee of consistency between granule files objects as stored in the database and the expectations of any given workflow task.

    Thus, when performing bulk granule operations which pass granules from the database into a Cumulus workflow, it was possible for there to be schema validation failures depending on which task was used to start the workflow and its particular schema.

    In order to rectify this situation, CUMULUS-2388 was filed and addressed to create a common granule files schema between nearly all of the Cumulus tasks (exceptions discussed below) and the Cumulus database. The following documentation explains the manual changes you need to make to your deployment in order to be compatible with the updated files schema.

    Updated files schema

    The updated granule files schema can be found here.

    These former properties were deprecated (with notes about how to derive the same information from the updated schema, if possible):

    • filename - concatenate the bucket and key values with a directory separator (/)
    • name - use fileName property
    • etag - ETags are no longer provided as an individual file property. Instead, a separate etags object mapping S3 URIs to ETag values is provided as output from the following workflow tasks (guidance on how to integrate this output with your workflows is provided in the Upgrading your workflows section below):
      • update-granules-cmr-metadata-file-links
      • hyrax-metadata-updates
    • fileStagingDir - no longer supported
    • url_path - no longer supported
    • duplicate_found - This property is no longer supported, however sync-granule and move-granules now produce a separate granuleDuplicates object as part of their output. The granuleDuplicates object is a map of granules by granule ID which includes the files that encountered duplicates during processing. Guidance on how to integrate granuleDuplicates information into your workflow configuration is provided below.

    Exceptions

    These workflow tasks did not have their schema for granule files updated:

    • discover-granules - no updates
    • queue-granules - no updates
    • parse-pdr - no updates
    • sync-granule - input schema not updated, output schema was updated

    The reason that these task schemas were not updated is that all of these tasks start before the files have been ingested to S3, thus much of the information that is required in the updated files schema like bucket, key, or checksum is not yet known.

    Bulk granule operations

    Since the input schema for the above tasks was not updated, that means you cannot run bulk granule operations against workflows if they start with any of those tasks. Bulk granule operations work by loading the specified granules from the database and sending them as input to a specified workflow, so if the specified workflow begins with a task whose input schema does not conform to what is coming out of the database, there will be schema errors.

    Upgrading your deployment

    Upgrading your workflows

    For any workflows using the update-granules-cmr-metadata-file-links task before the hyrax-metadata-updates and/or post-to-cmr tasks, update the step definition for update-granules-cmr-metadata-file-links as follows:

        "UpdateGranulesCmrMetadataFileLinksStep": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "buckets": "{$.meta.buckets}",
    "distribution_endpoint": "{$.meta.distribution_endpoint}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.etags}",
    "destination": "{$.meta.file_etags}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    ...more configuration...

    hyrax-metadata-updates

    For any workflows using the hyrax-metadata-updates task before a post-to-cmr task, update the definition of the hyrax-metadata-updates step as follows:

        "HyraxMetadataUpdatesTask": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "bucket": "{$.meta.buckets.internal.name}",
    "stack": "{$.meta.stack}",
    "cmr": "{$.meta.cmr}",
    "launchpad": "{$.meta.launchpad}",
    "etags": "{$.meta.file_etags}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.etags}",
    "destination": "{$.meta.file_etags}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    ...more configuration...

    post-to-cmr

For any workflows using the post-to-cmr task after the update-granules-cmr-metadata-file-links or hyrax-metadata-updates tasks, update the post-to-cmr step definition as follows:

        "CmrStep": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "bucket": "{$.meta.buckets.internal.name}",
    "stack": "{$.meta.stack}",
    "cmr": "{$.meta.cmr}",
    "launchpad": "{$.meta.launchpad}",
    "etags": "{$.meta.file_etags}"
    }
    }
    },
    ...more configuration...

    Example workflow

    For an example workflow integrating all of these changes, please see our example ingest and publish workflow.

    Optional - Integrate granuleDuplicates information

    Please note that the granuleDuplicates output is purely informational and does not have any bearing on the separate configuration for how duplicates should be handled.

    You can include granuleDuplicates output from the sync-granule or move-granules tasks in your workflow messages like so:

        "SyncGranule": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    ...other config...
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.granuleDuplicates}",
    "destination": "{$.meta.sync_granule.granule_duplicates}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    }
    ...more configuration...

The result of this configuration is that the granuleDuplicates output from sync-granule would be placed in meta.sync_granule.granule_duplicates on the workflow message and remain there throughout the rest of the workflow. The same configuration can be replicated for the move-granules task, but be sure to use a different destination in the workflow message for its granuleDuplicates output.
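
For illustration only, after the configuration above a workflow message could carry something like the following under meta (the granule ID and file fields shown are placeholders; each entry lists the files that encountered duplicates during processing):

"meta": {
  "sync_granule": {
    "granule_duplicates": {
      "MOD09GQ.A2016358.h13v04.006.2016360104606": {
        "files": [
          {
            "bucket": "cumulus-test-sandbox-protected",
            "key": "MOD09GQ___006/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf"
          }
        ]
      }
    }
  }
}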

    Updating collection URL path templates

    Collections can specify url_path templates to dynamically generate the final location of files. As part of url_path templates, file object properties can be interpolated to generate the file path. Thus, these url_path templates need to be updated to ensure that they are compatible with the updated files schema and the properties that will actually be available on file objects.

    See the notes on the updated files schema to know which properties are available and which previously existing properties were deprecated.

    As an example, you will want to update any url_path properties in your collections to remove references to file.name and replace them with references to file.fileName like so:

    - "url_path": "{cmrMetadata.CollectionReference.ShortName}___{cmrMetadata.CollectionReference.Version}/{substring(file.name, 0, 3)}",
    + "url_path": "{cmrMetadata.CollectionReference.ShortName}___{cmrMetadata.CollectionReference.Version}/{substring(file.fileName, 0, 3)}",
Upgrade to RDS release

| Key | Type | Description | Default |
| --- | --- | --- | --- |
| cutoffSeconds | number | Number of seconds prior to this execution to 'cutoff' reconciliation queries. This allows in-progress/other in-flight operations time to complete and propagate to Elasticsearch/Dynamo/postgres. | 3600 |
| dbConcurrency | number | Sets max number of parallel collections reports the script will run at a time. | 20 |
| dbMaxPool | number | Sets the maximum number of connections the database pool has available. Modifying this may result in unexpected failures. | 20 |

    Version: v15.0.2

    Upgrade to TF version 0.13.6

    Background

Cumulus pins its support to a specific version of Terraform (see the deployment documentation). The reason for only supporting one specific Terraform version at a time is to avoid deployment errors that can be caused by deploying to the same target with different Terraform versions.

    Cumulus is upgrading its supported version of Terraform from 0.12.12 to 0.13.6. This document contains instructions on how to perform the upgrade for your deployments.

    Prerequisites

    • Follow the Terraform guidance for what to do before upgrading, notably ensuring that you have no pending changes to your Cumulus deployments before proceeding.
      • You should do a terraform plan to see if you have any pending changes for your deployment (for both the data-persistence-tf and cumulus-tf modules), and if so, run a terraform apply before doing the upgrade to Terraform 0.13.6
    • Review the Terraform v0.13 release notes to prepare for any breaking changes that may affect your custom deployment code. Cumulus' deployment code has already been updated for compatibility with version 0.13.
• Install Terraform version 0.13.6. We recommend using the Terraform Version Manager tfenv to manage your installed versions of Terraform, but this is not required.

    Upgrade your deployment code

    Terraform 0.13 does not support some of the syntax from previous Terraform versions, so you need to upgrade your deployment code for compatibility.

    Terraform provides a 0.13upgrade command as part of version 0.13 to handle automatically upgrading your code. Make sure to check out the documentation on batch usage of 0.13upgrade, which will allow you to upgrade all of your Terraform code with one command.

Run the 0.13upgrade command until no further updates to your deployment code are needed.
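
For example, a minimal sketch of that upgrade pass, assuming your deployment code lives in the data-persistence-tf and cumulus-tf directories and that you manage Terraform versions with tfenv:

# run the 0.13 upgrade tool in each deployment module directory
tfenv use 0.13.6
for module_dir in data-persistence-tf cumulus-tf; do
  (cd "$module_dir" && terraform 0.13upgrade)
done

Afterwards, run terraform plan in each directory to confirm that the upgraded code does not introduce unexpected changes.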

    Upgrade your deployment

    1. Ensure that you are running Terraform 0.13.6 by running terraform --version. If you are using tfenv, you can switch versions by running tfenv use 0.13.6.

    2. For the data-persistence-tf and cumulus-tf directories, take the following steps:

      1. Run terraform init --reconfigure. The --reconfigure flag is required, otherwise you might see an error like:

        Error: Failed to decode current backend config

        The backend configuration created by the most recent run of "terraform init"
        could not be decoded: unsupported attribute "lock_table". The configuration
        may have been initialized by an earlier version that used an incompatible
        configuration structure. Run "terraform init -reconfigure" to force
        re-initialization of the backend.
      2. Run terraform apply to perform a deployment.

        WARNING: Even if Terraform says that no resource changes are pending, running the apply using Terraform version 0.13.6 will modify your backend state from version 0.12.12 to version 0.13.6 without requiring approval. Updating the backend state is a necessary part of the version 0.13.6 upgrade, but it is not completely transparent.

    Version: v10.0.0

    Contributing a Task

    We're tracking reusable Cumulus tasks in this list and, if you've got one you'd like to share with others, you can add it!

    Right now we're focused on tasks distributed via npm, but are open to including others. For now the script that pulls all the data for each package only supports npm.

    The tasks.md file is generated in the build process

    The tasks list in docs/tasks.md is generated from the list of task package names from the tasks folder.

    Do not edit the docs/tasks.md file directly.

    Version: v10.0.0

    Architecture

    Architecture

    Below, find a diagram with the components that comprise an instance of Cumulus.

    Architecture diagram of a Cumulus deployment

    This diagram details all of the major architectural components of a Cumulus deployment.

While the diagram can feel complex, it can be digested by breaking it down into several major components:

    Data Distribution

End users can access data via Cumulus's distribution submodule, which includes ASF's Thin Egress Application; this provides authenticated data egress, temporary S3 links, and other statistics features.

    End user exposure of Cumulus's holdings is expected to be provided by an external service.

    For NASA use, this is assumed to be CMR in this diagram.

    Data ingest

    Workflows

The core of the ingest and processing capabilities in Cumulus is built into the deployed AWS Step Function workflows. Cumulus rules trigger workflows via either CloudWatch rules, Kinesis streams, SNS topics, or SQS queues. The workflows then run with a configured Cumulus message, utilizing built-in processes to report the status of granules, PDRs, executions, etc. to the Data Persistence components.

    Workflows can optionally report granule metadata to CMR, and workflow steps can report metrics information to a shared SNS topic, which could be subscribed to for near real time granule, execution, and PDR status. This could be used for metrics reporting using an external ELK stack, for example.

    Data persistence

Cumulus entity state data is stored in a set of DynamoDB database tables and is exported to an Elasticsearch instance, which provides non-authoritative querying of state data for the API and other applications that require more complex queries.

    Data discovery

    Discovering data for ingest is handled via workflow step components using Cumulus provider and collection configurations and various triggers. Data can be ingested from AWS S3, FTP, HTTPS and more.

    Database

    Cumulus utilizes a user-provided PostgreSQL database backend. For improved API search query efficiency Cumulus provides data replication to an Elasticsearch instance. For legacy reasons, Cumulus is currently also deploying a DynamoDB datastore, and writes are replicated in parallel with the PostgreSQL database writes. The DynamoDB replicated tables and parallel writes will be removed in future releases.

    PostgreSQL Database Schema Diagram

    ERD of the Cumulus Database

    Maintenance

    System maintenance personnel have access to manage ingest and various portions of Cumulus via an AWS API gateway, as well as the operator dashboard.

    Deployment Structure

    Cumulus is deployed via Terraform and is organized internally into two separate top-level modules, as well as several external modules.

    Cumulus

    The Cumulus module, which contains multiple internal submodules, deploys all of the Cumulus components that are not part of the Data Persistence portion of this diagram.

    Data persistence

    The data persistence module provides the Data Persistence portion of the diagram.

    Other modules

Other modules are provided as artifacts on the release page for use by users configuring their own deployments and contain extracted subcomponents of the cumulus module. For more on these components see the components documentation.

For more on the specific structure, examples of use, and how to deploy, please see the deployment docs as well as the cumulus-template-deploy repo.

    Version: v10.0.0

    Cloudwatch Retention

    Our lambdas dump logs to AWS CloudWatch. By default, these logs exist indefinitely. However, there are ways to specify a duration for log retention.

    aws-cli

    In addition to getting your aws-cli set-up, there are two values you'll need to acquire.

1. log-group-name: the name of the log group whose retention policy (retention time) you'd like to change. We'll use /aws/lambda/KinesisInboundLogger in our examples.
    2. retention-in-days: the number of days you'd like to retain the logs in the specified log group for. There is a list of possible values available in the aws logs documentation.

    For example, if we wanted to set log retention to 30 days on our KinesisInboundLogger lambda, we would write:

    aws logs put-retention-policy --log-group-name "/aws/lambda/KinesisInboundLogger" --retention-in-days 30

    Note: The aws-cli log command that we're using is explained in detail here.
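
You can confirm the policy took effect by describing the log group and checking its retentionInDays value, for example:

aws logs describe-log-groups --log-group-name-prefix "/aws/lambda/KinesisInboundLogger"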

    AWS Management Console

    Changing the log retention policy in the AWS Management Console is a fairly simple process:

    1. Navigate to the CloudWatch service in the AWS Management Console.
    2. Click on the Logs entry on the sidebar.
3. Find the Log Group whose retention policy you're interested in changing.
    4. Click on the value in the Expire Events After column.
    5. Enter/Select the number of days you'd like to retain logs in that log group for.

    Screenshot of AWS console showing how to configure the retention period for Cloudwatch logs

    Version: v10.0.0

    Collection Cost Tracking and Storage Best Practices

    Organizing your data is important for metrics you may want to collect. AWS S3 storage and cost metrics are calculated at the bucket level, so it is easy to get metrics by bucket. You can get storage metrics at the key prefix level, but that is done through the CLI, which can be very slow for large buckets. It is very difficult to estimate costs at the prefix level.

    Calculating Storage By Collection

    By bucket

    Usage by bucket can be obtained in your AWS Billing Dashboard via an S3 Usage Report. You can download your usage report for a period of time and review your storage and requests at the bucket level.

    Bucket metrics can also be found in the AWS CloudWatch Metrics Console (also see Using Amazon CloudWatch Metrics).

    Navigate to Storage Metrics and select the BucketName for all buckets you are interested in. The available metrics are BucketSizeInBytes and NumberOfObjects.

    In the Graphed metrics tab, you can select the type of statistic (i.e. average, minimum, maximum) and the period for the stats. At the top, it's useful to select from the dropdown to view the metrics as a number. You can also select the time period for which you want to see stats.

    Alternatively you can query CloudWatch using the CLI.

    This command will return the average number of bytes in the bucket test-bucket for 7/31/2019:

    aws cloudwatch get-metric-statistics --namespace AWS/S3 --start-time 2019-07-31T00:00:00 --end-time 2019-08-01T00:00:00 --period 86400 --statistics Average --region us-east-1 --metric-name BucketSizeBytes --dimensions Name=BucketName,Value=test-bucket Name=StorageType,Value=StandardStorage

    The result looks like:

    {
    "Datapoints": [
    {
    "Timestamp": "2019-07-31T00:00:00Z",
    "Average": 150996467959.0,
    "Unit": "Bytes"
    }
    ],
    "Label": "BucketSizeBytes"
    }

    By key prefix

    AWS does not offer storage and usage statistics at a key prefix level. Via the AWS CLI, you can get the total storage for a bucket or folder. The following command would get the storage for folder example-folder in bucket sample-bucket:

    aws s3 ls --summarize --human-readable --recursive s3://sample-bucket/example-folder | grep 'Total'

    Note that this can be a long-running operation for large buckets.

    Calculating Cost By Collection

    NASA NGAP Environment

    If using an NGAP account, the cost per bucket can be found in your CloudTamer console, in the Financials section of your account information. This is calculated on a monthly basis.

    There is no easy way to get the cost by folder in the buckets. You could calculate an estimate using the storage per prefix vs. the storage of the bucket.

    Outside of NGAP

You can enable S3 Cost Allocation Tags and tag your buckets. From there, you can view the cost breakdown in your AWS Billing Dashboard via the Cost Explorer. Cost Allocation Tagging is available at the bucket level.
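
For example, a bucket can be tagged from the CLI (the tag key and value below are placeholders you would choose for your own cost reporting; note that put-bucket-tagging replaces any existing tag set on the bucket):

aws s3api put-bucket-tagging --bucket sample-bucket --tagging 'TagSet=[{Key=Collection,Value=MOD09GQ-006}]'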

    There is no easy way to get the cost by folder in the buckets. You could calculate an estimate using the storage per prefix vs. the storage of the bucket.

    Storage Configuration

    Cumulus allows for the configuration of many buckets for your files. Buckets are created and added to your deployment as part of the deployment process.

    In your Cumulus collection configuration, you specify where you want the files to be stored post-processing. This is done by matching a regular expression on the file with the configured bucket.

    Note that in the collection configuration, the bucket field is the key to the buckets variable in the deployment's .tfvars file.

    Organizing By Bucket

    You can specify separate groups of buckets for each collection, which could look like the example below.

    {
    "name": "MOD09GQ",
    "version": "006",
    "granuleId": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
    "files": [
    {
    "bucket": "MOD09GQ-006-protected",
    "regex": "^.*\\.hdf$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf"
    },
    {
    "bucket": "MOD09GQ-006-private",
    "regex": "^.*\\.hdf\\.met$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.met"
    },
    {
    "bucket": "MOD09GQ-006-protected",
    "regex": "^.*\\.cmr\\.xml$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.cmr.xml"
    },
    {
    "bucket": "MOD09GQ-006-public",
    "regex": "^*\\.jpg$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_ndvi.jpg"
    }
    ]
    }

    Additional collections would go to different buckets.

    Organizing by Key Prefix

    Different collections can be organized into different folders in the same bucket, using the key prefix, which is specified as the url_path in the collection configuration. In this simplified collection configuration example, the url_path field is set at the top level so that all files go to a path prefixed with the collection name and version.

    {
    "name": "MOD09GQ",
    "version": "006",
    "granuleId": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
    "files": [
    {
    "bucket": "protected",
    "regex": "^.*\\.hdf$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf"
    },
    {
    "bucket": "private",
    "regex": "^.*\\.hdf\\.met$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.met"
    },
    {
    "bucket": "protected",
    "regex": "^.*\\.cmr\\.xml$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.cmr.xml"
    },
    {
    "bucket": "public",
    "regex": "^*\\.jpg$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_ndvi.jpg"
    }
    ]
    }

    In this case, the path to all the files would be: MOD09GQ___006/<filename> in their respective buckets.
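
For example, the sample .hdf file above would be stored at a key similar to s3://<protected-bucket>/MOD09GQ___006/MOD09GQ.A2017025.h21v00.006.2017034065104.hdf, where the bucket name is a placeholder for whatever the protected key resolves to in your deployment's .tfvars file.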

The url_path can be overridden directly on the file configuration. The example below produces the same result.

    {
    "name": "MOD09GQ",
    "version": "006",
    "granuleId": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
    "files": [
    {
    "bucket": "protected",
    "regex": "^.*\\.hdf$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
    },
    {
    "bucket": "private",
    "regex": "^.*\\.hdf\\.met$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.met",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
    },
    {
    "bucket": "protected-2",
    "regex": "^.*\\.cmr\\.xml$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.cmr.xml",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
    },
    {
    "bucket": "public",
    "regex": "^*\\.jpg$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_ndvi.jpg",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
    }
    ]
    }
    Version: v10.0.0

    Cumulus Data Management Types

    What Are The Cumulus Data Management Types

    • Collections: Collections are logical sets of data objects of the same data type and version. They provide contextual information used by Cumulus ingest.
    • Granules: Granules are the smallest aggregation of data that can be independently managed. They are always associated with a collection, which is a grouping of granules.
    • Providers: Providers generate and distribute input data that Cumulus obtains and sends to workflows.
    • Rules: Rules tell Cumulus how to associate providers and collections and when/how to start processing a workflow.
    • Workflows: Workflows are composed of one or more AWS Lambda Functions and ECS Activities to discover, ingest, process, manage, and archive data.
    • Executions: Executions are records of a workflow.
    • Reconciliation Reports: Reports are a comparison of data sets to check to see if they are in agreement and to help Cumulus users detect conflicts.

    Interaction

• Providers tell Cumulus where to get new data - e.g. S3, HTTPS
    • Collections tell Cumulus where to store the data files
    • Rules tell Cumulus when to trigger a workflow execution and tie providers and collections together

    Managing Data Management Types

    The following are created via the dashboard or API:

    • Providers
    • Collections
    • Rules
    • Reconciliation reports

    Granules are created by workflow executions and then can be managed via the dashboard or API.

    An execution record is created for each workflow execution triggered and can be viewed in the dashboard or data can be retrieved via the API.

    Workflows are created and managed via the Cumulus deployment.

    Configuration Fields

    Schemas

Looking at our API schema definitions can provide us with some insight into collections, providers, rules, and their attributes (and whether those are required or not). The schemas for these different concepts will be referenced throughout this document.

    The schemas are extremely useful for understanding which attributes are configurable and which of those are required. Cumulus uses these schemas for validation.

    Providers

    Please note:

• While connection configuration is defined here, settings that are specific to a particular ingest setup (e.g. 'What target directory should we be pulling from?' or 'How is duplicate handling configured?') are generally defined in a Rule or Collection, not the Provider.
• There is some provider behavior which is controlled by task-specific configuration and not the provider definition. This configuration has to be set on a per-workflow basis. For example, see the httpListTimeout configuration on the discover-granules task.

    Provider Configuration

    The Provider configuration is defined by a JSON object that takes different configuration keys depending on the provider type. The following are definitions of typical configuration values relevant for the various providers:

Configuration by provider type

S3

| Key | Type | Required | Description |
| --- | --- | --- | --- |
| id | string | Yes | Unique identifier for the provider |
| globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited |
| protocol | string | Yes | The protocol for this provider. Must be s3 for this provider type. |
| host | string | Yes | S3 Bucket to pull data from |

http

| Key | Type | Required | Description |
| --- | --- | --- | --- |
| id | string | Yes | Unique identifier for the provider |
| globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited |
| protocol | string | Yes | The protocol for this provider. Must be http for this provider type |
| host | string | Yes | The host to pull data from (e.g. nasa.gov) |
| username | string | No | Configured username for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication |
| password | string | Only if username is specified | Configured password for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication |
| port | integer | No | Port to connect to the provider on. Defaults to 80 |
| allowedRedirects | string[] | No | Only hosts in this list will have the provider username/password forwarded for authentication. Entries should be specified as host.com or host.com:7000 if redirect port is different than the provider port. |
| certificateUri | string | No | SSL Certificate S3 URI for custom or self-signed SSL (TLS) certificate |

https

| Key | Type | Required | Description |
| --- | --- | --- | --- |
| id | string | Yes | Unique identifier for the provider |
| globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited |
| protocol | string | Yes | The protocol for this provider. Must be https for this provider type |
| host | string | Yes | The host to pull data from (e.g. nasa.gov) |
| username | string | No | Configured username for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication |
| password | string | Only if username is specified | Configured password for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication |
| port | integer | No | Port to connect to the provider on. Defaults to 443 |
| allowedRedirects | string[] | No | Only hosts in this list will have the provider username/password forwarded for authentication. Entries should be specified as host.com or host.com:7000 if redirect port is different than the provider port. |
| certificateUri | string | No | SSL Certificate S3 URI for custom or self-signed SSL (TLS) certificate |

ftp

| Key | Type | Required | Description |
| --- | --- | --- | --- |
| id | string | Yes | Unique identifier for the provider |
| globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited |
| protocol | string | Yes | The protocol for this provider. Must be ftp for this provider type |
| host | string | Yes | The ftp host to pull data from (e.g. nasa.gov) |
| username | string | No | Username to use to connect to the ftp server. Cumulus encrypts this using KMS. Defaults to anonymous if not defined |
| password | string | No | Password to use to connect to the ftp server. Cumulus encrypts this using KMS. Defaults to password if not defined |
| port | integer | No | Port to connect to the provider on. Defaults to 21 |

sftp

| Key | Type | Required | Description |
| --- | --- | --- | --- |
| id | string | Yes | Unique identifier for the provider |
| globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited |
| protocol | string | Yes | The protocol for this provider. Must be sftp for this provider type |
| host | string | Yes | The sftp host to pull data from (e.g. nasa.gov) |
| username | string | No | Username to use to connect to the sftp server. |
| password | string | No | Password to use to connect to the sftp server. |
| port | integer | No | Port to connect to the provider on. Defaults to 22 |
| privateKey | string | No | filename assumed to be in s3://bucketInternal/stackName/crypto |
| cmKeyId | string | No | AWS KMS Customer Master Key arn or alias |
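
As an illustration of the fields above, a minimal S3 provider object might look like the following (the id and host values are placeholders):

{
  "id": "MY_S3_PROVIDER",
  "protocol": "s3",
  "host": "my-staging-bucket",
  "globalConnectionLimit": 10
}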

    Collections

Break down of [s3_MOD09GQ_006.json](https://github.com/nasa/cumulus/blob/master/example/data/collections/s3_MOD09GQ_006/s3_MOD09GQ_006.json)

| Key | Value | Required | Description |
| --- | --- | --- | --- |
| name | "MOD09GQ" | Yes | The name attribute designates the name of the collection. This is the name under which the collection will be displayed on the dashboard |
| version | "006" | Yes | A version tag for the collection |
| granuleId | "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$" | Yes | The regular expression used to validate the granule ID extracted from filenames according to the granuleIdExtraction |
| granuleIdExtraction | "(MOD09GQ\..*)(\.hdf\|\.cmr\|_ndvi\.jpg)" | Yes | The regular expression used to extract the granule ID from filenames. The first capturing group extracted from the filename by the regex will be used as the granule ID. |
| sampleFileName | "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf" | Yes | An example filename belonging to this collection |
| files | <JSON Object> of files defined here | Yes | Describe the individual files that will exist for each granule in this collection (size, browse, meta, etc.) |
| dataType | "MOD09GQ" | No | Can be specified, but this value will default to the collection_name if not |
| duplicateHandling | "replace" | No | ("replace"\|"version"\|"skip") determines granule duplicate handling scheme |
| ignoreFilesConfigForDiscovery | false (default) | No | By default, during discovery only files that match one of the regular expressions in this collection's files attribute (see above) are ingested. Setting this to true will ignore the files attribute during discovery, meaning that all files for a granule (i.e., all files with filenames matching granuleIdExtraction) will be ingested even when they don't match a regular expression in the files attribute at discovery time. (NOTE: this attribute does not appear in the example file, but is listed here for completeness.) |
| process | "modis" | No | Example options for this are found in the ChooseProcess step definition in the IngestAndPublish workflow definition |
| meta | <JSON Object> of MetaData for the collection | No | MetaData for the collection. This metadata will be available to workflows for this collection via the Cumulus Message Adapter. |
| url_path | "{cmrMetadata.Granule.Collection.ShortName}/{substring(file.fileName, 0, 3)}" | No | Filename without extension |

    files-object

| Key | Value | Required | Description |
| --- | --- | --- | --- |
| regex | "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\.hdf$" | Yes | Regular expression used to identify the file |
| sampleFileName | "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf" | Yes | Filename used to validate the provided regex |
| type | "data" | No | Value to be assigned to the Granule File Type. CNM types are used by Cumulus CMR steps, non-CNM values will be treated as 'data' type. Currently only utilized in DiscoverGranules task |
| bucket | "internal" | Yes | Name of the bucket where the file will be stored |
| url_path | "${collectionShortName}/{substring(file.fileName, 0, 3)}" | No | Folder used to save the granule in the bucket. Defaults to the collection url_path |
| checksumFor | "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\.hdf$" | No | If this is a checksum file, set checksumFor to the regex of the target file. |

    Rules

Rules are used to start processing workflows and the transformation process. Rules can be invoked manually, based on a schedule, or can be configured to be triggered by Kinesis events, SNS messages, or SQS messages.

    Rule configuration
| Key | Value | Required | Description |
| --- | --- | --- | --- |
| name | "L2_HR_PIXC_kinesisRule" | Yes | Name of the rule. This is the name under which the rule will be listed on the dashboard |
| workflow | "CNMExampleWorkflow" | Yes | Name of the workflow to be run. A list of available workflows can be found on the Workflows page |
| provider | "PODAAC_SWOT" | No | Configured provider's ID. This can be found on the Providers dashboard page |
| collection | <JSON Object> collection object shown below | Yes | Name and version of the collection this rule will moderate. Relates to a collection configured and found in the Collections page |
| payload | <JSON Object or Array> | No | The payload to be passed to the workflow |
| meta | <JSON Object> of MetaData for the rule | No | MetaData for the rule. This metadata will be available to workflows for this rule via the Cumulus Message Adapter. |
| rule | <JSON Object> rule type and associated values - discussed below | Yes | Object defining the type and subsequent attributes of the rule |
| state | "ENABLED" | No | ("ENABLED"\|"DISABLED") whether or not the rule will be active. Defaults to "ENABLED". |
| queueUrl | https://sqs.us-east-1.amazonaws.com/1234567890/queue-name | No | URL for SQS queue that will be used to schedule workflows for this rule |
| tags | ["kinesis", "podaac"] | No | An array of strings that can be used to simplify search |

    collection-object

| Key | Value | Required | Description |
| --- | --- | --- | --- |
| name | "L2_HR_PIXC" | Yes | Name of a collection defined/configured in the Collections dashboard page |
| version | "000" | Yes | Version number of a collection defined/configured in the Collections dashboard page |

    meta-object

| Key | Value | Required | Description |
| --- | --- | --- | --- |
| retries | 3 | No | Number of retries on errors, for sqs-type rule only. Defaults to 3. |
| visibilityTimeout | 900 | No | VisibilityTimeout in seconds for the inflight messages, for sqs-type rule only. Defaults to the visibility timeout of the SQS queue when the rule is created. |

    rule-object

| Key | Value | Required | Description |
| --- | --- | --- | --- |
| type | "kinesis" | Yes | ("onetime"\|"scheduled"\|"kinesis"\|"sns"\|"sqs") type of scheduling/workflow kick-off desired |
| value | <String> Object | Depends | Discussion of valid values is below |

    rule-value

The rule.value entry depends on the rule type (example values are sketched after this list):

    • If this is a onetime rule this can be left blank. Example
    • If this is a scheduled rule this field must hold a valid cron-type expression or rate expression.
    • If this is a kinesis rule, this must be a configured ${Kinesis_stream_ARN}. Example
    • If this is an sns rule, this must be an existing ${SNS_Topic_Arn}. Example
    • If this is an sqs rule, this must be an existing ${SQS_QueueUrl} that your account has permissions to access, and also you must configure a dead-letter queue for this SQS queue. Example
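
For illustration, the rule object for a couple of these types might look like the following sketches (the schedule and ARN shown are placeholders):

"rule": {
  "type": "scheduled",
  "value": "rate(1 hour)"
}

"rule": {
  "type": "kinesis",
  "value": "arn:aws:kinesis:us-east-1:111111111111:stream/my-kinesis-stream"
}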

    sqs-type rule features

    • When an SQS rule is triggered, the SQS message remains on the queue.
    • The SQS message is not processed multiple times in parallel when visibility timeout is properly set. You should set the visibility timeout to the maximum expected length of the workflow with padding. Longer is better to avoid parallel processing.
    • The SQS message visibility timeout can be overridden by the rule.
    • Upon successful workflow execution, the SQS message is removed from the queue.
• Upon failed execution(s), the workflow is re-run 3 times (or the configured number of retries).
    • Upon failed execution(s), the visibility timeout will be set to 5s to allow retries.
    • After configured number of failed retries, the SQS message is moved to the dead-letter queue configured for the SQS queue.

    Configuration Via Cumulus Dashboard

    Create A Provider

    • In the Cumulus dashboard, go to the Provider page.

    Screenshot of Create Provider form

    • Click on Add Provider.
    • Fill in the form and then submit it.

    Screenshot of Create Provider form

    Create A Collection

    • Go to the Collections page.

    Screenshot of the Collections page

    • Click on Add Collection.
    • Copy and paste or fill in the collection JSON object form.

    Screenshot of Add Collection form

    • Once you submit the form, you should be able to verify that your new collection is in the list.

    Create A Rule

    1. Go To Rules Page
    • Go to the Cumulus dashboard, click on Rules in the navigation.
    • Click Add Rule.

    Screenshot of Rules page

2. Complete Form
    • Fill out the template form.

    Screenshot of a Rules template for adding a new rule

    For more details regarding the field definitions and required information go to Data Cookbooks.

    Note: If the state field is left blank, it defaults to false.

    Rule Examples

    • A rule form with completed required fields:

    Screenshot of a completed rule form

    • A successfully added Rule:

    Screenshot of created rule

    Version: v10.0.0

    Setting S3 Lifecycle Policies

    This document will outline, in brief, how to set data lifecycle policies so that you are more easily able to control data storage costs while keeping your data accessible. For more information on why you might want to do this, see the 'Additional Information' section at the end of the document.

    Requirements

    • The AWS CLI installed and configured (if you wish to run the CLI example). See AWS's guide to setting up the AWS CLI for more on this. Please ensure the AWS CLI is in your shell path.
• You will need an S3 bucket on AWS. You are strongly encouraged to use a bucket without voluminous amounts of data in it for experimenting/learning.
    • An AWS user with the appropriate roles to access the target bucket as well as modify bucket policies.

    Examples

    Walk-through on setting time-based S3 Infrequent Access (S3IA) bucket policy

    This example will give step-by-step instructions on updating a bucket's lifecycle policy to move all objects in the bucket from the default storage to S3 Infrequent Access (S3IA) after a period of 90 days. Below are instructions for walking through configuration via the command line and the management console.

    Command Line

    Please ensure you have the AWS CLI installed and configured for access prior to attempting this example.

    Create policy

From any directory you choose, open an editor and add the following to a file named exampleRule.json

{
  "Rules": [
    {
      "Status": "Enabled",
      "Filter": {
        "Prefix": ""
      },
      "Transitions": [
        {
          "Days": 90,
          "StorageClass": "STANDARD_IA"
        }
      ],
      "NoncurrentVersionTransitions": [
        {
          "NoncurrentDays": 90,
          "StorageClass": "STANDARD_IA"
        }
      ],
      "ID": "90DayS3IAExample"
    }
  ]
}

    Set policy

    On the command line run the following command (with the bucket you're working with substituted in place of yourBucketNameHere).

    aws s3api put-bucket-lifecycle-configuration --bucket yourBucketNameHere --lifecycle-configuration file://exampleRule.json

    Verify policy has been set

    To obtain all of the existing policies for a bucket, run the following command (again substituting the correct bucket name):

$ aws s3api get-bucket-lifecycle-configuration --bucket yourBucketNameHere
{
  "Rules": [
    {
      "Status": "Enabled",
      "Filter": {
        "Prefix": ""
      },
      "Transitions": [
        {
          "Days": 90,
          "StorageClass": "STANDARD_IA"
        }
      ],
      "NoncurrentVersionTransitions": [
        {
          "NoncurrentDays": 90,
          "StorageClass": "STANDARD_IA"
        }
      ],
      "ID": "90DayS3IAExample"
    }
  ]
}

    You have set a policy that transitions any version of an object in the bucket to S3IA after each object version has not been modified for 90 days.

    Management Console

    Create Policy

    To create the example policy on a bucket via the management console, go to the following URL (replacing 'yourBucketHere' with the bucket you intend to update):

    https://s3.console.aws.amazon.com/s3/buckets/yourBucketHere/?tab=overview

    You should see a screen similar to:

    Screenshot of AWS console for an S3 bucket

    Click the "Management" Tab, then lifecycle button and press + Add lifecycle rule:

    Screenshot of &quot;Management&quot; tab of AWS console for an S3 bucket

    Give the rule a name (e.g. '90DayRule'), leaving the filter blank:

    Screenshot of window for configuring the name and scope of a lifecycle rule on an S3 bucket in the AWS console

    Click next, and mark Current Version and Previous Versions.

Then for each, click + Add transition and select Transition to Standard-IA after for the Object creation field, and set 90 for the Days after creation/Days after objects become noncurrent field. Your screen should look similar to:

    Screenshot of window for configuring the storage class transitions of a lifecycle rule on an S3 bucket in the AWS console

    Click next, then next past the Configure expiration screen (we won't be setting this), and on the fourth page, click Save:

    Screenshot of window for reviewing the configuration of a lifecycle rule on an S3 bucket in the AWS console

    You should now see you have a rule configured for your bucket:

Screenshot of lifecycle rule appearing in the "Management" tab of AWS console for an S3 bucket

    You have now set a policy that transitions any version of an object in the bucket to S3IA after each object has not been modified for 90 days.

    Additional Information

    This section lists information you may want prior to enacting lifecycle policies. It is not required content for working through the examples.

    Strategy Overview

    For a discussion of overall recommended strategy, please review the Methodology for Data Lifecycle Management on the EarthData wiki.

    AWS Documentation

The examples shown in this document are fairly basic cases. By using object tags, filters, and other configuration options you can enact far more complicated policies for various scenarios. For more reading on the topics presented on this page see:

    Version: v10.0.0

    Monitoring Best Practices

    This document intends to provide a set of recommendations and best practices for monitoring the state of a deployed Cumulus and diagnosing any issues.

    Cumulus-provided resources and integrations for monitoring

Cumulus provides a number of resources that are useful for monitoring the system and its operation.

    Cumulus Dashboard

    The primary tool for monitoring the Cumulus system is the Cumulus Dashboard. The dashboard is hosted on Github and includes instructions on how to deploy and link it into your core Cumulus deployment.

    The dashboard displays workflow executions, their status, inputs, outputs, and some diagnostic information such as logs. For further information on the dashboard, its usage, and the information it provides, see the documentation.

    Cumulus-provided AWS resources

    Cumulus sets up CloudWatch log groups for all Core-provided tasks.

    Monitoring Lambda Functions

    Logging for each Lambda Function is available in Lambda-specific CloudWatch log groups.

    Monitoring ECS services

    Each deployed cumulus_ecs_service module also includes a CloudWatch log group for the processes running on ECS.

    Monitoring workflows

For advanced debugging, we also configure dead letter queues on critical system functions. These will allow you to monitor and debug invalid inputs to the functions we use to start workflows, which can be helpful if you find that you are not seeing workflows being started as expected. More information on these can be found in the dead letter queue documentation.

    AWS recommendations

    AWS has a number of recommendations on system monitoring. Rather than reproduce those here and risk providing outdated guidance, we've documented the following links which will take you to available AWS docs on monitoring recommendations and best practices for the services used in Cumulus:

    Example: Setting up email notifications for CloudWatch logs

    Cumulus does not provide out-of-the-box support for email notifications at this time. However, setting up email notifications on AWS is fairly straightforward in that the operative components are an AWS SNS topic and a subscribed email address.

    In terms of Cumulus integration, forwarding CloudWatch logs requires creating a mechanism, most likely a Lambda Function subscribed to the log group that will receive, filter and forward these messages to the SNS topic.

    As a very simple example, we could create a function that filters CloudWatch logs created by the @cumulus/logger package and sends email notifications for error and fatal log levels, adapting the example linked above:

    const zlib = require('zlib');
    const aws = require('aws-sdk');
    const { promisify } = require('util');

    const gunzip = promisify(zlib.gunzip);
    const sns = new aws.SNS();

exports.handler = async (event) => {
  const payload = Buffer.from(event.awslogs.data, 'base64');
  const decompressedData = await gunzip(payload);
  const logData = JSON.parse(decompressedData.toString('ascii'));
  return await Promise.all(logData.logEvents.map(async (logEvent) => {
    const logMessage = JSON.parse(logEvent.message);
    if (['error', 'fatal'].includes(logMessage.level)) {
      return sns.publish({
        TopicArn: process.env.EmailReportingTopicArn,
        Message: logEvent.message
      }).promise();
    }
    return Promise.resolve();
  }));
};

After creating the SNS topic, we can deploy this code as a lambda function, following the setup steps from Amazon. Make sure to include your SNS topic ARN as an environment variable on the lambda function by using the --environment option on aws lambda create-function.

    You will need to create subscription filters for each log group you want to receive emails for. We recommend automating this as much as possible, and you could very well handle this via Terraform, such as using a module to deploy filters alongside log groups, or exporting the log group names to an all-in-one email notification module.
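
As a sketch of the Terraform approach, assuming a forwarding Lambda defined elsewhere in your configuration as aws_lambda_function.log_email_forwarder (you will also need an aws_lambda_permission that allows CloudWatch Logs to invoke it):

resource "aws_cloudwatch_log_subscription_filter" "error_email_forwarding" {
  name            = "error-email-forwarding"
  log_group_name  = "/aws/lambda/KinesisInboundLogger"
  filter_pattern  = ""  # forward everything; level filtering happens in the Lambda above
  destination_arn = aws_lambda_function.log_email_forwarder.arn
}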

    Version: v10.0.0

    S3 Server Access Logging

    Via AWS Console

    Enable server access logging for an S3 bucket

    Via AWS Command Line Interface

    1. Create a logging.json file with these contents, replacing <stack-internal-bucket> with your stack's internal bucket name, and <stack> with the name of your cumulus stack.

      {
      "LoggingEnabled": {
      "TargetBucket": "<stack-internal-bucket>",
      "TargetPrefix": "<stack>/ems-distribution/s3-server-access-logs/"
      }
      }
    2. Add the logging policy to each of your protected and public buckets by calling this command on each bucket.

      aws s3api put-bucket-logging --bucket <protected/public-bucket-name> --bucket-logging-status file://logging.json
    3. Verify the logging policy exists on your buckets.

      aws s3api get-bucket-logging --bucket <protected/public-bucket-name>
    Version: v10.0.0

    Configuration of Tasks

    The cumulus module exposes values for configuration for some of the provided archive and ingest tasks. Currently the following are available as configurable variables:

    cmr_search_client_config

    Configuration parameters for CMR search client for cumulus archive module tasks in the form:

<lambda_identifier>_report_cmr_limit = <maximum number of records that can be returned from a cmr-client search; this should be greater than cmr_page_size>
    <lambda_identifier>_report_cmr_page_size = <number of records for each page returned from CMR>
    type = map(string)

More information about cmr limit and cmr page_size can be found in the @cumulus/cmr-client documentation and the CMR Search API documentation.

    Currently the following values are supported:

    • create_reconciliation_report_cmr_limit
    • create_reconciliation_report_cmr_page_size

    Example

    cmr_search_client_config = {
    create_reconciliation_report_cmr_limit = 2500
    create_reconciliation_report_cmr_page_size = 250
    }

    elasticsearch_client_config

    Configuration parameters for Elasticsearch client for cumulus archive module tasks in the form:

    <lambda_identifier>_es_scroll_duration = <duration>
    <lambda_identifier>_es_scroll_size = <size>
    type = map(string)

    Currently the following values are supported:

    • create_reconciliation_report_es_scroll_duration
    • create_reconciliation_report_es_scroll_size

    Example

    elasticsearch_client_config = {
    create_reconciliation_report_es_scroll_duration = "15m"
    create_reconciliation_report_es_scroll_size = 2000
    }

    lambda_timeouts

    A configurable map of timeouts (in seconds) for cumulus ingest module task lambdas in the form:

    <lambda_identifier>_timeout: <timeout>
    type = map(string)

    Currently the following values are supported:

    • discover_granules_task_timeout
    • discover_pdrs_task_timeout
    • hyrax_metadata_update_tasks_timeout
    • lzards_backup_task_timeout
    • move_granules_task_timeout
    • parse_pdr_task_timeout
    • pdr_status_check_task_timeout
    • post_to_cmr_task_timeout
    • queue_granules_task_timeout
    • queue_pdrs_task_timeout
    • queue_workflow_task_timeout
    • sync_granule_task_timeout
    • update_granules_cmr_metadata_file_links_task_timeout

    Example

    lambda_timeouts = {
    discover_granules_task_timeout = 300
    }
    Version: v10.0.0

    About Cookbooks

    Introduction

The following data cookbooks are documents containing examples and explanations of workflows in the Cumulus framework. Additionally, they should serve to help unify an institution/user group on a set of terms.

    Setup

    The data cookbooks assume you can configure providers, collections, and rules to run workflows. Visit Cumulus data management types for information on how to configure Cumulus data management types.

    Adding a page

    As shown in detail in the "Add a New Page and Sidebars" section in Cumulus Docs: How To's, you can add a new page to the data cookbook by creating a markdown (.md) file in the docs/data-cookbooks directory. The new page can then be linked to the sidebar by adding it to the Data-Cookbooks object in the website/sidebar.json file as data-cookbooks/${id}.

    More about workflows

    Workflow general information

    Input & Output

    Developing Workflow Tasks

    Workflow Configuration How-to's

Ingest Browse Generation

... provider keys with the previously entered values). Note that you need to set the "provider_path" to the path on your bucket (e.g. "/data") where you've staged your mock/test data:

    {
    "name": "TestBrowseGeneration",
    "workflow": "DiscoverGranulesBrowseExample",
    "provider": "{{provider_from_previous_step}}",
    "collection": {
    "name": "MOD09GQ",
    "version": "006"
    },
    "meta": {
    "provider_path": "{{path_to_data}}"
    },
    "rule": {
    "type": "onetime"
    },
    "state": "ENABLED",
    "updatedAt": 1553053438767
    }

    Run Workflows

Once you've configured the Collection and Provider and added a onetime rule, you're ready to trigger your rule and watch the ingest workflows run.

    Go to the Rules tab, click the rule you just created:

    Screenshot of the Rules overview page with a list of rules in the Cumulus dashboard

    Then click the gear in the upper right corner and click "Rerun":

    Screenshot of clicking the button to rerun a workflow rule from the rule edit page in the Cumulus dashboard

    Tab over to executions and you should see the DiscoverGranulesBrowseExample workflow run, succeed, and then moments later the CookbookBrowseExample should run and succeed.

    Screenshot of page listing executions in the Cumulus dashboard

    Results

    You can verify your data has ingested by clicking the successful workflow entry:

    Screenshot of individual entry from table listing executions in the Cumulus dashboard

    Select "Show Output" on the next page

    Screenshot of &quot;Show output&quot; button from individual execution page in the Cumulus dashboard

    and you should see in the payload from the workflow something similar to:

    "payload": {
    "process": "modis",
    "granules": [
    {
    "files": [
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "key": "MOD09GQ___006/2017/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "type": "data",
    "bucket": "cumulus-test-sandbox-protected",
    "path": "data",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}/{substring(file.fileName, 0, 3)}",
    "size": 1908635
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "key": "MOD09GQ___006/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "type": "metadata",
    "bucket": "cumulus-test-sandbox-private",
    "path": "data",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{substring(file.fileName, 0, 3)}",
    "size": 21708
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
    "key": "MOD09GQ___006/2017/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
    "type": "browse",
    "bucket": "cumulus-test-sandbox-protected",
    "path": "data",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}/{substring(file.fileName, 0, 3)}",
    "size": 1908635
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.cmr.xml",
    "key": "MOD09GQ___006/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.cmr.xml",
    "type": "metadata",
    "bucket": "cumulus-test-sandbox-protected-2",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{substring(file.fileName, 0, 3)}"
    }
    ],
    "cmrLink": "https://cmr.uat.earthdata.nasa.gov/search/granules.json?concept_id=G1222231611-CUMULUS",
    "cmrConceptId": "G1222231611-CUMULUS",
    "granuleId": "MOD09GQ.A2016358.h13v04.006.2016360104606",
    "cmrMetadataFormat": "echo10",
    "dataType": "MOD09GQ",
    "version": "006",
    "published": true
    }
    ]
    }

You can verify the granules exist within your Cumulus instance (search using the Granules interface, check the S3 buckets, etc.) and validate that the above CMR entry is present and correct.
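For example, you can spot-check the archived files and the CMR entry from the command line using the bucket, key, and concept ID values shown in the payload above:

    # List the archived data and browse files for the example granule
    aws s3 ls s3://cumulus-test-sandbox-protected/MOD09GQ___006/2017/MOD/ | grep MOD09GQ.A2016358.h13v04.006.2016360104606

    # Confirm the CMR (UAT) entry referenced in cmrLink resolves
    curl "https://cmr.uat.earthdata.nasa.gov/search/granules.json?concept_id=G1222231611-CUMULUS"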


    Build Processing Lambda

    This section discusses the construction of a custom processing lambda to replace the contrived example from this entry for a real dataset processing task.

    To ingest your own data using this example, you will need to construct your own lambda to replace the source in ProcessingStep that will generate browse imagery and provide or update a CMR metadata export file.

You will then need to add the lambda to your Cumulus deployment as an aws_lambda_function Terraform resource.
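A minimal sketch of such a resource is shown below. The function, filename, and handler names are hypothetical placeholders; the role shown assumes the lambda_processing_role_arn output of the cumulus module, and the CMA layer is referenced by an assumed variable name that you should replace with whatever your deployment uses:

    resource "aws_lambda_function" "browse_processing" {
      function_name    = "${var.prefix}-BrowseProcessing"
      filename         = "browse_processing.zip"
      source_code_hash = filebase64sha256("browse_processing.zip")
      handler          = "index.handler"
      runtime          = "nodejs12.x"
      timeout          = 300
      memory_size      = 1024
      role             = module.cumulus.lambda_processing_role_arn
      layers           = [var.cumulus_message_adapter_lambda_layer_version_arn]
    }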

    The discussion below outlines requirements for this lambda.

    Inputs

    The incoming message to the task defined in the ProcessingStep as configured will have the following configuration values (accessible inside event.config courtesy of the message adapter):

    Configuration

    • event.config.bucket -- the name of the bucket configured in terraform.tfvars as your internal bucket.

    • event.config.collection -- The full collection object we will configure in the Configure Ingest section. You can view the expected collection schema in the docs here or in the source code on github. You need this as available input and output so you can update as needed.

    event.config.additionalUrls, generateFakeBrowse and event.config.cmrMetadataFormat from the example can be ignored as they're configuration flags for the provided example script.

    Payload

    The 'payload' from the previous task is accessible via event.input. The expected payload output schema from SyncGranules can be viewed here.

    In our example, the payload would look like the following. Note: The types are set per-file based on what we configured in our collection, and were initially added as part of the DiscoverGranules step in the DiscoverGranulesBrowseExample workflow.

     "payload": {
    "process": "modis",
    "granules": [
    {
    "granuleId": "MOD09GQ.A2016358.h13v04.006.2016360104606",
    "dataType": "MOD09GQ",
    "version": "006",
    "files": [
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "bucket": "cumulus-test-sandbox-internal",
    "key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "size": 1908635
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "bucket": "cumulus-test-sandbox-internal",
    "key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "size": 21708
    }
    ]
    }
    ]
    }

    Generating Browse Imagery

The example script used in this entry goes through all granules and adds a 'fake' .jpg browse file to the same staging location as the data staged by prior ingest tasks.

The processing lambda you construct will need to do the following (a minimal sketch follows the list below):

• Create a browse image file based on the input data, and stage it to an S3 location accessible to both this task and the FilesToGranules and MoveGranules tasks.
    • Add the browse file to the input granule files, making sure to set the granule file's type to browse.
    • Update meta.input_granules with the updated granules list, as well as provide the files to be integrated by FilesToGranules as output from the task.
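Following the requirements above, a minimal Python sketch of such a handler might look like the following. It assumes the Cumulus Message Adapter has already unwrapped the event so that event["input"] matches the Payload section above; the function name and the trivial "copy the data file as a browse file" stand-in are hypothetical and would be replaced by real browse generation:

    import boto3

    s3 = boto3.client("s3")


    def process_granules(event, context):
        # event["config"] (bucket, collection, etc.) is also available for real implementations.
        granules = event["input"]["granules"]
        output_files = []

        for granule in granules:
            data_file = granule["files"][0]
            browse_key = data_file["key"].rsplit(".", 1)[0] + ".jpg"

            # 1. "Generate" a browse image and stage it alongside the data staged by
            #    SyncGranule (a real task would render an actual image here).
            s3.copy_object(
                Bucket=data_file["bucket"],
                CopySource={"Bucket": data_file["bucket"], "Key": data_file["key"]},
                Key=browse_key,
            )

            # 2. Add the browse file to the granule's file list with type "browse".
            granule["files"].append({
                "fileName": browse_key.split("/")[-1],
                "bucket": data_file["bucket"],
                "key": browse_key,
                "type": "browse",
            })

            # 3. Collect the full list of staged files for FilesToGranules.
            output_files.extend(
                "s3://{}/{}".format(f["bucket"], f["key"]) for f in granule["files"]
            )

        # The workflow's cumulus_message configuration is expected to map "granules"
        # to meta.input_granules and "files" to the payload.
        return {"granules": granules, "files": output_files}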

    Generating/updating CMR metadata

If you do not already have a CMR file in the granules list, you will need to generate one for valid export. This example's processing script generates one and adds it to the FilesToGranules file list via the payload, but it can also be present in the InputGranules from the DiscoverGranules task if you'd prefer to pre-generate it.

The downstream tasks MoveGranules, UpdateGranulesCmrMetadataFileLinks, and PostToCmr all expect a valid CMR file to be available if you want to export to CMR.

    Expected Outputs for processing task/tasks

    In the above example, the critical portion of the output to FilesToGranules is the payload and meta.input_granules.

In the example provided, the processing task is set up to return an object with the keys "files" and "granules". In the cumulus_message configuration, "files" is mapped to the payload and "granules" to meta.input_granules:

              "task_config": {
    "inputGranules": "{$.meta.input_granules}",
    "granuleIdExtraction": "{$.meta.collection.granuleIdExtraction}"
    }

    Their expected values from the example above may be useful in constructing a processing task:

    payload

The payload includes a full list of files to be 'moved' into the cumulus archive. The FilesToGranules task will take this list, merge it with the information from InputGranules, then pass that list to the MoveGranules task. The MoveGranules task will then move the files to their targets. The UpdateGranulesCmrMetadataFileLinks task will update the CMR metadata file, if it exists, with the updated granule locations and update the CMR file etags.

    In the provided example, a payload being passed to the FilesToGranules task should be expected to look like:

      "payload": [
    "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
    "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.cmr.xml"
    ]

This is the list of files that FilesToGranules will act upon to add/merge with the input_granules object.

    The pathing is generated from sync-granules, but in principle the files can be staged wherever you like so long as the processing/MoveGranules task's roles have access and the filename matches the collection configuration.

    input_granules

The FilesToGranules task utilizes the incoming payload to choose which files to move, but pulls all other metadata from meta.input_granules. As such, the meta.input_granules output in the example would look like:

    "input_granules": [
    {
    "granuleId": "MOD09GQ.A2016358.h13v04.006.2016360104606",
    "dataType": "MOD09GQ",
    "version": "006",
    "files": [
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "bucket": "cumulus-test-sandbox-internal",
    "key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "size": 1908635
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "bucket": "cumulus-test-sandbox-internal",
    "key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "size": 21708
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
    "bucket": "cumulus-test-sandbox-internal",
    "key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.jpg"
    }
    ]
    }
    ],
    - + \ No newline at end of file diff --git a/docs/v10.0.0/data-cookbooks/choice-states/index.html b/docs/v10.0.0/data-cookbooks/choice-states/index.html index 7a31413c7fa..a0d445ee1eb 100644 --- a/docs/v10.0.0/data-cookbooks/choice-states/index.html +++ b/docs/v10.0.0/data-cookbooks/choice-states/index.html @@ -5,13 +5,13 @@ Choice States | Cumulus Documentation - +
    Version: v10.0.0

    Choice States

    Cumulus supports AWS Step Function Choice states. A Choice state enables branching logic in Cumulus workflows.

    Choice state definitions include a list of Choice Rules. Each Choice Rule defines a logical operation which compares an input value against a value using a comparison operator. For available comparison operators, review the AWS docs.

    If the comparison evaluates to true, the Next state is followed.

    Example

    In examples/cumulus-tf/parse_pdr_workflow.tf the ParsePdr workflow uses a Choice state, CheckAgainChoice, to terminate the workflow once meta.isPdrFinished: true is returned by the CheckStatus state.

    The CheckAgainChoice state definition requires an input object of the following structure:

    {
    "meta": {
    "isPdrFinished": false
    }
    }

    Given the above input to the CheckAgainChoice state, the workflow would transition to the PdrStatusReport state.

    "CheckAgainChoice": {
    "Type": "Choice",
    "Choices": [
    {
    "Variable": "$.meta.isPdrFinished",
    "BooleanEquals": false,
    "Next": "PdrStatusReport"
    },
    {
    "Variable": "$.meta.isPdrFinished",
    "BooleanEquals": true,
    "Next": "WorkflowSucceeded"
    }
    ],
    "Default": "WorkflowSucceeded"
    }

    Advanced: Loops in Cumulus Workflows

    Understanding the complete ParsePdr workflow is not necessary to understanding how Choice states work, but ParsePdr provides an example of how Choice states can be used to create a loop in a Cumulus workflow.

In the complete ParsePdr workflow definition, the state QueueGranules is followed by CheckStatus. From CheckStatus a loop starts: if CheckStatus returns meta.isPdrFinished: false, it is followed by CheckAgainChoice, then PdrStatusReport, then WaitForSomeTime, which returns to CheckStatus. Once CheckStatus returns meta.isPdrFinished: true, CheckAgainChoice proceeds to WorkflowSucceeded.

    Execution graph of SIPS ParsePdr workflow in AWS Step Functions console

    Further documentation

    For complete details on Choice state configuration options, see the Choice state documentation.

    - + \ No newline at end of file diff --git a/docs/v10.0.0/data-cookbooks/cnm-workflow/index.html b/docs/v10.0.0/data-cookbooks/cnm-workflow/index.html index 08ff9d56173..9a44c2b2543 100644 --- a/docs/v10.0.0/data-cookbooks/cnm-workflow/index.html +++ b/docs/v10.0.0/data-cookbooks/cnm-workflow/index.html @@ -5,7 +5,7 @@ CNM Workflow | Cumulus Documentation - + @@ -13,7 +13,7 @@
    Version: v10.0.0

    CNM Workflow

    This entry documents how to setup a workflow that utilizes the built-in CNM/Kinesis functionality in Cumulus.

    Prior to working through this entry you should be familiar with the Cloud Notification Mechanism.

    Sections


    Prerequisites

    Cumulus

    This entry assumes you have a deployed instance of Cumulus (version >= 1.16.0). The entry assumes you are deploying Cumulus via the cumulus terraform module sourced from the release page.

    AWS CLI

    This entry assumes you have the AWS CLI installed and configured. If you do not, please take a moment to review the documentation - particularly the examples relevant to Kinesis - and install it now.

    Kinesis

This entry assumes you already have two Kinesis data streams created for use as the CNM notification and response data streams.

If you do not have two streams set up, please take a moment to review the Kinesis documentation and set up two basic single-shard streams for this example:

Using the "Create Data Stream" button on the Kinesis Dashboard, work through the dialog; you should be able to quickly set up streams similar to the following example:

    Screenshot of AWS console page for creating a Kinesis stream
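If you prefer the AWS CLI to the console, two basic single-shard streams can be created with commands like the following (the stream names here are placeholders of your choosing):

    aws kinesis create-stream --stream-name <prefix>-cnm-notification-stream --shard-count 1
    aws kinesis create-stream --stream-name <prefix>-cnm-response-stream --shard-count 1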

Please bear in mind that if you create the Kinesis streams with a dashboard user, your {{prefix}}-lambda-processing IAM role will need permissions to write to the response stream for this workflow to succeed. If you are using the cumulus top-level module for your deployment, this should be set properly.

If not, the most straightforward approach is to attach the AmazonKinesisFullAccess policy for the stream resource to whatever role your Lambdas are using; however, your environment/security policies may require an approach specific to your deployment environment.
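For example, attaching that AWS managed policy from the CLI might look like the following (the role name assumes the {{prefix}}-lambda-processing convention mentioned above; scope permissions down as your security posture requires):

    aws iam attach-role-policy \
      --role-name {{prefix}}-lambda-processing \
      --policy-arn arn:aws:iam::aws:policy/AmazonKinesisFullAccess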

In operational environments, science data providers would typically be responsible for providing a Kinesis stream with the appropriate permissions.

    For more information on how this process works and how to develop a process that will add records to a stream, read the Kinesis documentation and the developer guide.

    Source Data

    This entry will run the SyncGranule task against a single target data file. To that end it will require a single data file to be present in an S3 bucket matching the Provider configured in the next section.

    Collection and Provider

    Cumulus will need to be configured with a Collection and Provider entry of your choosing. The provider should match the location of the source data from the Ingest Source Data section.

    This can be done via the Cumulus Dashboard if installed or the API. It is strongly recommended to use the dashboard if possible.


    Configure the Workflow

    Provided the prerequisites have been fulfilled, you can begin adding the needed values to your Cumulus configuration to configure the example workflow.

    The following are steps that are required to set up your Cumulus instance to run the example workflow:

    Example CNM Workflow

    In this example, we're going to trigger a workflow by creating a Kinesis rule and sending a record to a Kinesis stream.

    The following workflow definition should be added to a new .tf workflow resource (e.g. cnm_workflow.tf) in your deployment directory. For the complete CNM workflow example, see examples/cumulus-tf/kinesis_trigger_test_workflow.tf.

    Add the following to the new terraform file in your deployment directory, updating the following:

    • Set the response-endpoint key in the CnmResponse task in the workflow JSON to match the name of the Kinesis response stream you configured in the prerequisites section
    • Update the source key to the workflow module to match the Cumulus release associated with your deployment.
    module "cnm_workflow" {
    source = "https://github.com/nasa/cumulus/releases/download/{version}/terraform-aws-cumulus-workflow.zip"

    prefix = var.prefix
    name = "CNMExampleWorkflow"
    workflow_config = module.cumulus.workflow_config
    system_bucket = var.system_bucket

state_machine_definition = <<JSON
{
    "Comment": "CNMExampleWorkflow",
    "StartAt": "TranslateMessage",
    "States": {
    "TranslateMessage": {
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_to_cma_task.arn}",
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "collection": "{$.meta.collection}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.cnm}",
    "destination": "{$.meta.cnm}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "CnmResponse"
    }
    ],
    "Next": "SyncGranule"
    },
    "SyncGranule": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "provider": "{$.meta.provider}",
    "buckets": "{$.meta.buckets}",
    "collection": "{$.meta.collection}",
    "downloadBucket": "{$.meta.buckets.private.name}",
    "stack": "{$.meta.stack}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.granules}",
    "destination": "{$.meta.input_granules}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${module.cumulus.sync_granule_task.task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "IntervalSeconds": 10,
    "MaxAttempts": 3
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "CnmResponse"
    }
    ],
    "Next": "CnmResponse"
    },
    "CnmResponse": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "OriginalCNM": "{$.meta.cnm}",
    "distribution_endpoint": "{$.meta.distribution_endpoint}",
    "response-endpoint": "ADD YOUR RESPONSE STREAM NAME HERE",
    "region": "us-east-1",
    "type": "kinesis",
    "WorkflowException": "{$.exception}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.cnm}",
    "destination": "{$.meta.cnmResponse}"
    },
    {
    "source": "{$.input.input}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_response_task.arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "IntervalSeconds": 5,
    "MaxAttempts": 3
    }
    ],
    "End": true
    }
    }
    }
JSON
}

    Again, please make sure to modify the value response-endpoint to match the stream name (not ARN) for your Kinesis response stream.

    Lambda Configuration

    To execute this workflow, you're required to include several Lambda resources in your deployment. To do this, add the following task (Lambda) definitions to your deployment along with the workflow you created above:

    Please note: To utilize these tasks you need to ensure you have a compatible CMA layer. See the deployment instructions for more details on how to deploy a CMA layer.

    Below is a description of each of these tasks:

    CNMToCMA

    CNMToCMA is meant for the beginning of a workflow: it maps CNM granule information to a payload for downstream tasks. For other CNM workflows, you would need to ensure that downstream tasks in your workflow either understand the CNM message or include a translation task like this one.

    You can also manipulate the data sent to downstream tasks using task_config for various states in your workflow resource configuration. Read more about how to configure data on the Workflow Input & Output page.

    CnmResponse

    The CnmResponse Lambda generates a CNM response message and puts it on the response-endpoint Kinesis stream.

    You can read more about the expected schema of a CnmResponse record in the Cloud Notification Mechanism schema repository.

    Additional Tasks

    Lastly, this entry also makes use of the SyncGranule task from the cumulus module.

    Redeploy

    Once the above configuration changes have been made, redeploy your stack.

    Please refer to Update Cumulus resources in the deployment documentation if you are unfamiliar with redeployment.

    Rule Configuration

    Cumulus includes a messageConsumer Lambda function (message-consumer). Cumulus kinesis-type rules create the event source mappings between Kinesis streams and the messageConsumer Lambda. The messageConsumer Lambda consumes records from one or more Kinesis streams, as defined by enabled kinesis-type rules. When new records are pushed to one of these streams, the messageConsumer triggers workflows associated with the enabled kinesis-type rules.

    To add a rule via the dashboard (if you'd like to use the API, see the docs here), navigate to the Rules page and click Add a rule, then configure the new rule using the following template (substituting correct values for parameters denoted by ${}):

    {
    "collection": {
    "name": "L2_HR_PIXC",
    "version": "000"
    },
    "name": "L2_HR_PIXC_kinesisRule",
    "provider": "PODAAC_SWOT",
    "rule": {
    "type": "kinesis",
    "value": "arn:aws:kinesis:{{awsRegion}}:{{awsAccountId}}:stream/{{streamName}}"
    },
    "state": "ENABLED",
    "workflow": "CNMExampleWorkflow"
    }

    Please Note:

• The rule's value attribute must match the Amazon Resource Name (ARN) of the Kinesis data stream you've preconfigured. You should be able to obtain this ARN from the Kinesis Dashboard entry for the selected stream, or via the CLI as shown below.
    • The collection and provider should match the collection and provider you setup in the Prerequisites section.
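If you'd rather obtain the stream ARN from the CLI than the console, describe-stream-summary returns it directly (the stream name is a placeholder):

    aws kinesis describe-stream-summary \
      --stream-name <your_notification_stream_name> \
      --query 'StreamDescriptionSummary.StreamARN' \
      --output text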

    Once you've clicked on 'submit' a new rule should appear in the dashboard's Rule Overview.


    Execute the Workflow

    Once Cumulus has been redeployed and a rule has been added, we're ready to trigger the workflow and watch it execute.

    How to Trigger the Workflow

    To trigger matching workflows, you will need to put a record on the Kinesis stream that the message-consumer Lambda will recognize as a matching event. Most importantly, it should include a collection name that matches a valid collection.

    For the purpose of this example, the easiest way to accomplish this is using the AWS CLI.

    Create Record JSON

    Construct a JSON file containing an object that matches the values that have been previously setup. This JSON object should be a valid Cloud Notification Mechanism message.

    Please note: this example is somewhat contrived, as the downstream tasks don't care about most of these fields. A 'real' data ingest workflow would.

    The following values (denoted by ${} in the sample below) should be replaced to match values we've previously configured:

    • TEST_DATA_FILE_NAME: The filename of the test data that is available in the S3 (or other) provider we created earlier.
    • TEST_DATA_URI: The full S3 path to the test data (e.g. s3://bucket-name/path/granule)
    • COLLECTION: The collection name defined in the prerequisites for this product
    {
    "product": {
    "files": [
    {
    "checksumType": "md5",
    "name": "${TEST_DATA_FILE_NAME}",
    "checksum": "bogus_checksum_value",
    "uri": "${TEST_DATA_URI}",
    "type": "data",
    "size": 12345678
    }
    ],
    "name": "${TEST_DATA_FILE_NAME}",
    "dataVersion": "006"
    },
    "identifier ": "testIdentifier123456",
    "collection": "${COLLECTION}",
    "provider": "TestProvider",
    "version": "001",
    "submissionTime": "2017-09-30T03:42:29.791198"
    }

    Add Record to Kinesis Data Stream

    Using the JSON file you created, push it to the Kinesis notification stream:

    aws kinesis put-record --stream-name YOUR_KINESIS_NOTIFICATION_STREAM_NAME_HERE --partition-key 1 --data file:///path/to/file.json

    Please note: The above command uses the stream name, not the ARN.

    The command should return output similar to:

    {
    "ShardId": "shardId-000000000000",
    "SequenceNumber": "42356659532578640215890215117033555573986830588739321858"
    }

    This command will put a record containing the JSON from the --data flag onto the Kinesis data stream. The messageConsumer Lambda will consume the record and construct a valid CMA payload to trigger workflows. For this example, the record will trigger the CNMExampleWorkflow workflow as defined by the rule previously configured.

    You can view the current running executions on the Executions dashboard page which presents a list of all executions, their status (running, failed, or completed), to which workflow the execution belongs, along with other information.

    Verify Workflow Execution

As detailed above, once the record is added to the Kinesis data stream, the messageConsumer Lambda will trigger the CNMExampleWorkflow.

    TranslateMessage

    TranslateMessage (which corresponds to the CNMToCMA Lambda) will take the CNM object payload and add a granules object to the CMA payload that's consistent with other Cumulus ingest tasks, and add a meta.cnm key (as well as the payload) to store the original message.

    For more on the Message Adapter, please see the Message Flow documentation.

    An example of what is happening in the CNMToCMA Lambda is as follows:

    Example Input Payload:

    "payload": {
    "identifier ": "testIdentifier123456",
    "product": {
    "files": [
    {
    "checksumType": "md5",
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "checksum": "bogus_checksum_value",
    "uri": "s3://some_bucket/cumulus-test-data/pdrs/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "type": "data",
    "size": 12345678
    }
    ],
    "name": "TestGranuleUR",
    "dataVersion": "006"
    },
    "version": "123456",
    "collection": "MOD09GQ",
    "provider": "TestProvider",
    "submissionTime": "2017-09-30T03:42:29.791198"
    }

    Example Output Payload:

      "payload": {
    "cnm": {
    "identifier ": "testIdentifier123456",
    "product": {
    "files": [
    {
    "checksumType": "md5",
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "checksum": "bogus_checksum_value",
    "uri": "s3://some-bucket/cumulus-test-data/data/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "type": "data",
    "size": 12345678
    }
    ],
    "name": "TestGranuleUR",
    "dataVersion": "006"
    },
    "version": "123456",
    "collection": "MOD09GQ",
    "provider": "TestProvider",
    "submissionTime": "2017-09-30T03:42:29.791198",
    "receivedTime": "2017-09-30T03:42:31.634552"
    },
    "output": {
    "granules": [
    {
    "granuleId": "TestGranuleUR",
    "files": [
    {
    "path": "some-bucket/data",
    "url_path": "s3://some-bucket/cumulus-test-data/data/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "bucket": "some-bucket",
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "size": 12345678
    }
    ]
    }
    ]
    }
    }

    SyncGranules

    This Lambda will take the files listed in the payload and move them to s3://{deployment-private-bucket}/file-staging/{deployment-name}/{COLLECTION}/{file_name}.

    CnmResponse

    Assuming a successful execution of the workflow, this task will recover the meta.cnm key from the CMA output, and add a "SUCCESS" record to the notification Kinesis stream.

    If a prior step in the workflow has failed, this will add a "FAILURE" record to the stream instead.

    The data written to the response-endpoint should adhere to the Response Message Fields schema.

    Example CNM Success Response:

    {
    "provider": "PODAAC_SWOT",
    "collection": "SWOT_Prod_l2:1",
    "processCompleteTime": "2017-09-30T03:45:29.791198",
    "submissionTime": "2017-09-30T03:42:29.791198",
    "receivedTime": "2017-09-30T03:42:31.634552",
    "identifier": "1234-abcd-efg0-9876",
    "response": {
    "status": "SUCCESS"
    }
    }

    Example CNM Error Response:

    {
    "provider": "PODAAC_SWOT",
    "collection": "SWOT_Prod_l2:1",
    "processCompleteTime": "2017-09-30T03:45:29.791198",
    "submissionTime": "2017-09-30T03:42:29.791198",
    "receivedTime": "2017-09-30T03:42:31.634552",
    "identifier": "1234-abcd-efg0-9876",
    "response": {
    "status": "FAILURE",
    "errorCode": "PROCESSING_ERROR",
    "errorMessage": "File [cumulus-dev-a4d38f59-5e57-590c-a2be-58640db02d91/prod_20170926T11:30:36/production_file.nc] did not match gve checksum value."
    }
    }

    Note the CnmResponse state defined in the .tf workflow definition above configures $.exception to be passed to the CnmResponse Lambda keyed under config.WorkflowException. This is required for the CnmResponse code to deliver a failure response.

    To test the failure scenario, send a record missing the product.name key.


    Verify results

    Check for successful execution on the dashboard

    Following the successful execution of this workflow, you should expect to see the workflow complete successfully on the dashboard:

    Screenshot of a successful CNM workflow appearing on the executions page of the Cumulus dashboard

    Check the test granule has been delivered to S3 staging

    The test granule identified in the Kinesis record should be moved to the deployment's private staging area.

    Check for Kinesis records

    A SUCCESS notification should be present on the response-endpoint Kinesis stream.

    You should be able to validate the notification and response streams have the expected records with the following steps (the AWS CLI Kinesis Basic Stream Operations is useful to review before proceeding):

    Get a shard iterator (substituting your stream name as appropriate):

    aws kinesis get-shard-iterator \
    --shard-id shardId-000000000000 \
    --shard-iterator-type LATEST \
    --stream-name NOTIFICATION_OR_RESPONSE_STREAM_NAME

which should result in output similar to:

    {
    "ShardIterator": "VeryLongString=="
    }
• Re-trigger the workflow by using the put-record command from above.
    • As the workflow completes, use the output from the get-shard-iterator command to request data from the stream:
    aws kinesis get-records --shard-iterator SHARD_ITERATOR_VALUE

    This should result in output similar to:

    {
    "Records": [
    {
    "SequenceNumber": "49586720336541656798369548102057798835250389930873978882",
    "ApproximateArrivalTimestamp": 1532664689.128,
    "Data": "eyJpZGVudGlmaWVyICI6InRlc3RJZGVudGlmaWVyMTIzNDU2IiwidmVyc2lvbiI6IjAwNiIsImNvbGxlY3Rpb24iOiJNT0QwOUdRIiwicHJvdmlkZXIiOiJUZXN0UHJvdmlkZXIiLCJwcm9kdWN0U2l6ZSI6MTkwODYzNS4wLCJyZXNwb25zZSI6eyJzdGF0dXMiOiJTVUNDRVNTIn0sInByb2Nlc3NDb21wbGV0ZVRpbWUiOiIyMDE4LTA3LTI3VDA0OjExOjI4LjkxOSJ9",
    "PartitionKey": "1"
    },
    {
    "SequenceNumber": "49586720336541656798369548102059007761070005796999266306",
    "ApproximateArrivalTimestamp": 1532664707.149,
    "Data": "eyJpZGVudGlmaWVyICI6InRlc3RJZGVudGlmaWVyMTIzNDU2IiwidmVyc2lvbiI6IjAwNiIsImNvbGxlY3Rpb24iOiJNT0QwOUdRIiwicHJvdmlkZXIiOiJUZXN0UHJvdmlkZXIiLCJwcm9kdWN0U2l6ZSI6MTkwODYzNS4wLCJyZXNwb25zZSI6eyJzdGF0dXMiOiJTVUNDRVNTIn0sInByb2Nlc3NDb21wbGV0ZVRpbWUiOiIyMDE4LTA3LTI3VDA0OjExOjQ2Ljk1OCJ9",
    "PartitionKey": "1"
    }
    ],
    "NextShardIterator": "AAAAAAAAAAFo9SkF8RzVYIEmIsTN+1PYuyRRdlj4Gmy3dBzsLEBxLo4OU+2Xj1AFYr8DVBodtAiXbs3KD7tGkOFsilD9R5tA+5w9SkGJZ+DRRXWWCywh+yDPVE0KtzeI0andAXDh9yTvs7fLfHH6R4MN9Gutb82k3lD8ugFUCeBVo0xwJULVqFZEFh3KXWruo6KOG79cz2EF7vFApx+skanQPveIMz/80V72KQvb6XNmg6WBhdjqAA==",
    "MillisBehindLatest": 0
    }

Note the data encoding is not human readable and would need to be parsed/converted to be interpretable. There are many options to build a Kinesis consumer, such as the KCL.
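For a quick look at an individual record, you can base64-decode the Data field from the get-records output, e.g.:

    echo "<Data value from the get-records output>" | base64 --decode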

    For purposes of validating the workflow, it may be simpler to locate the workflow in the Step Function Management Console and assert the expected output is similar to the below examples.

    Successful CNM Response Object Example:

    {
    "cnmResponse": {
    "provider": "TestProvider",
    "collection": "MOD09GQ",
    "version": "123456",
    "processCompleteTime": "2017-09-30T03:45:29.791198",
    "submissionTime": "2017-09-30T03:42:29.791198",
    "receivedTime": "2017-09-30T03:42:31.634552",
    "identifier ": "testIdentifier123456",
    "response": {
    "status": "SUCCESS"
    }
    }
    }

    Kinesis Record Error Handling

    messageConsumer

    The default Kinesis stream processing in the Cumulus system is configured for record error tolerance.

    When the messageConsumer fails to process a record, the failure is captured and the record is published to the kinesisFallback SNS Topic. The kinesisFallback SNS topic broadcasts the record and a subscribed copy of the messageConsumer Lambda named kinesisFallback consumes these failures.

At this point, the normal Lambda asynchronous invocation retry behavior will attempt to process the record 3 more times. After this, if the record cannot successfully be processed, it is written to a dead letter queue. Cumulus' dead letter queue is an SQS Queue named kinesisFailure. Operators can use this queue to inspect failed records.

This system ensures that when the messageConsumer fails to process a record and trigger a workflow, the record is retried 3 times. This retry behavior improves system reliability in case of any external service failure outside of Cumulus control.

    The Kinesis error handling system - the kinesisFallback SNS topic, messageConsumer Lambda, and kinesisFailure SQS queue - come with the API package and do not need to be configured by the operator.

To examine records that were unable to be processed at any step, look at the dead letter queue {{prefix}}-kinesisFailure in the Simple Queue Service (SQS) console. Select your queue, and under the Queue Actions tab, choose View/Delete Messages. Start polling for messages and you will see records that failed to process through the messageConsumer.
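The same inspection can be done from the AWS CLI, for example:

    QUEUE_URL=$(aws sqs get-queue-url --queue-name {{prefix}}-kinesisFailure --query 'QueueUrl' --output text)
    aws sqs receive-message --queue-url "$QUEUE_URL" --max-number-of-messages 10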

Note: these are only failures that occurred while processing records from Kinesis streams. Workflow failures are handled differently.

    Kinesis Stream logging

    Notification Stream messages

    Cumulus includes two Lambdas (KinesisInboundEventLogger and KinesisOutboundEventLogger) that utilize the same code to take a Kinesis record event as input, deserialize the data field and output the modified event to the logs.

    When a kinesis rule is created, in addition to the messageConsumer event mapping, an event mapping is created to trigger KinesisInboundEventLogger to record a log of the inbound record, to allow for analysis in case of unexpected failure.

    Response Stream messages

    Cumulus also supports this feature for all outbound messages. To take advantage of this feature, you will need to set an event mapping on the KinesisOutboundEventLogger Lambda that targets your response-endpoint. You can do this in the Lambda management page for KinesisOutboundEventLogger. Add a Kinesis trigger, and configure it to target the cnmResponseStream for your workflow:

    Screenshot of the AWS console showing configuration for Kinesis stream trigger on KinesisOutboundEventLogger Lambda

    Once this is done, all records sent to the response-endpoint will also be logged in CloudWatch. For more on configuring Lambdas to trigger on Kinesis events, please see creating an event source mapping.

    - + \ No newline at end of file diff --git a/docs/v10.0.0/data-cookbooks/error-handling/index.html b/docs/v10.0.0/data-cookbooks/error-handling/index.html index b12c977ad86..46a173eda35 100644 --- a/docs/v10.0.0/data-cookbooks/error-handling/index.html +++ b/docs/v10.0.0/data-cookbooks/error-handling/index.html @@ -5,7 +5,7 @@ Error Handling in Workflows | Cumulus Documentation - + @@ -45,7 +45,7 @@ Service Exception. See this documentation on configuring your workflow to handle transient lambda errors.

    Example state machine definition:

    {
    "Comment": "Tests Workflow from Kinesis Stream",
    "StartAt": "TranslateMessage",
    "States": {
    "TranslateMessage": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.cnm}",
    "destination": "{$.meta.cnm}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_to_cma_task.arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": ["States.ALL"],
    "ResultPath": "$.exception",
    "Next": "CnmResponseFail"
    }
    ],
    "Next": "SyncGranule"
    },
    "SyncGranule": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "Path": "$.payload",
    "TargetPath": "$.payload"
    },
    "task_config": {
    "provider": "{$.meta.provider}",
    "buckets": "{$.meta.buckets}",
    "collection": "{$.meta.collection}",
    "downloadBucket": "{$.meta.buckets.private.name}",
    "stack": "{$.meta.stack}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.granules}",
    "destination": "{$.meta.input_granules}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${module.cumulus.sync_granule_task.task_arn}",
    "Retry": [
    {
    "ErrorEquals": ["States.ALL"],
    "IntervalSeconds": 10,
    "MaxAttempts": 3
    }
    ],
    "Catch": [
    {
    "ErrorEquals": ["States.ALL"],
    "ResultPath": "$.exception",
    "Next": "CnmResponseFail"
    }
    ],
    "Next": "CnmResponse"
    },
    "CnmResponse": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "OriginalCNM": "{$.meta.cnm}",
    "CNMResponseStream": "{$.meta.cnmResponseStream}",
    "region": "us-east-1",
    "WorkflowException": "{$.exception}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$}",
    "destination": "{$.meta.cnmResponse}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_response_task.arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": ["States.ALL"],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "Next": "WorkflowSucceeded"
    },
    "CnmResponseFail": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "OriginalCNM": "{$.meta.cnm}",
    "CNMResponseStream": "{$.meta.cnmResponseStream}",
    "region": "us-east-1",
    "WorkflowException": "{$.exception}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$}",
    "destination": "{$.meta.cnmResponse}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_response_task.arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": ["States.ALL"],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "Next": "WorkflowFailed"
    },
    "WorkflowSucceeded": {
    "Type": "Succeed"
    },
    "WorkflowFailed": {
    "Type": "Fail",
    "Cause": "Workflow failed"
    }
    }
    }

    The above results in a workflow which is visualized in the diagram below:

    Screenshot of a visualization of an AWS Step Function workflow definition with branching logic for failures

    Summary

    Error handling should (mostly) be the domain of workflow configuration.

    - + \ No newline at end of file diff --git a/docs/v10.0.0/data-cookbooks/hello-world/index.html b/docs/v10.0.0/data-cookbooks/hello-world/index.html index 6434aeffd08..d21afebeb9d 100644 --- a/docs/v10.0.0/data-cookbooks/hello-world/index.html +++ b/docs/v10.0.0/data-cookbooks/hello-world/index.html @@ -5,14 +5,14 @@ HelloWorld Workflow | Cumulus Documentation - +
    Version: v10.0.0

    HelloWorld Workflow

This example task is meant to be a sanity check/introduction to Cumulus workflows.

    Pre-Deployment Configuration

    Workflow Configuration

    A workflow definition can be found in the template repository hello_world_workflow module.

    {
    "Comment": "Returns Hello World",
    "StartAt": "HelloWorld",
    "States": {
    "HelloWorld": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "buckets": "{$.meta.buckets}",
    "provider": "{$.meta.provider}",
    "collection": "{$.meta.collection}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${module.cumulus.hello_world_task.task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "End": true
    }
    }
    }

    Workflow error-handling can be configured as discussed in the Error-Handling cookbook.

    Task Configuration

The HelloWorld task is provided for you as part of the cumulus terraform module, so no configuration is needed.

    If you want to manually deploy your own version of this Lambda for testing, you can copy the Lambda resource definition located in the Cumulus source code at cumulus/tf-modules/ingest/hello-world-task.tf. The Lambda source code is located in the Cumulus source code at 'cumulus/tasks/hello-world'.

    Execution

    We will focus on using the Cumulus dashboard to schedule the execution of a HelloWorld workflow.

    Our goal here is to create a rule through the Cumulus dashboard that will define the scheduling and execution of our HelloWorld workflow. Let's navigate to the Rules page and click Add a rule.

    {
    "collection": { # collection values can be configured and found on the Collections page
    "name": "${collection_name}",
    "version": "${collection_version}"
    },
    "name": "helloworld_rule",
    "provider": "${provider}", # found on the Providers page
    "rule": {
    "type": "onetime"
    },
    "state": "ENABLED",
    "workflow": "HelloWorldWorkflow" # This can be found on the Workflows page
    }

    Screenshot of AWS Step Function execution graph for the HelloWorld workflow Executed workflow as seen in AWS Console

    Output/Results

    The Executions page presents a list of all executions, their status (running, failed, or completed), to which workflow the execution belongs, along with other information. The rule defined in the previous section should start an execution of its own accord, and the status of that execution can be tracked here.

    To get some deeper information on the execution, click on the value in the Name column of your execution of interest. This should bring up a visual representation of the workflow similar to that shown above, execution details, and a list of events.

    Summary

    Setting up the HelloWorld workflow on the Cumulus dashboard is the tip of the iceberg, so to speak. The task and step-function need to be configured before Cumulus deployment. A compatible collection and provider must be configured and applied to the rule. Finally, workflow execution status can be viewed via the workflows tab on the dashboard.

    - + \ No newline at end of file diff --git a/docs/v10.0.0/data-cookbooks/ingest-notifications/index.html b/docs/v10.0.0/data-cookbooks/ingest-notifications/index.html index 839b57eef12..6d99f21cb4c 100644 --- a/docs/v10.0.0/data-cookbooks/ingest-notifications/index.html +++ b/docs/v10.0.0/data-cookbooks/ingest-notifications/index.html @@ -5,13 +5,13 @@ Ingest Notification in Workflows | Cumulus Documentation - +
    Version: v10.0.0

    Ingest Notification in Workflows

    On deployment, an SQS queue and three SNS topics are created and used for handling notification messages related to the workflow.

    The sfEventSqsToDbRecords Lambda function reads from the sfEventSqsToDbRecordsInputQueue queue and updates DynamoDB. The DynamoDB events for the ExecutionsTable, GranulesTable and PdrsTable are streamed on DynamoDBStreams, which are read by the publishExecutions, publishGranules and publishPdrs Lambda functions, respectively.

    These Lambda functions publish to the three SNS topics both when the workflow starts and when it reaches a terminal state (completion or failure). The following describes how many message(s) each topic receives both on workflow start and workflow completion/failure:

    • reportExecutions - Receives 1 message per workflow execution
    • reportGranules - Receives 1 message per granule in a workflow execution
    • reportPdrs - Receives 1 message per PDR

    Diagram of architecture for reporting workflow ingest notifications from AWS Step Functions

    The ingest notification reporting SQS queue is populated via a Cloudwatch rule for any Step Function execution state transitions. The sfEventSqsToDbRecords Lambda consumes this queue. The queue and Lambda are included in the cumulus module and the Cloudwatch rule in the workflow module and are included by default in a Cumulus deployment.

    Sending SQS messages to report status

    Publishing granule/PDR reports directly to the SQS queue

If you have a non-Cumulus workflow or process ingesting data and would like to update the status of your granules or PDRs, you can publish directly to the reporting SQS queue. Publishing messages to this queue will result in those messages being stored as granule/PDR records in the Cumulus database and having the status of those granules/PDRs being visible on the Cumulus dashboard. The queue does have certain expectations of the message format: it expects a Cumulus Message nested within a Cloudwatch Step Function Event object.

Posting directly to the queue will require knowing the queue URL. Assuming that you are using the cumulus module for your deployment, you can get the queue URL (and the report topic ARNs) by adding them to outputs.tf for your Terraform deployment, as in our example deployment:

    output "stepfunction_event_reporter_queue_url" {
    value = module.cumulus.stepfunction_event_reporter_queue_url
    }

    output "report_executions_sns_topic_arn" {
    value = module.cumulus.report_executions_sns_topic_arn
    }
    output "report_granules_sns_topic_arn" {
value = module.cumulus.report_granules_sns_topic_arn
    }
    output "report_pdrs_sns_topic_arn" {
    value = module.cumulus.report_pdrs_sns_topic_arn
    }

Then, when you run terraform apply, you should see the queue URL and topic ARNs printed to your console:

    Outputs:
    ...
    stepfunction_event_reporter_queue_url = https://sqs.us-east-1.amazonaws.com/xxxxxxxxx/<prefix>-sfEventSqsToDbRecordsInputQueue
    report_executions_sns_topic_arn = arn:aws:sns:us-east-1:xxxxxxxxx:<prefix>-report-executions-topic
report_granules_sns_topic_arn = arn:aws:sns:us-east-1:xxxxxxxxx:<prefix>-report-granules-topic
    report_pdrs_sns_topic_arn = arn:aws:sns:us-east-1:xxxxxxxxx:<prefix>-report-pdrs-topic

Once you have the queue URL, you can use the AWS SDK for your language of choice to publish messages to the queue. The expected format of these messages is that of a Cloudwatch Step Function event containing a Cumulus message. For SUCCEEDED events, the Cumulus message is expected to be in detail.output. For all other event statuses, a Cumulus Message is expected in detail.input. The Cumulus Message populating these fields MUST be a JSON string, not an object. Messages that do not conform to the schemas will fail to be created as records.
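As a sketch only, posting such a message with the AWS SDK for Python (boto3) might look like the following; the queue URL is the Terraform output shown above, and the Cumulus message content here is illustrative and must conform to the record schemas for your deployment:

    import json

    import boto3

    sqs = boto3.client("sqs")

    QUEUE_URL = "https://sqs.us-east-1.amazonaws.com/xxxxxxxxx/<prefix>-sfEventSqsToDbRecordsInputQueue"

    cumulus_message = {
        # Hypothetical minimal content -- consult the Cumulus message and record schemas.
        "cumulus_meta": {"execution_name": "my-external-ingest-0001"},
        "meta": {"status": "running", "collection": {"name": "MOD09GQ", "version": "006"}},
        "payload": {"granules": [{"granuleId": "MOD09GQ.A2016358.h13v04.006.2016360104606"}]},
    }

    event = {
        "detail": {
            "status": "RUNNING",
            # For non-SUCCEEDED statuses the Cumulus message goes in detail.input,
            # and it must be a JSON *string*, not an object.
            "input": json.dumps(cumulus_message),
        }
    }

    sqs.send_message(QueueUrl=QUEUE_URL, MessageBody=json.dumps(event))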

    If you are not seeing records persist to the database or show up in the Cumulus dashboard, you can investigate the Cloudwatch logs of the SQS consumer Lambda:

    • /aws/lambda/<prefix>-sfEventSqsToDbRecords

    In a workflow

    As described above, ingest notifications will automatically be published to the SNS topics on workflow start and completion/failure, so you should not include a workflow step to publish the initial or final status of your workflows.

    However, if you want to report your ingest status at any point during a workflow execution, you can add a workflow step using the SfSqsReport Lambda. In the following example from cumulus-tf/parse_pdr_workflow.tf, the ParsePdr workflow is configured to use the SfSqsReport Lambda, primarily to update the PDR ingestion status.

    Note: ${sf_sqs_report_task_arn} is an interpolated value referring to a Terraform resource. See the example deployment code for the ParsePdr workflow.

      "PdrStatusReport": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "cumulus_message": {
    "input": "{$}"
    }
    }
    }
    },
    "ResultPath": null,
    "Type": "Task",
    "Resource": "${sf_sqs_report_task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "Next": "WaitForSomeTime"
    },

    Subscribing additional listeners to SNS topics

    Additional listeners to SNS topics can be configured in a .tf file for your Cumulus deployment. Shown below is configuration that subscribes an additional Lambda function (test_lambda) to receive messages from the report_executions SNS topic. To subscribe to the report_granules or report_pdrs SNS topics instead, simply replace report_executions in the code block below with either of those values.

    resource "aws_lambda_function" "test_lambda" {
    function_name = "${var.prefix}-testLambda"
    filename = "./testLambda.zip"
    source_code_hash = filebase64sha256("./testLambda.zip")
    handler = "index.handler"
    role = module.cumulus.lambda_processing_role_arn
    runtime = "nodejs10.x"
    }

    resource "aws_sns_topic_subscription" "test_lambda" {
    topic_arn = module.cumulus.report_executions_sns_topic_arn
    protocol = "lambda"
    endpoint = aws_lambda_function.test_lambda.arn
    }

    resource "aws_lambda_permission" "test_lambda" {
    action = "lambda:InvokeFunction"
    function_name = aws_lambda_function.test_lambda.arn
    principal = "sns.amazonaws.com"
    source_arn = module.cumulus.report_executions_sns_topic_arn
    }

    SNS message format

    Subscribers to the SNS topics can expect to find the published message in the SNS event at Records[0].Sns.Message. The message will be a JSON stringified version of the ingest notification record for an execution or a PDR. For granules, the message will be a JSON stringified object with ingest notification record in the record property and the event type as the event property.
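For example, a subscriber Lambda (such as the test_lambda shown above) might unpack the notification like this minimal Python sketch:

    import json


    def handler(event, context):
        # SNS delivers the notification as a JSON string at Records[0].Sns.Message.
        message = json.loads(event["Records"][0]["Sns"]["Message"])

        # Granule topic messages wrap the record as {"record": ..., "event": ...};
        # execution and PDR topic messages are the record itself.
        record = message.get("record", message)
        print(json.dumps(record))
        return record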

    The ingest notification record of the execution, granule, or PDR should conform to the data model schema for the given record type.

    Summary

    Workflows can be configured to send SQS messages at any point using the sf-sqs-report task.

    Additional listeners can be easily configured to trigger when messages are sent to the SNS topics.

    - + \ No newline at end of file diff --git a/docs/v10.0.0/data-cookbooks/queue-post-to-cmr/index.html b/docs/v10.0.0/data-cookbooks/queue-post-to-cmr/index.html index f2dd3ae2045..c244ad914ad 100644 --- a/docs/v10.0.0/data-cookbooks/queue-post-to-cmr/index.html +++ b/docs/v10.0.0/data-cookbooks/queue-post-to-cmr/index.html @@ -5,13 +5,13 @@ Queue PostToCmr | Cumulus Documentation - +
    Version: v10.0.0

    Queue PostToCmr

    In this document, we walk through handling CMR errors in workflows by queueing PostToCmr. We assume that the user already has an ingest workflow setup.

    Overview

    The general concept is that the last task of the ingest workflow will be QueueWorkflow, which queues the publish workflow. The publish workflow contains the PostToCmr task and if a CMR error occurs during PostToCmr, the publish workflow will add itself back onto the queue so that it can be executed when CMR is back online. This is achieved by leveraging the QueueWorkflow task again in the publish workflow. The following diagram demonstrates this queueing process.

    Diagram of workflow queueing

    Ingest Workflow

    The last step should be the QueuePublishWorkflow step. It should be configured with a queueUrl and workflow. In this case, the queueUrl is a throttled queue. Any queueUrl can be specified here which is useful if you would like to use a lower priority queue. The workflow is the unprefixed workflow name that you would like to queue (e.g. PublishWorkflow).

      "QueuePublishWorkflowStep": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "workflow": "{$.meta.workflow}",
    "queueUrl": "${start_sf_queue_url}",
    "provider": "{$.meta.provider}",
    "collection": "{$.meta.collection}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${queue_workflow_task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "End": true
    },

    Publish Workflow

    Configure the Catch section of your PostToCmr task to proceed to QueueWorkflow if a CMRInternalError is caught. Any other error will cause the workflow to fail.

      "Catch": [
    {
    "ErrorEquals": [
    "CMRInternalError"
    ],
    "Next": "RequeueWorkflow"
    },
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "Next": "WorkflowFailed",
    "ResultPath": "$.exception"
    }
    ],

    Then, configure the QueueWorkflow task similarly to its configuration in the ingest workflow. This time, pass the current publish workflow to the task config. This allows for the publish workflow to be requeued when there is a CMR error.

    {
    "RequeueWorkflow": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "buckets": "{$.meta.buckets}",
    "distribution_endpoint": "{$.meta.distribution_endpoint}",
    "workflow": "PublishGranuleQueue",
    "queueUrl": "${start_sf_queue_url}",
    "provider": "{$.meta.provider}",
    "collection": "{$.meta.collection}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${queue_workflow_task_arn}",
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "Next": "WorkflowFailed",
    "ResultPath": "$.exception"
    }
    ],
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "End": true
    }
    }
    - + \ No newline at end of file diff --git a/docs/v10.0.0/data-cookbooks/run-tasks-in-lambda-or-docker/index.html b/docs/v10.0.0/data-cookbooks/run-tasks-in-lambda-or-docker/index.html index 4b92c679a2d..941b58baf1e 100644 --- a/docs/v10.0.0/data-cookbooks/run-tasks-in-lambda-or-docker/index.html +++ b/docs/v10.0.0/data-cookbooks/run-tasks-in-lambda-or-docker/index.html @@ -5,13 +5,13 @@ Run Step Function Tasks in AWS Lambda or Docker | Cumulus Documentation - +
    Version: v10.0.0

    Run Step Function Tasks in AWS Lambda or Docker

    Overview

    AWS Step Function Tasks can run tasks on AWS Lambda or on AWS Elastic Container Service (ECS) as a Docker container.

    Lambda provides serverless architecture, providing the best option for minimizing cost and server management. ECS provides the fullest extent of AWS EC2 resources via the flexibility to execute arbitrary code on any AWS EC2 instance type.

    When to use Lambda

    You should use AWS Lambda whenever all of the following are true:

• The task runs on one of the supported Lambda Runtimes. At the time of this writing, supported runtimes include versions of Python, Java, Ruby, Node.js, Go, and .NET.
    • The lambda package is less than 50 MB in size, zipped.
    • The task consumes less than each of the following resources:
      • 3008 MB memory allocation
      • 512 MB disk storage (must be written to /tmp)
      • 15 minutes of execution time

    See this page for a complete and up-to-date list of AWS Lambda limits.

If your task requires more than any of these resources, or requires an unsupported runtime, creating a Docker image that can be run on ECS is the way to go. Cumulus supports running any lambda package (and its configured layers) as a Docker container with cumulus-ecs-task.
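
If you want to check whether an existing Lambda is close to these limits, you can inspect its configuration with the AWS CLI. This is an optional, illustrative check; the function name below is a hypothetical placeholder.

# Reports the zipped package size (bytes), memory (MB), and timeout (seconds)
# for a deployed function. Substitute your own function name.
aws lambda get-function-configuration \
--function-name <prefix>-QueueGranules \
--query '{CodeSizeBytes: CodeSize, MemoryMB: MemorySize, TimeoutSeconds: Timeout}'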

    Step Function Activities and cumulus-ecs-task

    Step Function Activities enable a state machine task to "publish" an activity task which can be picked up by any activity worker. Activity workers can run pretty much anywhere, but Cumulus workflows support the cumulus-ecs-task activity worker. The cumulus-ecs-task worker runs as a Docker container on the Cumulus ECS cluster.

    The cumulus-ecs-task container takes an AWS Lambda Amazon Resource Name (ARN) as an argument (see --lambdaArn in the example below). This ARN argument is defined at deployment time. The cumulus-ecs-task worker polls for new Step Function Activity Tasks. When a Step Function executes, the worker (container) picks up the activity task and runs the code contained in the lambda package defined on deployment.
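
As an optional, illustrative check, you can list the Step Function activities in your account with the AWS CLI to confirm the activity name/ARN your deployment created for the worker to poll (the name filter below is only an example):

# Lists activities whose names contain "QueueGranules"
aws stepfunctions list-activities \
--query "activities[?contains(name, 'QueueGranules')]"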

    Example: Replacing AWS Lambda with a Docker container run on ECS

    This example will use an already-defined workflow from the cumulus module that includes the QueueGranules task in its configuration.

    The following example is an excerpt from the Discover Granules workflow containing the step definition for the QueueGranules step:

    Note: ${ingest_granule_workflow_name} and ${queue_granules_task_arn} are interpolated values that refer to Terraform resources. See the example deployment code for the Discover Granules workflow.

      "QueueGranules": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "provider": "{$.meta.provider}",
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "granuleIngestWorkflow": "${ingest_granule_workflow_name}",
    "queueUrl": "{$.meta.queues.startSF}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${queue_granules_task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "End": true
    },

Suppose you have discovered that this task can no longer run in AWS Lambda. You can instead run it on the Cumulus ECS cluster by adding the following resources to your Terraform deployment (either in a new .tf file or in an existing one):

• An aws_sfn_activity resource:
    resource "aws_sfn_activity" "queue_granules" {
    name = "${var.prefix}-QueueGranules"
    }
• An instance of the cumulus_ecs_service module (found on the Cumulus releases page), configured to provide the QueueGranules task:

    module "queue_granules_service" {
    source = "https://github.com/nasa/cumulus/releases/download/{version}/terraform-aws-cumulus-ecs-service.zip"

    prefix = var.prefix
    name = "QueueGranules"

    cluster_arn = module.cumulus.ecs_cluster_arn
    desired_count = 1
    image = "cumuluss/cumulus-ecs-task:1.7.0"

    cpu = 400
    memory_reservation = 700

    environment = {
    AWS_DEFAULT_REGION = data.aws_region.current.name
    }
    command = [
    "cumulus-ecs-task",
    "--activityArn",
    aws_sfn_activity.queue_granules.id,
    "--lambdaArn",
    module.cumulus.queue_granules_task.task_arn
    ]
    alarms = {
    MemoryUtilizationHigh = {
    comparison_operator = "GreaterThanThreshold"
    evaluation_periods = 1
    metric_name = "MemoryUtilization"
    statistic = "SampleCount"
    threshold = 75
    }
    }
    }

    Please note: If you have updated the code for the Lambda specified by --lambdaArn, you will have to manually restart the tasks in your ECS service before invocation of the Step Function activity will use the updated Lambda code.
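
One way to do that restart, assuming the cluster and service names shown below are replaced with your own (both are placeholders here), is to force a new deployment of the ECS service so its containers are replaced and pick up the updated Lambda code:

aws ecs update-service \
--cluster <prefix>-CumulusECSCluster \
--service <queue-granules-ecs-service-name> \
--force-new-deployment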

• An updated Discover Granules workflow to utilize the new resource (the Resource key in the QueueGranules step has been updated to:

    "Resource": "${aws_sfn_activity.queue_granules.id}")`

    If you then run this workflow in place of the DiscoverGranules workflow, the QueueGranules step would run as an ECS task instead of a lambda.

    Final note

    Step Function Activities and AWS Lambda are not the only ways to run tasks in an AWS Step Function. Learn more about other service integrations, including direct ECS integration via the AWS Service Integrations page.

Science Investigator-led Processing Systems (SIPS)

… we're just going to create a onetime throw-away rule that will be easy to test with. This rule will kick off the DiscoverAndQueuePdrs workflow, which is the beginning of a Cumulus SIPS workflow:

    Screenshot of a Cumulus rule configuration

Note: A list of configured workflows exists under the "Workflows" tab in the navigation bar on the Cumulus dashboard. Additionally, you can find a list of executions and their respective statuses under the "Executions" tab in the navigation bar.

    DiscoverAndQueuePdrs Workflow

    This workflow will discover PDRs and queue them to be processed. Duplicate PDRs will be dealt with according to the configured duplicate handling setting in the collection. The lambdas below are included in the cumulus terraform module for use in your workflows:

    1. DiscoverPdrs - source
    2. QueuePdrs - source

    Screenshot of execution graph for discover and queue PDRs workflow in the AWS Step Functions console

    An example workflow module configuration can be viewed in the Cumulus source for the discover_and_queue_pdrs_workflow.

Please note: To use this example workflow module as a template for a new workflow in your deployment, the source key for the workflow module would need to point to a release of the cumulus-workflow (terraform-aws-cumulus-workflow.zip) module on our release page, as all of the provided Cumulus workflows are internally self-referential.

    ParsePdr Workflow

    The ParsePdr workflow will parse a PDR, queue the specified granules (duplicates are handled according to the duplicate handling setting) and periodically check the status of those queued granules. This workflow will not succeed until all the granules included in the PDR are successfully ingested. If one of those fails, the ParsePdr workflow will fail. NOTE that ParsePdr may spin up multiple IngestGranule workflows in parallel, depending on the granules included in the PDR.

    The lambdas below are included in the cumulus terraform module for use in your workflows:

    1. ParsePdr - source
    2. QueueGranules - source
    3. CheckStatus - source

    Screenshot of execution graph for SIPS Parse PDR workflow in AWS Step Functions console

    An example workflow module configuration can be viewed in the Cumulus source for the parse_pdr_workflow.

Please note: To use this example workflow module as a template for a new workflow in your deployment, the source key for the workflow module would need to point to a release of the cumulus-workflow (terraform-aws-cumulus-workflow.zip) module on our release page, as all of the provided Cumulus workflows are internally self-referential.

    IngestGranule Workflow

    The IngestGranule workflow processes and ingests a granule and posts the granule metadata to CMR.

    The lambdas below are included in the cumulus terraform module for use in your workflows:

    1. SyncGranule - source.
    2. CmrStep - source

Additionally, this workflow requires a processing step that you must provide. The ProcessingStep step in the workflow picture below is an example of a custom processing step.

Note: Using the CmrStep is not required and can be left out of the workflow if desired (for example, in testing situations).

    Screenshot of execution graph for SIPS IngestGranule workflow in AWS Step Functions console

    An example workflow module configuration can be viewed in the Cumulus source for the ingest_and_publish_granule_workflow.

Please note: To use this example workflow module as a template for a new workflow in your deployment, the source key for the workflow module would need to point to a release of the cumulus-workflow (terraform-aws-cumulus-workflow.zip) module on our release page, as all of the provided Cumulus workflows are internally self-referential.

    Summary

    In this cookbook we went over setting up a collection, rule, and provider for a SIPS workflow. Once we had the setup completed, we looked over the Cumulus workflows that participate in parsing PDRs, ingesting and processing granules, and updating CMR.

    Version: v10.0.0

    Throttling queued executions

In this entry, we will walk through how to create an SQS queue for scheduling executions, which will be used to limit those executions to a maximum concurrency, and how to configure our Cumulus workflows/rules to use this queue.

    We will also review the architecture of this feature and highlight some implementation notes.

    Limiting the number of executions that can be running from a given queue is useful for controlling the cloud resource usage of workflows that may be lower priority, such as granule reingestion or reprocessing campaigns. It could also be useful for preventing workflows from exceeding known resource limits, such as a maximum number of open connections to a data provider.

    Implementing the queue

    Create and deploy the queue

    Add a new queue

    In a .tf file for your Cumulus deployment, add a new SQS queue:

    resource "aws_sqs_queue" "background_job_queue" {
    name = "${var.prefix}-backgroundJobQueue"
    receive_wait_time_seconds = 20
    visibility_timeout_seconds = 60
    }

    Set maximum executions for the queue

    Define the throttled_queues variable for the cumulus module in your Cumulus deployment to specify the maximum concurrent executions for the queue.

    module "cumulus" {
    # ... other variables

    throttled_queues = [{
    url = aws_sqs_queue.background_job_queue.id,
    execution_limit = 5
    }]
    }

    Setup consumer for the queue

    Add the sqs2sfThrottle Lambda as the consumer for the queue and add a Cloudwatch event rule/target to read from the queue on a scheduled basis.

    Please note: You must use the sqs2sfThrottle Lambda as the consumer for any queue with a queue execution limit or else the execution throttling will not work correctly. Additionally, please allow at least 60 seconds after creation before using the queue while associated infrastructure and triggers are set up and made ready.

    aws_sqs_queue.background_job_queue.id refers to the queue resource defined above.

    resource "aws_cloudwatch_event_rule" "background_job_queue_watcher" {
    schedule_expression = "rate(1 minute)"
    }

    resource "aws_cloudwatch_event_target" "background_job_queue_watcher" {
    rule = aws_cloudwatch_event_rule.background_job_queue_watcher.name
    arn = module.cumulus.sqs2sfThrottle_lambda_function_arn
    input = jsonencode({
    messageLimit = 500
    queueUrl = aws_sqs_queue.background_job_queue.id
    timeLimit = 60
    })
    }

    resource "aws_lambda_permission" "background_job_queue_watcher" {
    action = "lambda:InvokeFunction"
    function_name = module.cumulus.sqs2sfThrottle_lambda_function_arn
    principal = "events.amazonaws.com"
    source_arn = aws_cloudwatch_event_rule.background_job_queue_watcher.arn
    }

    Re-deploy your Cumulus application

Follow the instructions to re-deploy your Cumulus application. After you have re-deployed, your workflow template will be updated to include information about the queue (the output below is partial output from an expected workflow template):

    {
    "cumulus_meta": {
    "queueExecutionLimits": {
    "<backgroundJobQueue_SQS_URL>": 5
    }
    }
    }

    Integrate your queue with workflows and/or rules

    Integrate queue with queuing steps in workflows

    For any workflows using QueueGranules or QueuePdrs that you want to use your new queue, update the Cumulus configuration of those steps in your workflows.

    As seen in this partial configuration for a QueueGranules step, update the queueUrl to reference the new throttled queue:

    Note: ${ingest_granule_workflow_name} is an interpolated value referring to a Terraform resource. See the example deployment code for the DiscoverGranules workflow.

    {
    "QueueGranules": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "queueUrl": "${aws_sqs_queue.background_job_queue.id}",
    "provider": "{$.meta.provider}",
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "granuleIngestWorkflow": "${ingest_granule_workflow_name}"
    }
    }
    }
    }
    }

    Similarly, for a QueuePdrs step:

    Note: ${parse_pdr_workflow_name} is an interpolated value referring to a Terraform resource. See the example deployment code for the DiscoverPdrs workflow.

    {
    "QueuePdrs": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "queueUrl": "${aws_sqs_queue.background_job_queue.id}",
    "provider": "{$.meta.provider}",
    "collection": "{$.meta.collection}",
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "parsePdrWorkflow": "${parse_pdr_workflow_name}"
    }
    }
    }
    }
    }

    After making these changes, re-deploy your Cumulus application for the execution throttling to take effect on workflow executions queued by these workflows.

    Create/update a rule to use your new queue

    Create or update a rule definition to include a queueUrl property that refers to your new queue:

    {
    "name": "s3_provider_rule",
    "workflow": "DiscoverAndQueuePdrs",
    "provider": "s3_provider",
    "collection": {
    "name": "MOD09GQ",
    "version": "006"
    },
    "rule": {
    "type": "onetime"
    },
    "state": "ENABLED",
    "queueUrl": "<backgroundJobQueue_SQS_URL>" // configure rule to use your queue URL
    }

    After creating/updating the rule, any subsequent invocations of the rule should respect the maximum number of executions when starting workflows from the queue.
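
If you want to watch the backlog build up and drain as executions are throttled, one option (not required) is to check the queue's message counts with the AWS CLI:

# Shows how many messages are waiting vs. in flight on the throttled queue
aws sqs get-queue-attributes \
--queue-url <backgroundJobQueue_SQS_URL> \
--attribute-names ApproximateNumberOfMessages ApproximateNumberOfMessagesNotVisible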

    Architecture

    Architecture diagram showing how executions started from a queue are throttled to a maximum concurrent limit

Execution throttling based on the queue works by manually keeping a count (semaphore) of how many executions are running for the queue at a time. The key operation that prevents the number of executions from exceeding the maximum for the queue is that, before starting new executions, the sqs2sfThrottle Lambda attempts to increment the semaphore and responds as follows (a conceptual sketch of this check-and-increment appears after the list):

    • If the increment operation is successful, then the count was not at the maximum and an execution is started
    • If the increment operation fails, then the count was already at the maximum so no execution is started
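
The sketch below illustrates the idea of a conditional increment using a DynamoDB-style atomic update. It is only an illustration of the concept; the table name, key, and attribute names are hypothetical and do not reflect the actual Cumulus semaphore implementation.

# Conceptual sketch: increment a per-queue counter only if it is below the limit.
# A failed condition (ConditionalCheckFailedException) means the limit was reached
# and no new execution should be started.
aws dynamodb update-item \
--table-name <prefix>-semaphores \
--key '{"key": {"S": "<backgroundJobQueue_SQS_URL>"}}' \
--update-expression "ADD semvalue :one" \
--condition-expression "attribute_not_exists(semvalue) OR semvalue < :max" \
--expression-attribute-values '{":one": {"N": "1"}, ":max": {"N": "5"}}'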

    Final notes

    Limiting the number of concurrent executions for work scheduled via a queue has several consequences worth noting:

    • The number of executions that are running for a given queue will be limited to the maximum for that queue regardless of which workflow(s) are started.
    • If you use the same queue to schedule executions across multiple workflows/rules, then the limit on the total number of executions running concurrently will be applied to all of the executions scheduled across all of those workflows/rules.
    • If you are scheduling the same workflow both via a queue with a maxExecutions value and a queue without a maxExecutions value, only the executions scheduled via the queue with the maxExecutions value will be limited to the maximum.
Tracking Ancillary Files

The UMM-G column reflects the RelatedURL's Type derived from the CNM type, whereas the ECHO10 column shows how the CNM type affects the destination element.

CNM Type  | UMM-G RelatedUrl.Type                                           | ECHO10 Location
ancillary | 'VIEW RELATED INFORMATION'                                      | OnlineResource
data      | 'GET DATA' (HTTPS URL) or 'GET DATA VIA DIRECT ACCESS' (S3 URI) | OnlineAccessURL
browse    | 'GET RELATED VISUALIZATION'                                     | AssociatedBrowseImage
linkage   | 'EXTENDED METADATA'                                             | OnlineResource
metadata  | 'EXTENDED METADATA'                                             | OnlineResource
qa        | 'EXTENDED METADATA'                                             | OnlineResource

    Common Use Cases

    This section briefly documents some common use cases and the recommended configuration for the file. The examples shown here are for the DiscoverGranules use case, which allows configuration at the Cumulus dashboard level. The other two cases covered in the ancillary metadata documentation require configuration at the provider notification level (either CNM message or PDR) and are not covered here.

    Configuring browse imagery:

    {
    "bucket": "public",
    "regex": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\_[\\d]{1}.jpg$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_1.jpg",
    "type": "browse"
    }

    Configuring a documentation entry:

    {
    "bucket": "protected",
    "regex": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\_README.pdf$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_README.pdf",
    "type": "metadata"
    }

    Configuring other associated files (use types metadata or qa as appropriate):

    {
    "bucket": "protected",
    "regex": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\_QA.txt$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_QA.txt",
    "type": "qa"
    }
    Version: v10.0.0

    API Gateway Logging

    Enabling API Gateway logging

    In order to enable distribution API Access and execution logging, configure the TEA deployment by setting log_api_gateway_to_cloudwatch on the thin_egress_app module:

    log_api_gateway_to_cloudwatch = true

    This enables the distribution API to send its logs to the default CloudWatch location: API-Gateway-Execution-Logs_<RESTAPI_ID>/<STAGE>
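
To confirm that execution logs are arriving, you can look for the log group with the AWS CLI (the prefix shown matches the default location above):

aws logs describe-log-groups \
--log-group-name-prefix API-Gateway-Execution-Logs_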

    Configure Permissions for API Gateway Logging to CloudWatch

    Instructions for enabling account level logging from API Gateway to CloudWatch

    This is a one time operation that must be performed on each AWS account to allow API Gateway to push logs to CloudWatch.

    Create a policy document

    The AmazonAPIGatewayPushToCloudWatchLogs managed policy, with an ARN of arn:aws:iam::aws:policy/service-role/AmazonAPIGatewayPushToCloudWatchLogs, has all the required permissions to enable API Gateway logging to CloudWatch. To grant these permissions to your account, first create an IAM role with apigateway.amazonaws.com as its trusted entity.

    Save this snippet as apigateway-policy.json.

    {
    "Version": "2012-10-17",
    "Statement": [
    {
    "Sid": "",
    "Effect": "Allow",
    "Principal": {
    "Service": "apigateway.amazonaws.com"
    },
    "Action": "sts:AssumeRole"
    }
    ]
    }

    Create an account role to act as ApiGateway and write to CloudWatchLogs

    NASA users in NGAP: be sure to use your account's permission boundary.

    aws iam create-role \
    --role-name ApiGatewayToCloudWatchLogs \
    [--permissions-boundary <permissionBoundaryArn>] \
    --assume-role-policy-document file://apigateway-policy.json

    Note the ARN of the returned role for the last step.

    Attach correct permissions to role

    Next attach the AmazonAPIGatewayPushToCloudWatchLogs policy to the IAM role.

    aws iam attach-role-policy \
    --role-name ApiGatewayToCloudWatchLogs \
    --policy-arn "arn:aws:iam::aws:policy/service-role/AmazonAPIGatewayPushToCloudWatchLogs"

    Update Account API Gateway settings with correct permissions

    Finally, set the IAM role ARN on the cloudWatchRoleArn property on your API Gateway Account settings.

    aws apigateway update-account \
    --patch-operations op='replace',path='/cloudwatchRoleArn',value='<ApiGatewayToCloudWatchLogs ARN>'
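
Optionally, you can verify that the role was applied by reading the account settings back:

# The output should include the ApiGatewayToCloudWatchLogs role ARN under cloudwatchRoleArn
aws apigateway get-account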

    Configure API Gateway CloudWatch Logs Delivery

    See Configure Cloudwatch Logs Delivery

    Version: v10.0.0

    Configure Cloudwatch Logs Delivery

    As an optional configuration step, it is possible to deliver CloudWatch logs to a cross-account shared AWS::Logs::Destination. An operator does this by configuring the cumulus module for your deployment as shown below. The value of the log_destination_arn variable is the ARN of a writeable log destination.

    The value can be either an AWS::Logs::Destination or a Kinesis Stream ARN to which your account can write.

    log_destination_arn           = arn:aws:[kinesis|logs]:us-east-1:123456789012:[streamName|destination:logDestinationName]

    Logs Sent

By default, the following logs will be sent to the destination when one is given.

    • Ingest logs
    • Async Operation logs
    • Thin Egress App API Gateway logs (if configured)

    Additional Logs

    If additional logs are needed, you can configure additional_log_groups_to_elk with the Cloudwatch log groups you want to send to the destination. additional_log_groups_to_elk is a map with the key as a descriptor and the value with the Cloudwatch log group name.

    additional_log_groups_to_elk = {
    "HelloWorldTask" = "/aws/lambda/cumulus-example-HelloWorld"
    "MyCustomTask" = "my-custom-task-log-group"
    }
Component-based Cumulus Deployment

… Terraform at the same time.

    With remote state, Terraform writes the state data to a remote data store, which can then be shared between all members of a team.

    The recommended approach for handling remote state with Cumulus is to use the S3 backend. This backend stores state in S3 and uses a DynamoDB table for locking.

    See the deployment documentation for a walk-through of creating resources for your remote state using an S3 backend.
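
If you are creating the remote state resources yourself rather than following the deployment walk-through, a minimal sketch with the AWS CLI looks like the following (bucket and table names are placeholders; the lock table's only required attribute is the LockID string hash key):

aws s3api create-bucket --bucket <prefix>-tf-state
aws dynamodb create-table \
--table-name <prefix>-tf-locks \
--attribute-definitions AttributeName=LockID,AttributeType=S \
--key-schema AttributeName=LockID,KeyType=HASH \
--billing-mode PAY_PER_REQUEST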

    Version: v10.0.0

    Creating an S3 Bucket

    Buckets can be created on the command line with AWS CLI or via the web interface on the AWS console.

    When creating a protected bucket (a bucket containing data which will be served through the distribution API), make sure to enable S3 server access logging. See S3 Server Access Logging for more details.

    Command line

Using the AWS command line tool's s3api create-bucket subcommand:

    $ aws s3api create-bucket \
    --bucket foobar-internal \
    --region us-west-2 \
    --create-bucket-configuration LocationConstraint=us-west-2
    {
    "Location": "/foobar-internal"
    }

    Note: The region and create-bucket-configuration arguments are only necessary if you are creating a bucket outside of the us-east-1 region.

Please note that security settings and other bucket options can be set via the options listed in the s3api documentation.

    Repeat the above step for each bucket to be created.

    Web interface

    See: AWS "Creating a Bucket" documentation

    Version: v10.0.0

    Using the Cumulus Distribution API

    The Cumulus Distribution API is a set of endpoints that can be used to enable AWS Cognito authentication when downloading data from S3.

    Configuring a Cumulus Distribution deployment

    The Cumulus Distribution API is included in the main Cumulus repo. It is available as part of the terraform-aws-cumulus.zip archive in the latest release.

    These steps assume you're using the Cumulus Deployment Template but can also be used for custom deployments.

    To configure a deployment to use Cumulus Distribution:

1. Remove or comment out the "Thin Egress App Settings" in the Cumulus Template Deploy and enable the Cumulus Distribution settings.
2. Delete or comment out the contents of thin_egress_app.tf and the corresponding Thin Egress App outputs in outputs.tf. These are not necessary for a Cumulus Distribution deployment.
    3. Uncomment the Cumulus Distribution outputs in outputs.tf.
    4. Rename cumulus-template-deploy/cumulus-tf/cumulus_distribution.tf.example to cumulus-template-deploy/cumulus-tf/cumulus_distribution.tf.

    Cognito Application and User Credentials

    The major prerequisite for using the Cumulus Distribution API is to set up Cognito. If operating within NGAP, this should already be done for you. If operating outside of NGAP, you must set up Cognito yourself, which is beyond the scope of this documentation.

    Given that Cognito is set up, in order to be able to download granule files via the Cumulus Distribution API, you must obtain Cognito user credentials, because any attempt to download such files (that will be, or have been, published to the CMR via your Cumulus deployment) will result in a prompt for you to supply Cognito user credentials. To obtain your own user credentials, talk to your product owner or scrum master for additional information. They should either know how to create the credentials, know who can create them for the team, or be the liaison to the Cognito team.

    Further, whoever helps to obtain your Cognito user credentials should also be able to supply you with the values for the following new variables that you must add to your cumulus-tf/terraform.tfvars file:

    • csdap_host_url: The URL of the Cognito service to which your Cumulus deployment will make Cognito API calls during a distribution (download) event
    • csdap_client_id: The client ID for the Cumulus application registered within the Cognito service
    • csdap_client_password: The client password for the Cumulus application registered within the Cognito service

    Although you might have to wait a bit for your Cognito user credentials, the remaining instructions do not depend upon having them, so you may continue with these instructions while waiting for your credentials.

    Cumulus Distribution URL

    Your Cumulus Distribution URL is used by Cumulus to generate download URLs as part of the granule metadata generated and published to the CMR. For example, a granule download URL will be of the form <distribution url>/<protected bucket>/<key> (or <distribution url>/path/to/file, if using a custom bucket map, as explained further below).

    By default, the value of your distribution URL is the URL of your private Cumulus Distribution API Gateway (the API Gateway named <prefix>-distribution, once you deploy the Cumulus Distribution module). Therefore, by default, the generated download URLs are private, and thus inaccessible directly, but there are 2 ways to address this issue (both of which are detailed below): (a) use tunneling (typically in development) or (b) put a CloudFront URL in front of your API Gateway (typically in production, and perhaps UAT and/or SIT).

    In either case, you must first know the default URL (i.e., the URL for the private Cumulus Distribution API Gateway). In order to obtain this default URL, you must first deploy your cumulus-tf module with the new Cumulus Distribution module, and once your initial deployment is complete, one of the Terraform outputs will be cumulus_distribution_api_uri, which is the URL for the private API Gateway.
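
Once that initial deployment has finished, you can read the output value directly from your cumulus-tf directory, for example:

terraform output cumulus_distribution_api_uri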

    You may override this default URL by adding a cumulus_distribution_url variable to your cumulus-tf/terraform.tfvars file, and setting it to one of the following values (both of which are explained below):

    1. The default URL, but with a port added to it, in order to allow you to configure tunneling (typically only in development)
    2. A CloudFront URL placed in front of your Cumulus Distribution API Gateway (typically only for Production, but perhaps also for a UAT or SIT environment)

    The following subsections explain these approaches, in turn.

    Using your Cumulus Distribution API Gateway URL as your distribution URL

    Since your Cumulus Distribution API Gateway URL is private, the only way you can use it to confirm that your integration with Cognito is working is by using tunneling (again, generally for development), as described here. Here is an outline of the required steps, with details provided further below:

    1. Create/import a key pair into your AWS EC2 service (if you haven't already done so)
    2. Add a reference to the name of the key pair to your Terraform variables (we'll set the key_name Terraform variable)
    3. Choose an open local port on your machine (we'll use 9000 in the following details)
    4. Add a reference to the value of your cumulus_distribution_api_uri (mentioned earlier), including your chosen port (we'll set the cumulus_distribution_url Terraform variable)
    5. Redeploy Cumulus
    6. Add an entry to your /etc/hosts file
    7. Add a redirect URI to Cognito, via the Cognito API
    8. Install the Session Manager Plugin for the AWS CLI (if you haven't already done so; assuming you have already installed the AWS CLI)
    9. Add a sample file to S3 to test downloading via Cognito

    To create or import an existing key pair, you can use the AWS CLI (see aws ec2 import-key-pair), or the AWS Console (see Amazon EC2 key pairs and Linux instances).

    Once your key pair is added to AWS, add the following to your cumulus-tf/terraform.tfvars file:

    key_name = "<name>"
    cumulus_distribution_url = "https://<id>.execute-api.<region>.amazonaws.com:<port>/dev/"

    where:

    • <name> is the name of the key pair you just added to AWS
    • <id> and <region> are the corresponding parts from your cumulus_distribution_api_uri output variable
    • <port> is your open local port of choice (9000 is typically a good choice)

    Once you save your variable changes, redeploy your cumulus-tf module.

    While your deployment runs, add the following entry to your /etc/hosts file, replacing <hostname> with the host name of the cumulus_distribution_url Terraform variable you just added above:

    localhost <hostname>

    Next, you'll need to use the Cognito API to add the value of your cumulus_distribution_url Terraform variable as a Cognito redirect URI. To do so, use your favorite tool (e.g., curl, wget, Postman, etc.) to make a BasicAuth request to the Cognito API, using the following details:

    • method: POST
    • base URL: the value of your csdap_host_url Terraform variable
    • path: /authclient/updateRedirectUri
    • username: the value of your csdap_client_id Terraform variable
    • password: the value of your csdap_client_password Terraform variable
    • headers: Content-Type='application/x-www-form-urlencoded'
    • body: redirect_uri=<cumulus_distribution_url>/login

    where <cumulus_distribution_url> is the value of your cumulus_distribution_url Terraform variable. Note the /login path at the end of the redirect_uri value.
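
For example, with curl the request described above might look like the following sketch (all bracketed values are the Terraform variables just described):

curl -X POST "<csdap_host_url>/authclient/updateRedirectUri" \
-u "<csdap_client_id>:<csdap_client_password>" \
-H "Content-Type: application/x-www-form-urlencoded" \
--data-urlencode "redirect_uri=<cumulus_distribution_url>/login"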

    For reference, see the Cognito Authentication Service API.

    Next, install the Session Manager Plugin for the AWS CLI. If running on macOS, and you use Homebrew, you can install it simply as follows:

    brew install --cask session-manager-plugin --no-quarantine

    As your final setup step, add a sample file to one of the protected buckets listed in your buckets Terraform variable in your cumulus-tf/terraform.tfvars file. The key for the S3 object doesn't matter, nor does it matter what file you use. All that matters is that the file is an S3 object in one of your protected buckets, because Cognito is triggered when attempting to download from one of those buckets.

    At this point, you should be ready to open a tunnel and attempt to download your sample file via your browser, summarized as follows:

    1. Determine your ec2 instance ID
    2. Connect to the NASA VPN
    3. Start an AWS SSM session
    4. Open an ssh tunnel
    5. Use a browser to navigate to your file

To determine your ec2 instance ID for your Cumulus deployment, run the following command, where <profile> is the name of the appropriate AWS profile to use, and <prefix> is the value of your prefix Terraform variable:

    aws --profile <profile> ec2 describe-instances --filters Name=tag:Deployment,Values=<prefix> Name=instance-state-name,Values=running --query "Reservations[0].Instances[].InstanceId" --output text

    IMPORTANT: Before proceeding with the remaining steps, make sure you're connected to the NASA VPN.

    Use the value output from the command above in place of <id> in the following command, which will start an SSM session:

    aws ssm start-session --target <id> --document-name AWS-StartPortForwardingSession --parameters portNumber=22,localPortNumber=6000

    If successful, you should see output similar to the following:

    Starting session with SessionId: NGAPShApplicationDeveloper-***
    Port 6000 opened for sessionId NGAPShApplicationDeveloper-***.
    Waiting for connections...

    Open another terminal window, and open a tunnel with port forwarding, using your chosen port from above (e.g., 9000):

    ssh -4 -p 6000 -N -L <port>:<api-gateway-host>:443 ec2-user@127.0.0.1

    where:

    • <port> is the open local port you chose earlier (e.g., 9000)
    • <api-gateway-host> is the hostname of your private API Gateway (i.e., the host portion of the URL you used as the value of your cumulus_distribution_url Terraform variable above)

    Finally, use your chosen browser to navigate to <cumulus_distribution_url>/<bucket>/<key>, where <bucket> and <key> reference the sample file you added to S3 above.

    If all goes well, you should be prompted for your Cognito username and password. If you have obtained your Cognito user credentials, enter them, followed by entering a code generated by the authenticator application you registered at the time you completed your Cognito registration process. Once your credentials and auth code are correctly supplied, after a few moments, the download process will begin.

    Once you're finished testing, clean up as follows:

    1. Kill your ssh tunnel (Ctrl-C)
    2. Kill your AWS SSM session (Ctrl-C)
3. If you like, disconnect from the NASA VPN

    While this is a relatively lengthy process, things are much easier when using CloudFront, such as in Production (OPS), SIT, or UAT, as explained next.

    Using a CloudFront URL as your distribution URL

    In Production (OPS), and perhaps in other environments, such as UAT and SIT, you'll need to provide a publicly accessible URL for users to use for downloading (distributing) granule files.

    This is generally done by placing a CloudFront URL in front of your private Cumulus Distribution API Gateway. In order to create such a CloudFront URL, contact the person who helped you obtain your Cognito credentials, and request a CloudFront URL with the following details:

    • The private, backing URL, which is the value of your cumulus_distribution_api_uri Terraform output value
    • A request to add the AWS account's VPC to the whitelist

    Once this request is completed, and you obtain the new CloudFront URL, override your default distribution URL with the CloudFront URL by adding the following to your cumulus-tf/terraform.tfvars file:

cumulus_distribution_url = "<cloudfront_url>"

    In addition, add a Cognito redirect URI, as detailed in the previous section. Note that in this case, the value you'll use for redirect_uri is <cloudfront_url>/login since the value of your cumulus_distribution_url is now your CloudFront URL.

    At this point, it is assumed that you have added the appropriate values for this environment for the variables described at the top (csdap_host_url, csdap_client_id, and csdap_client_password).

    Redeploy Cumulus with your new/updated Terraform variables.

    As your final setup step, add a sample file to one of the protected buckets listed in your buckets Terraform variable in your cumulus-tf/terraform.tfvars file. The key for the S3 object doesn't matter, nor does it matter what file you use. All that matters is that the file is an S3 object in one of your protected buckets, because Cognito is triggered when attempting to download from one of those buckets.

    Finally, use your chosen browser to navigate to <cumulus_distribution_url>/<bucket>/<key>, where <bucket> and <key> reference the sample file you added to S3.

    If all goes well, you should be prompted for your Cognito username and password. If you have obtained your Cognito user credentials, enter them, followed by entering a code generated by the authenticator application you registered at the time you completed your Cognito registration process. Once your credentials and auth code are correctly supplied, after a few moments, the download process will begin.

    S3 Bucket Mapping

    An S3 Bucket map allows users to abstract bucket names. If the bucket names change at any point, only the bucket map would need to be updated instead of every S3 link.

    The Cumulus Distribution API uses a bucket_map.yaml or bucket_map.yaml.tmpl file to determine which buckets to serve. See the examples.

    The default Cumulus module generates a file at s3://${system_bucket}/distribution_bucket_map.json.

    The configuration file is a simple json mapping of the form:

    {
    "daac-public-data-bucket": "/path/to/this/kind/of/data"
    }

    Note: Cumulus only supports a one-to-one mapping of bucket -> Cumulus Distribution path for 'distribution' buckets. Also, the bucket map must include mappings for all of the protected and public buckets specified in the buckets variable in cumulus-tf/terraform.tfvars, otherwise Cumulus may not be able to determine the correct distribution URL for ingested files and you may encounter errors.
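
If you want to inspect the bucket map that your deployment generated at the default location mentioned above, you can stream it to stdout (replace <system_bucket> with your system bucket name):

aws s3 cp s3://<system_bucket>/distribution_bucket_map.json -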

    Switching from the Thin Egress App to Cumulus Distribution

    If you have previously deployed the Thin Egress App (TEA) as your distribution app, you can switch to Cumulus Distribution by following the steps above.

    Note, however, that the cumulus_distribution module will generate a bucket map cache and overwrite any existing bucket map caches created by TEA.

    There will also be downtime while your API gateway is updated.

How to Deploy Cumulus

… for deployment's EC2 instances and allows you to connect to them via SSH/SSM.

    Consider the sizing of your Cumulus instance when configuring your variables.

    Choose a distribution API

    Cumulus can be configured to use either the Thin Egress App (TEA) or the Cumulus Distribution API. The default selection is the Thin Egress App if you're using the Deployment Template.

    IMPORTANT! If you already have a deployment using the TEA distribution and want to switch to Cumulus Distribution, there will be an API Gateway change. This means that there will be downtime while you update your CloudFront endpoint to use the new API gateway.

    Configure the Thin Egress App

    The Thin Egress App can be used for Cumulus distribution and is the default selection. It allows authentication using Earthdata Login. Follow the steps in the documentation to configure distribution in your cumulus-tf deployment.

    Configure the Cumulus Distribution API (optional)

    If you would prefer to use the Cumulus Distribution API, which supports AWS Cognito authentication, follow these steps to configure distribution in your cumulus-tf deployment.

    Initialize Terraform

Follow the above instructions to initialize Terraform using terraform init [1].

    Deploy

    Run terraform apply to deploy the resources. Type yes when prompted to confirm that you want to create the resources. Assuming the operation is successful, you should see output like this:

    Apply complete! Resources: 292 added, 0 changed, 0 destroyed.

    Outputs:

    archive_api_redirect_uri = https://abc123.execute-api.us-east-1.amazonaws.com/dev/token
    archive_api_uri = https://abc123.execute-api.us-east-1.amazonaws.com/dev/
    distribution_redirect_uri = https://abc123.execute-api.us-east-1.amazonaws.com/DEV/login
    distribution_url = https://abc123.execute-api.us-east-1.amazonaws.com/DEV/

    Note: Be sure to copy the redirect URLs, as you will use them to update your Earthdata application.

    Update Earthdata Application

    You will need to add two redirect URLs to your EarthData login application.

    1. Login to URS.
    2. Under My Applications -> Application Administration -> use the edit icon of your application.
    3. Under Manage -> redirect URIs, add the Archive API url returned from the stack deployment
      • e.g. archive_api_redirect_uri = https://<czbbkscuy6>.execute-api.us-east-1.amazonaws.com/dev/token.
    4. Also add the Distribution url
  • e.g. distribution_redirect_uri = https://<kido2r7kji>.execute-api.us-east-1.amazonaws.com/dev/login [2].
    5. You may delete the placeholder url you used to create the application.

If you've lost track of the needed redirect URIs, they can be located in the API Gateway console. Once there, select <prefix>-archive and/or <prefix>-thin-egress-app-EgressGateway, then Dashboard, and use the base URL at the top of the page that is accompanied by the text Invoke this API at:. Make sure to append /token for the archive URL and /login to the thin egress app URL.


    Deploy Cumulus dashboard

    Dashboard Requirements

    Please note that the requirements are similar to the Cumulus stack deployment requirements. The installation instructions below include a step that will install/use the required node version referenced in the .nvmrc file in the dashboard repository.

    Prepare AWS

    Create S3 bucket for dashboard:

    • Create it, e.g. <prefix>-dashboard. Use the command line or console as you did when preparing AWS configuration.
    • Configure the bucket to host a website:
      • AWS S3 console: Select <prefix>-dashboard bucket then, "Properties" -> "Static Website Hosting", point to index.html
      • CLI: aws s3 website s3://<prefix>-dashboard --index-document index.html
    • The bucket's url will be http://<prefix>-dashboard.s3-website-<region>.amazonaws.com or you can find it on the AWS console via "Properties" -> "Static website hosting" -> "Endpoint"
    • Ensure the bucket's access permissions allow your deployment user access to write to the bucket

    Install dashboard

    To install the dashboard, clone the Cumulus dashboard repository into the root deploy directory and install dependencies with npm install:

      git clone https://github.com/nasa/cumulus-dashboard
    cd cumulus-dashboard
    nvm use
    npm install

    If you do not have the correct version of node installed, replace nvm use with nvm install $(cat .nvmrc) in the above example.

    Dashboard versioning

    By default, the master branch will be used for dashboard deployments. The master branch of the dashboard repo contains the most recent stable release of the dashboard.

    If you want to test unreleased changes to the dashboard, use the develop branch.

    Each release/version of the dashboard will have a tag in the dashboard repo. Release/version numbers will use semantic versioning (major/minor/patch).

    To checkout and install a specific version of the dashboard:

      git fetch --tags
    git checkout <version-number> # e.g. v1.2.0
    nvm use
    npm install

    If you do not have the correct version of node installed, replace nvm use with nvm install $(cat .nvmrc) in the above example.

    Building the dashboard

    Note: These environment variables are available during the build: APIROOT, DAAC_NAME, STAGE, HIDE_PDR. Any of these can be set on the command line to override the values contained in config.js when running the build below.

To configure your dashboard for deployment, set the APIROOT environment variable to your app's API root [3].

    Build the dashboard from the dashboard repository root directory, cumulus-dashboard:

      APIROOT=<your_api_root> npm run build

    Dashboard deployment

    Deploy dashboard to s3 bucket from the cumulus-dashboard directory:

    Using AWS CLI:

      aws s3 sync dist s3://<prefix>-dashboard --acl public-read

    From the S3 Console:

    • Open the <prefix>-dashboard bucket, click 'upload'. Add the contents of the 'dist' subdirectory to the upload. Then select 'Next'. On the permissions window allow the public to view. Select 'Upload'.

You should be able to visit the dashboard website at http://<prefix>-dashboard.s3-website-<region>.amazonaws.com, or find the URL via <prefix>-dashboard -> "Properties" -> "Static website hosting" -> "Endpoint", and log in with a user that you configured for access in the Configure and Deploy the Cumulus Stack step.


    Cumulus Instance Sizing

    The Cumulus deployment default sizing for Elasticsearch instances, EC2 instances, and Autoscaling Groups are small and designed for testing and cost savings. The default settings are likely not suitable for production workloads. Sizing is highly individual and dependent on expected load and archive size.

    Please be cognizant of costs as any change in size will affect your AWS bill. AWS provides a pricing calculator for estimating costs.

    Elasticsearch

    The mappings file contains all of the data types that will be indexed into Elasticsearch. Elasticsearch sizing is tied to your archive size, including your collections, granules, and workflow executions that will be stored.

    AWS provides documentation on calculating and configuring for sizing.

    In addition to size you'll want to consider the number of nodes which determine how the system reacts in the event of a failure.

    Configuration can be done in the data persistence module in elasticsearch_config and the cumulus module in es_index_shards.

    If you make changes to your Elasticsearch configuration you will need to reindex for those changes to take effect.

    EC2 instances and autoscaling groups

EC2 instances are used for long-running operations (e.g. generating a reconciliation report) and long-running workflow tasks. Configuration for your ECS cluster is achieved via Cumulus deployment variables.

    When configuring your ECS cluster consider:

    • The EC2 instance type and EBS volume size needed to accommodate your workloads. Configured as ecs_cluster_instance_type and ecs_cluster_instance_docker_volume_size.
    • The minimum and desired number of instances on hand to accommodate your workloads. Configured as ecs_cluster_min_size and ecs_cluster_desired_size.
    • The maximum number of instances you will need and are willing to pay for to accommodate your heaviest workloads. Configured as ecs_cluster_max_size.
    • Your autoscaling parameters: ecs_cluster_scale_in_adjustment_percent, ecs_cluster_scale_out_adjustment_percent, ecs_cluster_scale_in_threshold_percent, and ecs_cluster_scale_out_threshold_percent.

    Footnotes


    1. Run terraform init if:

      • This is the first time deploying the module
      • You have added any additional child modules, including Cumulus components
      • You have updated the source for any of the child modules

2. To add another redirect URI to your application: on the Earthdata home page, select "My Applications", scroll down to "Application Administration" and use the edit icon for your application, then Manage -> Redirect URIs.

3. The API root can be found in a number of ways. The easiest is to note it in the output of the app deployment step, but you can also find it in the AWS console -> Amazon API Gateway -> APIs -> <prefix>-archive -> Dashboard, reading the URL at the top after "Invoke this API at"

PostgreSQL Database Deployment

… cumulus-rds-tf that will deploy an AWS RDS Aurora Serverless PostgreSQL 10.2 compatible database cluster, and optionally provision a single deployment database with credentialed secrets for use with Cumulus.

We have provided an example Terraform deployment using this module in the Cumulus template-deploy repository on GitHub.

    Use of this example involves:

    • Creating/configuring a Terraform module directory
    • Using Terraform to deploy resources to AWS

    Requirements

    Configuration/installation of this module requires the following:

    • Terraform
    • git
    • A VPC configured for use with Cumulus Core. This should match the subnets you provide when Deploying Cumulus to allow Core's lambdas to properly access the database.
    • At least two subnets across multiple AZs. These should match the subnets you provide as configuration when Deploying Cumulus, and should be within the same VPC.

    Needed Git Repositories

    Assumptions

    OS/Environment

    The instructions in this module require Linux/MacOS. While deployment via Windows is possible, it is unsupported.

    Terraform

    This document assumes knowledge of Terraform. If you are not comfortable working with Terraform, the following links should bring you up to speed:

    For Cumulus specific instructions on installation of Terraform, refer to the main Cumulus Installation Documentation

    Aurora/RDS

This document also assumes some basic familiarity with PostgreSQL databases and Amazon Aurora/RDS. If you're unfamiliar, consider perusing the AWS docs and the Aurora Serverless V1 docs.

    Prepare deployment repository

    If you already are working with an existing repository that has a configured rds-cluster-tf deployment for the version of Cumulus you intend to deploy or update, or just need to configure this module for your repository, skip to Prepare AWS configuration.

    Clone the cumulus-template-deploy repo and name appropriately for your organization:

      git clone https://github.com/nasa/cumulus-template-deploy <repository-name>

    We will return to configuring this repo and using it for deployment below.

    Optional: Create a new repository

    Create a new repository on Github so that you can add your workflows and other modules to source control:

      git remote set-url origin https://github.com/<org>/<repository-name>
    git push origin master

    You can then add/commit changes as needed.

    Note: If you are pushing your deployment code to a git repo, make sure to add terraform.tf and terraform.tfvars to .gitignore, as these files will contain sensitive data related to your AWS account.


    Prepare AWS configuration

To deploy this module, you need to make sure that you have completed the following steps from the Cumulus deployment instructions, in similar fashion for this module:

    --

    Configure and deploy the module

    When configuring this module, please keep in mind that unlike Cumulus deployment, this module should be deployed once to create the database cluster and only thereafter to make changes to that configuration/upgrade/etc. This module does not need to be re-deployed for each Core update.

    These steps should be executed in the rds-cluster-tf directory of the template deploy repo that you previously cloned. Run the following to copy the example files:

    cd rds-cluster-tf/
    cp terraform.tf.example terraform.tf
    cp terraform.tfvars.example terraform.tfvars

    In terraform.tf, configure the remote state settings by substituting the appropriate values for:

    • bucket
    • dynamodb_table
    • PREFIX (whatever prefix you've chosen for your deployment)

    Fill in the appropriate values in terraform.tfvars. See the rds-cluster-tf module variable definitions for more detail on all of the configuration options. A few notable configuration options are documented in the next section.

    Configuration Options

    • deletion_protection -- defaults to true. Set it to false if you want to be able to delete your cluster with a terraform destroy without manually updating the cluster.
    • db_admin_username -- cluster database administration username. Defaults to postgres.
    • db_admin_password -- required variable that specifies the admin user password for the cluster. To randomize this on each deployment, consider using a random_string resource as input.
    • region -- defaults to us-east-1.
    • subnets -- requires at least 2 across different AZs. For use with Cumulus, these AZs should match the values you configure for your lambda_subnet_ids.
    • max_capacity -- the max ACUs the cluster is allowed to use. Carefully consider cost/performance concerns when setting this value.
    • min_capacity -- the minimum ACUs the cluster will scale to
    • provision_user_database -- Optional flag to allow module to provision a user database in addition to creating the cluster. Described in the next section.

    Provision user and user database

    If you wish for the module to provision a PostgreSQL database on your new cluster and provide a secret for access in the module output, in addition to managing the cluster itself, the following configuration keys are required:

    • provision_user_database -- must be set to true. This configures the module to deploy a lambda that will create the user database and update the provided configuration on deploy.
    • permissions_boundary_arn -- the permissions boundary to use when creating the roles the provisioning lambda needs. In most use cases this should be the same one used for the Cumulus Core deployment.
    • rds_user_password -- the password to set for the provisioned user
    • prefix -- this value will be used to set a unique identifier for the ProvisionDatabase lambda, as well as to name the provisioned user/database.
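
    For example, enabling this behavior adds entries along these lines to terraform.tfvars (the ARN and password below are placeholders):

    provision_user_database  = true
    permissions_boundary_arn = "arn:aws:iam::<account-id>:policy/<your-permissions-boundary>"
    rds_user_password        = "change-me-to-a-secure-value"
    prefix                   = "my-prefix"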

    Once configured, the module will deploy the lambda, and run it on each provision, creating the configured database if it does not exist, updating the user password if that value has been changed, and updating the output user database secret.

    Setting provision_user_database to false after provisioning will not result in removal of the configured database, as the lambda is non-destructive as configured in this module.

    Please Note: This functionality is limited in that it will only provision a single database/user and configure a basic database, and should not be used in scenarios where more complex configuration is required.

    Initialize Terraform

    Run terraform init

    You should see output like:

    * provider.aws: version = "~> 2.32"

    Terraform has been successfully initialized!

    Deploy

    Run terraform apply to deploy the resources.

    If re-applying this module, variables (e.g. engine_version, snapshot_identifier) that force a recreation of the database cluster may result in data loss if deletion protection is disabled. Examine the changeset carefully for resources that will be re-created/destroyed before applying.

    Review the changeset, and assuming it looks correct, type yes when prompted to confirm that you want to create all of the resources.

    Assuming the operation is successful, you should see output similar to the following (this example omits the creation of a user database/lambdas/security groups):

    terraform apply

    An execution plan has been generated and is shown below.
    Resource actions are indicated with the following symbols:
    + create

    Terraform will perform the following actions:

    # module.rds_cluster.aws_db_subnet_group.default will be created
    + resource "aws_db_subnet_group" "default" {
    + arn = (known after apply)
    + description = "Managed by Terraform"
    + id = (known after apply)
    + name = (known after apply)
    + name_prefix = "xxxxxxxxx"
    + subnet_ids = [
    + "subnet-xxxxxxxxx",
    + "subnet-xxxxxxxxx",
    ]
    + tags = {
    + "Deployment" = "xxxxxxxxx"
    }
    }

    # module.rds_cluster.aws_rds_cluster.cumulus will be created
    + resource "aws_rds_cluster" "cumulus" {
    + apply_immediately = true
    + arn = (known after apply)
    + availability_zones = (known after apply)
    + backup_retention_period = 1
    + cluster_identifier = "xxxxxxxxx"
    + cluster_identifier_prefix = (known after apply)
    + cluster_members = (known after apply)
    + cluster_resource_id = (known after apply)
    + copy_tags_to_snapshot = false
    + database_name = "xxxxxxxxx"
    + db_cluster_parameter_group_name = (known after apply)
    + db_subnet_group_name = (known after apply)
    + deletion_protection = true
    + enable_http_endpoint = true
    + endpoint = (known after apply)
    + engine = "aurora-postgresql"
    + engine_mode = "serverless"
    + engine_version = "10.12"
    + final_snapshot_identifier = "xxxxxxxxx"
    + hosted_zone_id = (known after apply)
    + id = (known after apply)
    + kms_key_id = (known after apply)
    + master_password = (sensitive value)
    + master_username = "xxxxxxxxx"
    + port = (known after apply)
    + preferred_backup_window = "07:00-09:00"
    + preferred_maintenance_window = (known after apply)
    + reader_endpoint = (known after apply)
    + skip_final_snapshot = false
    + storage_encrypted = (known after apply)
    + tags = {
    + "Deployment" = "xxxxxxxxx"
    }
    + vpc_security_group_ids = (known after apply)

    + scaling_configuration {
    + auto_pause = true
    + max_capacity = 4
    + min_capacity = 2
    + seconds_until_auto_pause = 300
    + timeout_action = "RollbackCapacityChange"
    }
    }

    # module.rds_cluster.aws_secretsmanager_secret.rds_login will be created
    + resource "aws_secretsmanager_secret" "rds_login" {
    + arn = (known after apply)
    + id = (known after apply)
    + name = (known after apply)
    + name_prefix = "xxxxxxxxx"
    + policy = (known after apply)
    + recovery_window_in_days = 30
    + rotation_enabled = (known after apply)
    + rotation_lambda_arn = (known after apply)
    + tags = {
    + "Deployment" = "xxxxxxxxx"
    }

    + rotation_rules {
    + automatically_after_days = (known after apply)
    }
    }

    # module.rds_cluster.aws_secretsmanager_secret_version.rds_login will be created
    + resource "aws_secretsmanager_secret_version" "rds_login" {
    + arn = (known after apply)
    + id = (known after apply)
    + secret_id = (known after apply)
    + secret_string = (sensitive value)
    + version_id = (known after apply)
    + version_stages = (known after apply)
    }

    # module.rds_cluster.aws_security_group.rds_cluster_access will be created
    + resource "aws_security_group" "rds_cluster_access" {
    + arn = (known after apply)
    + description = "Managed by Terraform"
    + egress = (known after apply)
    + id = (known after apply)
    + ingress = (known after apply)
    + name = (known after apply)
    + name_prefix = "cumulus_rds_cluster_access_ingress"
    + owner_id = (known after apply)
    + revoke_rules_on_delete = false
    + tags = {
    + "Deployment" = "xxxxxxxxx"
    }
    + vpc_id = "vpc-xxxxxxxxx"
    }

    # module.rds_cluster.aws_security_group_rule.rds_security_group_allow_PostgreSQL will be created
    + resource "aws_security_group_rule" "rds_security_group_allow_postgres" {
    + from_port = 5432
    + id = (known after apply)
    + protocol = "tcp"
    + security_group_id = (known after apply)
    + self = true
    + source_security_group_id = (known after apply)
    + to_port = 5432
    + type = "ingress"
    }

    Plan: 6 to add, 0 to change, 0 to destroy.

    Do you want to perform these actions?
    Terraform will perform the actions described above.
    Only 'yes' will be accepted to approve.

    Enter a value: yes

    module.rds_cluster.aws_db_subnet_group.default: Creating...
    module.rds_cluster.aws_security_group.rds_cluster_access: Creating...
    module.rds_cluster.aws_secretsmanager_secret.rds_login: Creating...

    Then, after the resources are created:

    Apply complete! Resources: X added, 0 changed, 0 destroyed.
    Releasing state lock. This may take a few moments...

    Outputs:

    admin_db_login_secret_arn = arn:aws:secretsmanager:us-east-1:xxxxxxxxx:secret:xxxxxxxxxx20210407182709367700000002-dpmdR
    admin_db_login_secret_version = xxxxxxxxx
    rds_endpoint = xxxxxxxxx.us-east-1.rds.amazonaws.com
    security_group_id = xxxxxxxxx
    user_credentials_secret_arn = arn:aws:secretsmanager:us-east-1:xxxxx:secret:xxxxxxxxxx20210407182709367700000002-dpmpXA

    Note the output values for admin_db_login_secret_arn (and optionally user_credentials_secret_arn) as these provide the AWS Secrets Manager secret required to access the database as the administrative user and, optionally, the user database credentials Cumulus requires as well.

    The content of each of these secrets is in the form:

    {
      "database": "postgres",
      "dbClusterIdentifier": "clusterName",
      "engine": "postgres",
      "host": "xxx",
      "password": "defaultPassword",
      "port": 5432,
      "username": "xxx"
    }
    • database -- the PostgreSQL database used by the configured user
    • dbClusterIdentifier -- the value set by the cluster_identifier variable in the terraform module
    • engine -- the Aurora/RDS database engine
    • host -- the RDS service host for the database in the form (dbClusterIdentifier)-(AWS ID string).(region).rds.amazonaws.com
    • password -- the database password
    • username -- the account username
    • port -- The database connection port, should always be 5432
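
    If you need to inspect one of these secrets, one option (assuming the AWS CLI is configured for the deployment account) is to fetch it by ARN:

    aws secretsmanager get-secret-value \
      --secret-id <admin_db_login_secret_arn> \
      --query SecretString \
      --output text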

    Next Steps

    The database cluster has been created/updated! From here you can continue to add additional user accounts, databases and other database configuration.

    Version: v10.0.0

    Share S3 Access Logs

    It is possible through Cumulus to share S3 access logs across multiple S3 packages using the S3 replicator package.

    S3 Replicator

    The S3 Replicator is a node package that contains a simple lambda function, associated permissions, and the Terraform instructions to replicate create-object events from one S3 bucket to another.

    First ensure that you have enabled S3 Server Access Logging.

    Next configure your config.tfvars as described in the s3-replicator/README.md to correspond to your deployment. The source_bucket and source_prefix are determined by how you enabled the S3 Server Access Logging.

    To deploy the s3-replicator with Cumulus, you will need to add the module to your Terraform main.tf definition, e.g.:

    module "s3-replicator" {
      source               = "<path to s3-replicator.zip>"
      prefix               = var.prefix
      vpc_id               = var.vpc_id
      subnet_ids           = var.subnet_ids
      permissions_boundary = var.permissions_boundary_arn
      source_bucket        = var.s3_replicator_config.source_bucket
      source_prefix        = var.s3_replicator_config.source_prefix
      target_bucket        = var.s3_replicator_config.target_bucket
      target_prefix        = var.s3_replicator_config.target_prefix
    }
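
    The var.s3_replicator_config references above assume your deployment defines a single object variable holding these values; under that assumption, the corresponding terraform.tfvars entry might look like this sketch (bucket names and prefixes are placeholders):

    s3_replicator_config = {
      source_bucket = "<bucket receiving S3 Server Access logs>"
      source_prefix = "<server access log prefix>"
      target_bucket = "<destination bucket>"
      target_prefix = "<destination prefix>"
    }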

    The terraform source package can be found on the Cumulus github release page under the asset tab terraform-aws-cumulus-s3-replicator.zip.
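
    For example (the version tag below is a placeholder; confirm the exact asset URL on the release page), the module source can reference that release asset directly:

    source = "https://github.com/nasa/cumulus/releases/download/vX.Y.Z/terraform-aws-cumulus-s3-replicator.zip"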

    ESDIS Metrics

    In the NGAP environment, the ESDIS Metrics team has set up an ELK stack to process logs from Cumulus instances. To use this system, you must deliver any S3 Server Access logs that Cumulus creates.

    Configure the S3 replicator as described above using the target_bucket and target_prefix provided by the metrics team.

    The metrics team has taken care of setting up Logstash to ingest the files that get delivered to their bucket into their Elasticsearch instance.

    Version: v10.0.0

    Terraform Best Practices

    … AWS CLI command, replacing PREFIX with your deployment prefix name:

    aws resourcegroupstaggingapi get-resources \
    --query "ResourceTagMappingList[].ResourceARN" \
    --tag-filters Key=Deployment,Values=PREFIX

    Ideally, the output should be an empty list, but if it is not, then you may need to manually delete the listed resources.

    • Configuring the Cumulus deployment: link
    • Restoring a previous version: link

    Version: v10.0.0

    Using the Thin Egress App for Cumulus distribution

    The Thin Egress App (TEA) is an app running in Lambda that allows retrieving data from S3 using temporary links and provides URS integration.

    Configuring a TEA deployment

    TEA is deployed using Terraform modules. Refer to these instructions for guidance on how to integrate new components with your deployment.

    The cumulus-template-deploy repository's cumulus-tf/main.tf contains a thin_egress_app module configured for distribution.

    The TEA module provides these instructions for adding it to your deployment; the following instructions cover configuring the thin_egress_app module in your Cumulus deployment.

    Create a secret for signing Thin Egress App JWTs

    The Thin Egress App uses JWTs internally to authenticate requests and requires a secret stored in AWS Secrets Manager containing SSH keys that are used to sign the JWTs.

    See the Thin Egress App documentation on how to create this secret with the correct values. It will be used later to set the thin_egress_jwt_secret_name variable when deploying the Cumulus module.

    bucket_map.yaml

    The Thin Egress App uses a bucket_map.yaml file to determine which buckets to serve. Documentation of the file format is available here.

    The default Cumulus module generates a file at s3://${system_bucket}/distribution_bucket_map.json.

    The configuration file is a simple JSON mapping of the form:

    {
      "daac-public-data-bucket": "/path/to/this/kind/of/data"
    }

    Please note: Cumulus only supports a one-to-one mapping of bucket->TEA path for 'distribution' buckets.

    Optionally configure a custom bucket map

    A simple config would look something like this:

    bucket_map.yaml
    MAP:
      my-protected: my-protected
      my-public: my-public

    PUBLIC_BUCKETS:
      - my-public

    Please note: your custom bucket map must include mappings for all of the protected and public buckets specified in the buckets variable in cumulus-tf/terraform.tfvars, otherwise Cumulus may not be able to determine the correct distribution URL for ingested files and you may encounter errors.

    Optionally configure shared variables

    The cumulus module deploys certain components that interact with TEA. As a result, the cumulus module requires that if you are specifying a value for the stage_name variable to the TEA module, you must use the same value for the tea_api_gateway_stage variable to the cumulus module.

    One way to keep these variable values in sync across the modules is to use Terraform local values to define values to use for the variables for both modules. This approach is shown in the Cumulus core example deployment code.
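
    A minimal sketch of that approach (the local name is arbitrary, and the module bodies are elided to just the shared value):

    locals {
      tea_stage_name = "DEV"
    }

    module "thin_egress_app" {
      # ... other TEA configuration ...
      stage_name = local.tea_stage_name
    }

    module "cumulus" {
      # ... other Cumulus configuration ...
      tea_api_gateway_stage = local.tea_stage_name
    }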

    Version: v10.0.0

    Upgrading Cumulus

    … deployment functions correctly. Please refer to some recommended smoke tests given above, and consider additional tests appropriate for your particular deployment and environment.

    Update Cumulus Dashboard

    If there are breaking (or otherwise significant) changes to the Cumulus API, you should also upgrade your Cumulus Dashboard deployment to use the version of the Cumulus API matching the version of Cumulus to which you are migrating.

    Version: v10.0.0

    Issuing PR From Forked Repos

    Fork the Repo

    • Fork the Cumulus repo
    • Create a new branch from the branch you'd like to contribute to
    • If an issue doesn't already exist, submit one (see above)

    Create a Pull Request

    Reviewing PRs from Forked Repos

    Upon submission of a pull request, the Cumulus development team will review the code.

    Once the code passes an initial review, the team will run the CI tests against the proposed update.

    The request will then either be merged, declined, or an adjustment to the code will be requested via the issue opened with the original PR request.

    PRs from forked repos cannot be directly merged to master. Cumulus reviewers must follow these steps before completing the review process:

    1. Create a new branch:

        git checkout -b from-<name-of-the-branch> master
    2. Push the new branch to GitHub

    3. Change the destination of the forked PR to the new branch that was just pushed

      Screenshot of Github interface showing how to change the base branch of a pull request

    4. After code review and approval, merge the forked PR to the new branch.

    5. Create a PR for the new branch to master.

    6. If the CI tests pass, merge the new branch to master and close the issue. If the CI tests do not pass, request an amended PR from the original author or resolve failures as appropriate.

    Version: v10.0.0

    Integration Tests

    If you create a new stack and want to be able to run integration tests against it in CI, you will need to add it to bamboo/select-stack.js.

    Version: v10.0.0

    Code Coverage and Quality

    To run linting on the markdown files, run npm run lint-md.

    Audit

    This project uses audit-ci to run a security audit on the package dependency tree. This must pass prior to merge. The configured rules for audit-ci can be found here.

    To execute an audit, run npm run audit.

    Version: v10.0.0

    Versioning and Releases

    Versioning

    We use a global versioning approach, meaning version numbers in Cumulus are consistent across all packages and tasks, and semantic versioning to track major, minor, and patch versions (e.g. 1.0.0). We use Lerna to manage our versioning. Any change will force Lerna to increment the version of all packages.

    Read more about semantic versioning here.

    Pre-release testing

    Note: This is only necessary when preparing a release for a new major version of Cumulus (e.g. preparing to go from 6.x.x to 7.0.0)

    Before releasing a new major version of Cumulus, we should test the deployment upgrade path from the latest release of Cumulus to the upcoming release.

    It is preferable to use the cumulus-template-deploy repo for testing the deployment, since that repo is the officially recommended deployment configuration for end users.

    You should create an entirely new deployment for this testing to replicate the end user upgrade path. Using an existing test or CI deployment would not be useful because that deployment may already have been deployed with the latest changes and not match the upgrade path for end users.

    Pre-release testing steps:

    1. Checkout the cumulus-template-deploy repo

    2. Update the deployment code to use the latest release artifacts if it wasn't done already. For example, assuming that the latest release was 5.0.1, update the deployment files as follows:

      # in data-persistence-tf/main.tf
      source = "https://github.com/nasa/cumulus/releases/download/v5.0.1/terraform-aws-cumulus.zip//tf-modules/data-persistence"

      # in cumulus-tf/main.tf
      source = "https://github.com/nasa/cumulus/releases/download/v5.0.1/terraform-aws-cumulus.zip//tf-modules/cumulus"
    3. For both the data-persistence-tf and cumulus-tf modules:

      1. Add the necessary backend configuration (terraform.tf) and variables (terraform.tfvars)
        • You should use an entirely new deployment for this testing, so make sure to use values for key in terraform.tf and prefix in terraform.tfvars that don't collide with existing deployments
      2. Run terraform init
      3. Run terraform apply
    4. Checkout the master branch of the cumulus repo

    5. Run a full bootstrap of the code: npm run bootstrap

    6. Build the pre-release artifacts: ./bamboo/create-release-artifacts.sh

    7. For both the data-persistence-tf and cumulus-tf modules:

      1. Update the deployment to use the built release artifacts:

        # in data-persistence-tf/main.tf
        source = "[path]/cumulus/terraform-aws-cumulus.zip//tf-modules/data-persistence"

        # in cumulus-tf/main.tf
        source = "[path]/cumulus/terraform-aws-cumulus.zip//tf-modules/cumulus"
      2. Review the CHANGELOG.md for any pre-deployment migration steps. If there are, go through the steps and confirm that they are successful

      3. Run terraform init

      4. Run terraform apply

    8. Review the CHANGELOG.md for any post-deployment migration steps and confirm that they are successful

    9. Delete your test deployment by running terraform destroy in cumulus-tf and data-persistence-tf

    Updating Cumulus version and publishing to NPM

    1. Create a branch for the new release

    From Master

    Create a branch titled release-MAJOR.MINOR.x for the release (use a literal x for the patch version).

        git checkout -b release-MAJOR.MINOR.x

    e.g.:
    git checkout -b release-9.1.x

    If creating a new major version release from master, say 5.0.0, then the branch would be named release-5.0.x. If creating a new minor version release from master, say 1.14.0 then the branch would be named release-1.14.x.

    Having a release branch for each major/minor version allows us to easily backport patches to that version.

    Push the release-MAJOR.MINOR.x branch to GitHub if it was created locally. (Commits should be even with master at this point.)

    If creating a patch release, you can check out the existing base branch.

    Then create the release branch (e.g. release-1.14.0) from the minor version base branch. For example, from the release-1.14.x branch:

    git checkout -b release-1.14.0

    Backporting

    When creating a backport, a minor version base branch should already exist on GitHub. Check out the existing minor version base branch then create a release branch from it. For example:

    # check out existing minor version base branch
    git checkout release-1.14.x
    # pull to ensure you have the latest changes
    git pull origin release-1.14.x
    # create new release branch for backport
    git checkout -b release-1.14.1
    # cherry pick the commits (or single squashed commit of changes) relevant to the backport
    git cherry-pick [replace-with-commit-SHA]
    # push up the changes to the release branch
    git push

    2. Update the Cumulus version number

    When changes are ready to be released, the Cumulus version number must be updated.

    Lerna handles the process of deciding which version number should be used as long as the developer specifies whether the change is a major, minor, or patch change.

    To update Cumulus's version number run:

    npm run update

    Screenshot of terminal showing interactive prompt from Lerna for selecting the new release version

    Lerna will handle updating the packages and all of the dependent package version numbers. If a dependency has not been changed with the update, however, lerna will not update the version of the dependency.

    Note: Lerna will struggle to correctly update the versions on any non-standard/alpha versions (e.g. 1.17.0-alpha0). Please be sure to check any packages that are new or have been manually published since the previous release and any packages that list it as a dependency to ensure the listed versions are correct. It's useful to use the search feature of your code editor or grep to see if there are any references to outdated package versions.

    3. Check Cumulus Dashboard PRs for Version Bump

    There may be unreleased changes in the Cumulus Dashboard project that rely on this unreleased Cumulus Core version.

    If a PR exists in the cumulus-dashboard repo with a name containing "Version Bump for Next Cumulus API Release":

    • There will be a placeholder change-me value that should be replaced with the to-be-released Cumulus Core version.
    • Mark that PR as ready to be reviewed.

    4. Update CHANGELOG.md

    Update the CHANGELOG.md. Put a header under the Unreleased section with the new version number and the date.

    Add a link reference for the github "compare" view at the bottom of the CHANGELOG.md, following the existing pattern. This link reference should create a link in the CHANGELOG's release header to changes in the corresponding release.
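
    As an illustration (version numbers and the date are placeholders), the new header and matching link reference might look like:

    ## [vMAJOR.MINOR.PATCH] YYYY-MM-DD

    [vMAJOR.MINOR.PATCH]: https://github.com/nasa/cumulus/compare/vPREVIOUS...vMAJOR.MINOR.PATCH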

    5. Update DATA_MODEL_CHANGELOG.md

    Similar to #4, make sure the DATA_MODEL_CHANGELOG is updated if there are data model changes in the release, and the link reference at the end of the document is updated as appropriate.

    6. Update CONTRIBUTORS.md

    ./bin/update-contributors.sh
    git add CONTRIBUTORS.md

    Commit and push these changes, if any.

    7. Update Cumulus package API documentation

    Update auto-generated API documentation for any Cumulus packages that have it:

    npm run docs-build-packages

    Commit and push these changes, if any.

    8. Cut new version of Cumulus Documentation

    If this is a backport, do not create a new version of the documentation. For various reasons, we do not merge backports back to master, other than changelog notes. Documentation changes for backports will not be published to our documentation website.

    cd website
    npm run version ${release_version}
    git add .

    Where ${release_version} corresponds to the version tag v1.2.3, for example.

    Commit and push these changes.

    9. Create a pull request against the minor version branch

    1. Push the release branch (e.g. release-1.2.3) to GitHub.

    2. Create a PR against the minor version base branch (e.g. release-1.2.x).

    3. Configure Bamboo to run automated tests against this PR by finding the branch plan for the release branch (release-1.2.3) and setting only these variables:

      • GIT_PR: true
      • SKIP_AUDIT: true

      IMPORTANT: Do NOT set the PUBLISH_FLAG variable to true for this branch plan. The actual publishing of the release will be handled by a separate, manually triggered branch plan.

      Screenshot of Bamboo CI interface showing the configuration of the GIT_PR branch variable to have a value of &quot;true&quot;

    4. Verify that the Bamboo build for the PR succeeds and then merge to the minor version base branch (release-1.2.x).

      • It is safe to do a squash merge in this instance, but not required
    5. You may delete your release branch (release-1.2.3) after merging to the base branch.

    10. Create a git tag for the release

    Check out the minor version base branch now that your changes are merged in and do a git pull.

    Ensure you are on the latest commit.

    Create and push a new git tag:

    git tag -a vMAJOR.MINOR.PATCH -m "Release MAJOR.MINOR.PATCH"
    git push origin vMAJOR.MINOR.PATCH

    e.g.:
    git tag -a v9.1.0 -m "Release 9.1.0"
    git push origin v9.1.0

    11. Publishing the release

    Publishing of new releases is handled by a custom Bamboo branch plan and is manually triggered.

    The reasons for using a separate branch plan to handle releases instead of the branch plan for the minor version (e.g. release-1.2.x) are:

    • The Bamboo build for the minor version release branch is triggered automatically on any commits to that branch, whereas we want to manually control when the release is published.
    • We want to verify that integration tests have passed on the Bamboo build for the minor version release branch before we manually trigger the release, so that we can be sure that our code is safe to release.

    If this is a new minor version branch, then you will need to create a new Bamboo branch plan for publishing the release following the instructions below:

    Creating a Bamboo branch plan for the release

    • In the Cumulus Core project (https://ci.earthdata.nasa.gov/browse/CUM-CBA), click Actions -> Configure Plan in the top right.

    • Next to Plan branch click the rightmost button that displays Create Plan Branch upon hover.

    • Click Create plan branch manually.

    • Add the values in that list. Choose a display name that makes it very clear this is a deployment branch plan. Release (minor version branch name) seems to work well (e.g. Release (1.2.x)).

      • Make sure you enter the correct branch name (e.g. release-1.2.x).
    • Important: Deselect Enable Branch - if you do not do this, it will immediately fire off a build.

    • Do Immediately: On the Branch Details page, enable Change trigger. Set the Trigger type to manual; this will prevent commits to the branch from triggering the build plan. You should have been redirected to the Branch Details tab after creating the plan. If not, navigate to the branch from the list where you clicked Create Plan Branch in the previous step.

    • Go to the Variables tab. Ensure that you are on your branch plan and not the master plan: You should not see a large list of configured variables, but instead a dropdown allowing you to select variables to override, and the tab title will be Branch Variables. Then set the branch variables as follows:

      • DEPLOYMENT: cumulus-from-npm-tf (except in special cases such as incompatible backport branches)
        • If this variable is not set, it will default to the deployment name for the last committer on the branch
      • USE_CACHED_BOOTSTRAP: false
      • USE_TERRAFORM_ZIPS: true (IMPORTANT: MUST be set in order to run integration tests against the .zip files published during the build so that we are actually testing our released files)
      • GIT_PR: true
      • SKIP_AUDIT: true
      • PUBLISH_FLAG: true
    • Enable the branch from the Branch Details page.

    • Run the branch using the Run button in the top right.

    Bamboo will build and run lint, audit and unit tests against that tagged release, publish the new packages to NPM, and then run the integration tests using those newly released packages.

    12. Create a new Cumulus release on github

    The CI release scripts will automatically create a GitHub release based on the release version tag, as well as upload artifacts to the Github release for the Terraform modules provided by Cumulus. The Terraform release artifacts include:

    • A multi-module Terraform .zip artifact containing filtered copies of the tf-modules, packages, and tasks directories for use as Terraform module sources.
    • An S3 replicator module
    • A workflow module
    • A distribution API module
    • An ECS service module

    Just make sure to verify the appropriate .zip files are present on Github after the release process is complete.

    13. Merge base branch back to master

    Finally, you need to propagate the version update changes back to master.

    If this is the latest version, you can simply create a PR to merge the minor version base branch back to master.

    Do not merge master back into the release branch since we want the release branch to just have the code from the release. Instead, create a new branch off of the release branch and merge that to master. You can freely merge master into this branch and delete it when it is merged to master.

    If this is a backport, you will need to create a PR that ports the changelog updates back to master. It is important in this changelog note to call it out as a backport. For example, fixes in backport version 1.14.5 may not be available in 1.15.0 because the fix was introduced in 1.15.3.

    Troubleshooting

    Delete and regenerate the tag

    To delete and re-create a published tag, follow these steps:

    git tag -d vMAJOR.MINOR.PATCH
    git push -d origin vMAJOR.MINOR.PATCH

    e.g.:
    git tag -d v9.1.0
    git push -d origin v9.1.0
    Version: v10.0.0

    Cumulus Documentation: How To's

    Cumulus Docs Installation

    Run a Local Server

    Environment variables DOCSEARCH_API_KEY and DOCSEARCH_INDEX_NAME must be set for search to work. At the moment, search is only truly functional on prod because that is the only website we have registered to be indexed with DocSearch (see below on search).

    git clone git@github.com:nasa/cumulus
    cd cumulus
    npm run docs-install
    npm run docs-serve

    Note: docs-build will build the documents into website/build.

    Cumulus Documentation

    Our project documentation is hosted on GitHub Pages. The resources published to this website are housed in the docs/ directory at the top of the Cumulus repository. Those resources primarily consist of markdown files and images.

    We use the open-source static website generator Docusaurus to build html files from our markdown documentation, add some organization and navigation, and provide some other niceties in the final website (search, easy templating, etc.).

    Add a New Page and Sidebars

    Adding a new page should be as simple as writing some documentation in markdown, placing it under the correct directory in the docs/ folder and adding some configuration values wrapped by --- at the top of the file. There are many files that already have this header which can be used as reference.

    ---
    id: doc-unique-id # unique id for this document. This must be unique across ALL documentation under docs/
    title: Title Of Doc # Whatever title you feel like adding. This will show up as the index to this page on the sidebar.
    hide_title: false
    ---

    Note: To have the new page show up in a sidebar, the designated id must be added to a sidebar in the website/sidebars.js file. Docusaurus has an in-depth explanation of sidebars here.

    Versioning Docs

    We lean heavily on Docusaurus for versioning. Their suggestions and walk-through can be found here. It is worth noting that we would like the Documentation versions to match up directly with release versions. Cumulus versioning is explained in the Versioning Docs.

    Search on our documentation site is taken care of by DocSearch. We have been provided with an apiKey and an indexName by DocSearch that we include in our website/siteConfig.js file. The rest, indexing and actual searching, we leave to DocSearch. Our builds expect environment variables for both these values to exist - DOCSEARCH_API_KEY and DOCSEARCH_NAME_INDEX.

    Add a new task

    The tasks list in docs/tasks.md is generated from the list of task packages in the tasks folder. Do not edit the docs/tasks.md file directly.

    Read more about adding a new task.

    Editing the tasks.md header or template

    Look at the bin/build-tasks-doc.js and bin/tasks-header.md files to edit the output of the tasks build script.

    Editing diagrams

    For some diagrams included in the documentation, the raw source is included in the docs/assets/raw directory to allow for easy updating in the future:

    • assets/interfaces.svg -> assets/raw/interfaces.drawio (generated using draw.io)

    Deployment

    The master branch is automatically built and deployed to the gh-pages branch. The gh-pages branch is served by GitHub Pages. Do not make edits to the gh-pages branch.

    Version: v10.0.0

    External Contributions

    Contributions to Cumulus may be made in the form of PRs to the repositories directly or through externally developed tasks and components. Cumulus is designed as an ecosystem that leverages Terraform deployments and AWS Step Functions to easily integrate external components.

    This list may not be exhaustive and represents components that are open source, owned externally, and that have been tested with the Cumulus system. For more information and contributing guidelines, visit the respective GitHub repositories.

    Distribution

    The ASF Thin Egress App is used by Cumulus for distribution. TEA can be deployed with Cumulus or as part of other applications to distribute data.

    Operational Cloud Recovery Archive (ORCA)

    ORCA can be deployed with Cumulus to provide a customizable baseline for creating and managing operational backups.

    Workflow Tasks

    CNM

    PO.DAAC provides two workflow tasks to be used with the Cloud Notification Mechanism (CNM) Schema: CNM to Granule and CNM Response.

    See the CNM workflow data cookbook for an example of how these can be used in a Cumulus ingest workflow.

    DMR++ Generation

    GHRC has provided a DMR++ Generation workflow task. This task is meant to be used in conjunction with Cumulus' Hyrax Metadata Updates workflow task.

    Version: v10.0.0

    Frequently Asked Questions

    Below are some commonly asked questions that you may encounter that can assist you along the way when working with Cumulus.

    General

    How do I deploy a new instance in Cumulus?

    Answer: For steps on the Cumulus deployment process go to How to Deploy Cumulus.

    What prerequisites are needed to setup Cumulus?

    Answer: You will need access to the AWS console and an Earthdata login before you can deploy Cumulus.

    What is the preferred web browser for the Cumulus environment?

    Answer: Our preferred web browser is the latest version of Google Chrome.

    How do I quickly troubleshoot an issue in Cumulus?

    Answer: To troubleshoot and fix issues in Cumulus reference our recommended solutions in Troubleshooting Cumulus.

    Where can I get support help?

    Answer: The following options are available for assistance:

    • Cumulus: Outside NASA users should file a GitHub issue and inside NASA users should file a JIRA issue.
    • AWS: You can create a case in the AWS Support Center, accessible via your AWS Console.

    Integrators & Developers

    What is a Cumulus integrator?

    Answer: Those who are working within Cumulus and AWS for deployments and to manage workflows. They may perform the following functions:

    • Configure and deploy Cumulus to the AWS environment
    • Configure Cumulus workflows
    • Write custom workflow tasks
    What are the steps if I run into an issue during deployment?

    Answer: If you encounter an issue with your deployment go to the Troubleshooting Deployment guide.

    Is Cumulus customizable and flexible?

    Answer: Yes. Cumulus is a modular architecture that allows you to decide which components you want/need to deploy. These components are maintained as Terraform modules.

    What are Terraform modules?

    Answer: They are modules that are composed to create a Cumulus deployment, which gives integrators the flexibility to choose the components of Cumulus that they want/need. To view Cumulus maintained modules or steps on how to create a module go to Terraform modules.

    Where do I find Terraform module variables?

    Answer: Go here for a list of Cumulus maintained variables.

    What is a Cumulus workflow?

    Answer: A workflow is a provider-configured set of steps that describe the process to ingest data. Workflows are defined using AWS Step Functions. For more details, we suggest visiting here.

    How do I set up a Cumulus workflow?

    Answer: You will need to create a provider, have an associated collection (add a new one), and generate a new rule first. Then you can set up a Cumulus workflow by following these steps here.

    What are the common use cases that a Cumulus integrator encounters?

    Answer: The following are some examples of possible use cases you may see:


    Operators

    What is a Cumulus operator?

    Answer: Those who ingest, archive, and troubleshoot datasets (called collections in Cumulus). Your daily activities might include, but are not limited to, the following:

    • Ingesting datasets
    • Maintaining historical data ingest
    • Starting and stopping data handlers
    • Managing collections
    • Managing provider definitions
    • Creating, enabling, and disabling rules
    • Investigating errors for granules and deleting or re-ingesting granules
    • Investigating errors in executions and isolating failed workflow step(s)
    What are the common use cases that a Cumulus operator encounters?

    Answer: The following are some examples of possible use cases you may see:

    Can you re-run a workflow execution in AWS?

    Answer: Yes. For steps on how to re-run a workflow execution go to Re-running workflow executions in the Cumulus Operator Docs.

    Version: v10.0.0

    Ancillary Metadata Export

    This feature utilizes the type key on a files object in a Cumulus granule. It uses the key to provide a mechanism where granule discovery, processing and other tasks can set and use this value to facilitate metadata export to CMR.

    Tasks setting type

    Discover Granules

    Uses the Collection type key to set the value for files on discovered granules in its output.

    Parse PDR

    Uses a task-specific mapping to map PDR 'FILE_TYPE' to a CNM type to set type on granules from the PDR.

    CNMToCMALambdaFunction

    Natively supports types that are included in incoming messages to a CNM Workflow.

    Tasks using type

    Move Granules

    Uses the granule file type key to update UMM/ECHO 10 CMR files passed in as candidates to the task. This task adds the external facing URLs to the CMR metadata file based on the type. See the file tracking data cookbook for a detailed mapping. If a non-CNM type is specified, the task assumes it is a 'data' file.

    Version: v10.0.0

    Cumulus Backup and Restore

    DynamoDB

    Backup and Restore with AWS

    You can enable point-in-time recovery (PITR) as well as create an on-demand backup for your Amazon DynamoDB tables.

    PITR provides continuous backups of your DynamoDB table data. PITR can be enabled through your Terraform deployment, the AWS console, or the AWS API. When enabled, DynamoDB maintains continuous backups of your table up to the last 35 days. You can recover a copy of that table to a previous state at any point in time from the moment you enable PITR, up to a maximum of the 35 preceding days. PITR provides continuous backups until you explicitly disable it.

    On-demand backups allow you to create backups of DynamoDB table data and its settings. You can initiate an on-demand backup at any time with a single click from the AWS Management Console or a single API call. You can restore the backups to a new DynamoDB table in the same AWS Region at any time.

    PITR gives your DynamoDB tables continuous protection from accidental writes and deletes. With PITR, you do not have to worry about creating, maintaining, or scheduling backups. You enable PITR on your table and your backup is available for restore at any point in time from the moment you enable it, up to a maximum of the 35 preceding days. For example, imagine a test script writing accidentally to a production DynamoDB table. You could recover your table to any point in time within the last 35 days.

    On-demand backups help with long-term archival requirements for regulatory compliance. On-demand backups give you full-control of managing the lifecycle of your backups, from creating as many backups as you need to retaining these for as long as you need.

    Enabling PITR during deployment

    By default, the Cumulus data-persistence module enables PITR on the default tables listed in the module's variable defaults for enable_point_in_time_tables. At the time of writing, that list includes:

    • AsyncOperationsTable
    • CollectionsTable
    • ExecutionsTable
    • FilesTable
    • GranulesTable
    • PdrsTable
    • ProvidersTable
    • RulesTable

    If you wish to change this list, simply update your deployment's data_persistence module (here in the template-deploy repository) to pass the correct list of tables.
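
    For example, a reduced list could be passed through to the module like this sketch (the table names shown are just a subset of the defaults listed above):

    module "data_persistence" {
      # ... other data-persistence configuration ...
      enable_point_in_time_tables = [
        "CollectionsTable",
        "GranulesTable",
      ]
    }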

    Restoring with PITR

    Restoring a full deployment

    If your deployment has been deleted, all of your tables with PITR enabled will have had backups created automatically. You can locate these backups in the AWS console on the DynamoDB Backups page or through the CLI by running:

    aws dynamodb list-backups --backup-type SYSTEM

    You can restore your tables to your AWS account using the following command:

    aws dynamodb restore-table-from-backup --target-table-name <prefix>-CollectionsTable --backup-arn <backup-arn>

    Where prefix matches the prefix from your data-persistence deployment. backup-arn can be found in the AWS console or by listing the backups using the command above.

    This will restore your tables to AWS. They will need to be linked to your Terraform deployment. After terraform init and before terraform apply, run the following command for each table:

    terraform import module.data_persistence.aws_dynamodb_table.collections_table <prefix>-CollectionsTable

    replacing collections_table with the table identifier in the DynamoDB Terraform table definitions.

    Terraform will now manage these tables as part of the Terraform state. Run terraform apply to generate the rest of the data-persistence deployment and then follow the instructions to deploy the cumulus deployment as normal.

    At this point the data will be in DynamoDB, but not in Elasticsearch, so nothing will be returned on the Operator dashboard or through Operator API calls. To get the data into Elasticsearch, run an index-from-database operation via the Operator API. The status of this operation can be viewed on the dashboard. When Elasticsearch is switched to the recovery index the data will be visible on the dashboard and available via the Operator API.

    Restoring an individual table

    A table can be restored to a previous state using PITR. This is easily achievable via the AWS Console by visiting the Backups tab for the table.

    A table can only be recovered to a new table name. Following the restoration of the table, the new table must be imported into Terraform.

    First, remove the old table from the Terraform state:

    terraform state rm module.data_persistence.aws_dynamodb_table.collections_table

    replacing collections_table with the table identifier in the DynamoDB Terraform table definitions.

    Then import the new table into the Terraform state:

    terraform import module.data_persistence.aws_dynamodb_table.collections_table <new-table-name>

    replacing collections_table with the table identifier in the DynamoDB Terraform table definitions.

    Your data-persistence and cumulus deployments should be redeployed so that your instance of Cumulus uses this new table. After the deployment, your Elasticsearch instance will be out of sync with your new table if there is any change in data. To resync your Elasticsearch with your database run an index-from-database operation via the Operator API. The status of this operation can be viewed on the dashboard. When Elasticsearch is switched to the new index the DynamoDB tables and Elasticsearch instance will be in sync and the correct data will be reflected on the dashboard.

    Backup and Restore with cumulus-api CLI

    cumulus-api CLI also includes a backup and restore command. The CLI backup command downloads the content of any of your DynamoDB tables to .json files. You can also use these .json files to restore the records to another DynamoDB table.

    Backup with the CLI

    To backup a table with the CLI, install the @cumulus/api package using npm, making sure to install the same version as your Cumulus deployment:

    npm install -g @cumulus/api@version

    Then run:

    cumulus-api backup --table <table-name>

    The backup will be stored at backups/<table-name>.json.

    Restore with the CLI

    To restore data from a JSON file, run the following command:

    cumulus-api restore backups/<table-name>.json --table <table-name>

    The restore can go to the in-use table and will update Elasticsearch. If an existing record exists in the table it will not be duplicated but will be updated with the record from the restore file.

    Data Backup and Restore

    Cumulus provides no core functionality to backup data stored in S3. Data disaster recovery is being developed in a separate effort here.

    Version: v10.0.0

    Cumulus Metadata in DynamoDB

    @cumulus/api uses a number of methods to preserve the metadata generated in a Cumulus instance.

    All configurations and system-generated metadata are stored in DynamoDB tables, except the logs. System logs are stored in the AWS CloudWatch service.

    Amazon DynamoDB stores three geographically distributed replicas of each table to enable high availability and data durability. Amazon DynamoDB runs exclusively on solid-state drives (SSDs). SSDs help AWS achieve the design goals of predictable low-latency response times for storing and accessing data at any scale.

    DynamoDB Auto Scaling

    Cumulus deployed tables from the data-persistence module are set to on-demand mode.

    Version: v10.0.0

    Cumulus Dead Letter Archive

    This documentation explains the Cumulus dead letter archive and associated functionality.

    DB Records DLQ Archive

    The Cumulus system contains a number of dead letter queues. Perhaps the most important system lambda function supported by a DLQ is the sfEventSqsToDbRecords lambda function which parses Cumulus messages from workflow executions to generate and write database records to the Cumulus database.

    As of Cumulus v9+, the dead letter queue for this lambda (named sfEventSqsToDbRecordsDeadLetterQueue) has been updated with a consumer lambda that will automatically write any incoming records to the S3 system bucket, under the path <stackName>/dead-letter-archive/sqs/. This will allow integrators and operators engaged in debugging missing records to inspect any Cumulus messages which failed to process and did not result in the successful creation of database records.

    Dead Letter Archive recovery

    In addition to the above, as of Cumulus v9+, the Cumulus API also contains a new endpoint at /deadLetterArchive/recoverCumulusMessages.

    Sending a POST request to this endpoint will trigger a Cumulus AsyncOperation that will attempt to reprocess (and if successful delete) all Cumulus messages in the dead letter archive, using the same underlying logic as the existing sfEventSqsToDbRecords.

    This endpoint may prove particularly useful when recovering from an extended or unexpected database outage, where messages failed to process due to the external outage and there is no essential malformation of each Cumulus message.
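
    A hypothetical invocation might look like the following (the API host, stage, and token handling depend on your deployment and are assumptions here):

    curl -X POST \
      -H "Authorization: Bearer <access-token>" \
      https://<archive-api-host>/<stage>/deadLetterArchive/recoverCumulusMessages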

    Version: v10.0.0

    Dead Letter Queues

    startSF SQS queue

    The workflow-trigger for the startSF queue has a Redrive Policy set up that directs any failed attempts to pull from the workflow start queue to an SQS Dead Letter Queue.

    This queue can then be monitored for failures to initiate a workflow. Please note that workflow failures will not show up in this queue, only repeated failure to trigger a workflow.

    Named Lambda Dead Letter Queues

    Cumulus provides configured Dead Letter Queues (DLQ) for non-workflow Lambdas (such as ScheduleSF) to capture Lambda failures for further processing.

    These DLQs are set up with the following configuration:

    receive_wait_time_seconds  = 20
    message_retention_seconds  = 1209600
    visibility_timeout_seconds = 60

    Default Lambda Configuration

    The following built-in Cumulus Lambdas are set up with DLQs to allow handling of process failures:

    • dbIndexer (Updates Elasticsearch based on DynamoDB events)
    • JobsLambda (writes logs outputs to Elasticsearch)
    • ScheduleSF (the SF Scheduler Lambda that places messages on the queue that is used to start workflows, see Workflow Triggers)
    • publishReports (Lambda that publishes messages to the SNS topics for execution, granule and PDR reporting)
    • reportGranules, reportExecutions, reportPdrs (Lambdas responsible for updating records based on messages in the queues published by publishReports)

    Troubleshooting/Utilizing messages in a Dead Letter Queue

    Ideally, an automated process should be configured to poll the queue and process messages off a dead letter queue.

    For aid in manual troubleshooting, you can utilize the SQS Management console to view messages available in the queues set up for a particular stack. The dead letter queues will have a Message Body containing the Lambda payload, as well as Message Attributes that reference both the error returned and a RequestID which can be cross-referenced to the associated Lambda's CloudWatch logs for more information:

    Screenshot of the AWS SQS console showing how to view SQS message attributes

    Version: v10.0.0

    Cumulus Distribution Metrics

    It is possible to configure Cumulus and the Cumulus Dashboard to display information about the successes and failures of requests for data. This requires the Cumulus instance to deliver Cloudwatch Logs and S3 Server Access logs to an ELK stack.

    ESDIS Metrics in NGAP

Work with the ESDIS metrics team to set up permissions and access to forward CloudWatch Logs to a shared AWS::Logs::Destination, as well as to transfer your S3 Server Access logs to a metrics team bucket.

    The metrics team has taken care of setting up logstash to ingest the files that get delivered to their bucket into their Elasticsearch instance.

Once Cumulus has been configured to deliver CloudWatch logs to the ESDIS Metrics team, you can use the Elasticsearch indexes to create the necessary target patterns on the dashboard. These are often <daac>-cloudwatch-cumulus-<env>-* and <daac>-distribution-<env>-*, but they will depend on your specific Elasticsearch setup.

    Cumulus / ESDIS Metrics distribution system

    Architecture diagram showing how logs are replicated from a Cumulus instance to the ESDIS Metrics account and accessed by the Cumulus dashboard

diff --git a/docs/v10.0.0/features/execution_payload_retention/index.html b/docs/v10.0.0/features/execution_payload_retention/index.html
    Version: v10.0.0

    Execution Payload Retention

In addition to CloudWatch logs and AWS Step Function API records, Cumulus automatically stores the initial and 'final' (the last update to the execution record) payload values as part of the Execution record in DynamoDB and Elasticsearch.

    This allows access via the API (or optionally direct DB/Elasticsearch querying) for debugging/reporting purposes. The data is stored in the "originalPayload" and "finalPayload" fields.

    Payload record cleanup

To reduce storage requirements, a CloudWatch rule ({stack-name}-dailyExecutionPayloadCleanupRule) has been added that triggers a daily run of the provided cleanExecutions Lambda. This Lambda removes the payload data from 'completed' and 'non-completed' execution records in the database that are older than the configured thresholds.

    Configuration

The following configuration flags are available in the cumulus module. They may be overridden in your deployment's instance of the cumulus module by setting the corresponding configuration options:

daily_execution_payload_cleanup_schedule_expression (string)

    This configuration option sets the execution times for this Lambda to run, using a Cloudwatch cron expression.

    Default value is "cron(0 4 * * ? *)".

complete_execution_payload_timeout_disable (bool)

    This configuration option, when set to true, will disable all cleanup of completed execution payloads.

    Default value is false.

complete_execution_payload_timeout (number)

This flag defines the cleanup threshold, in days, for executions with a 'completed' status. Records with updatedAt values older than this that have payload information will have that information removed.

    Default value is 10.

non_complete_execution_payload_timeout_disable (bool)

    This configuration option, when set to true, will disable all cleanup of "non-complete" (any status other than completed) execution payloads.

    Default value is false.

non_complete_execution_payload_timeout (number)

This flag defines the cleanup threshold, in days, for executions with a status other than 'completed'. Records with updatedAt values older than this that have payload information will have that information removed.

    Default value is 30 days.

• complete_execution_payload_timeout_disable / non_complete_execution_payload_timeout_disable

These flags (true/false) determine whether the cleanup script's logic for 'completed' and 'non-completed' executions will run. The default value is false for both, meaning cleanup runs for both categories.

diff --git a/docs/v10.0.0/features/logging-esdis-metrics/index.html b/docs/v10.0.0/features/logging-esdis-metrics/index.html
    Version: v10.0.0

    Writing logs for ESDIS Metrics

    Note: This feature is only available for Cumulus deployments in NGAP environments.

    Prerequisite: You must configure your Cumulus deployment to deliver your logs to the correct shared logs destination for ESDIS metrics.

    Log messages delivered to the ESDIS metrics logs destination conforming to an expected format will be automatically ingested and parsed to enable helpful searching/filtering of your logs via the ESDIS metrics Kibana dashboard.

    Expected log format

    The ESDIS metrics pipeline expects a log message to be a JSON string representation of an object (dict in Python or map in Java). An example log message might look like:

{
  "level": "info",
  "executions": "arn:aws:states:us-east-1:000000000000:execution:MySfn:abcd1234",
  "granules": "[\"granule-1\",\"granule-2\"]",
  "message": "hello world",
  "sender": "greetingFunction",
  "stackName": "myCumulus",
  "timestamp": "2018-10-19T19:12:47.501Z"
}

    A log message can contain the following properties:

    • executions: The AWS Step Function execution name in which this task is executing, if any
    • granules: A JSON string of the array of granule IDs being processed by this code, if any
    • level: A string identifier for the type of message being logged. Possible values:
      • debug
      • error
      • fatal
      • info
      • warn
      • trace
    • message: String containing your actual log message
    • parentArn: The parent AWS Step Function execution ARN that triggered the current execution, if any
    • sender: The name of the resource generating the log message (e.g. a library name, a Lambda function name, an ECS activity name)
    • stackName: The unique prefix for your Cumulus deployment
    • timestamp: An ISO-8601 formatted timestamp
    • version: The version of the resource generating the log message, if any

    None of these properties are explicitly required for ESDIS metrics to parse your log correctly. However, a log without a message has no informational content. And having level, sender, and timestamp properties is very useful for filtering your logs. Including a stackName in your logs is helpful as it allows you to distinguish between logs generated by different deployments.

    Using Cumulus Message Adapter libraries

If you are writing a custom task that is integrated with the Cumulus Message Adapter, then some of the language-specific client libraries can be used to write logs compatible with ESDIS metrics.

    The usage of each library differs slightly, but in general a logger is initialized with a Cumulus workflow message to determine the contextual information for the task (e.g. granules, executions). Then, after the logger is initialized, writing logs only requires specifying a message, but the logged output will include the contextual information as well.

    Writing logs using custom code

    Any code that produces logs matching the expected log format can be processed by ESDIS metrics.
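As a minimal sketch, a shell-based task could emit a conforming log line to stdout (which CloudWatch captures for Lambda functions and ECS activities); the sender and stackName values are placeholders:

# Emit one ESDIS-metrics-compatible JSON log line to stdout
timestamp="$(date -u +%Y-%m-%dT%H:%M:%S.000Z)"   # ISO-8601 timestamp (millisecond field zero-padded)
printf '{"level":"info","message":"hello world","sender":"my-custom-task","stackName":"myCumulus","timestamp":"%s"}\n' "$timestamp"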

    Node.js

    Cumulus core provides a @cumulus/logger library that writes logs in the expected format for ESDIS metrics.

diff --git a/docs/v10.0.0/features/replay-archived-sqs-messages/index.html b/docs/v10.0.0/features/replay-archived-sqs-messages/index.html
    Version: v10.0.0

    How to replay SQS messages archived in S3

    Context

    Cumulus archives all incoming SQS messages to S3 and removes messages once they have been processed. Unprocessed messages are archived at the path: ${stackName}/archived-incoming-messages/${queueName}/${messageId}

    Replay SQS messages endpoint

The Cumulus API has added a new endpoint, /replays/sqs. This endpoint allows you to start a replay operation that requeues all archived SQS messages for a given queueName, and it returns an AsyncOperationId for operation status tracking.

    Start replaying archived SQS messages

    In order to start a replay, you must perform a POST request to the replays/sqs endpoint.

    The required and optional fields that should be part of the body of this request are documented below.

Field     | Type   | Description
queueName | string | Any valid SQS queue name (not ARN)
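For example, a minimal sketch of this request, where the API host, token, and queue name are placeholders:

$ curl --request POST https://example.com/replays/sqs \
  --header 'Authorization: Bearer ReplaceWithTheToken' \
  --header 'Content-Type: application/json' \
  --data '{"queueName": "my-queue-name"}'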

    Status tracking

    A successful response from the /replays/sqs endpoint will contain an asyncOperationId field. Use this ID with the /asyncOperations endpoint to track the status.
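For example, assuming the same placeholder host and token, the returned ID (shown here as a made-up value) can be checked with a request like:

$ curl https://example.com/asyncOperations/0eb8e809-8790-5409-1239-bcd9e8d28b8e \
  --header 'Authorization: Bearer ReplaceWithTheToken'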

diff --git a/docs/v10.0.0/features/replay-kinesis-messages/index.html b/docs/v10.0.0/features/replay-kinesis-messages/index.html
    Version: v10.0.0

    How to replay Kinesis messages after an outage

    After a period of outage, it may be necessary for a Cumulus operator to reprocess or 'replay' messages that arrived on an AWS Kinesis Data Stream but did not trigger an ingest. This document serves as an outline on how to start a replay operation, and how to perform status tracking. Cumulus supports replay of all Kinesis messages on a stream (subject to the normal RetentionPeriod constraints), or all messages within a given time slice delimited by start and end timestamps.

    As Kinesis has no comparable field to e.g. the SQS ReceiveCount on its records, Cumulus cannot tell which messages within a given time slice have never been processed, and cannot guarantee only missed messages will be processed. Users will have to rely on duplicate handling or some other method of identifying messages that should not be processed within the time slice.

    NOTE: This operation flow effectively changes only the trigger mechanism for Kinesis ingest notifications. The existence of valid Kinesis-type rules and all other normal requirements for the triggering of ingest via Kinesis still apply.

    Replays endpoint

    Cumulus has added a new endpoint to its API, /replays. This endpoint will allow you to start replay operations and returns an AsyncOperationId for operation status tracking.

    Start a replay

    In order to start a replay, you must perform a POST request to the replays endpoint.

    The required and optional fields that should be part of the body of this request are documented below.

NOTE: As the endTimestamp relies on a comparison with the Kinesis server-side ApproximateArrivalTimestamp, and given that there is no documented level of accuracy for the approximation, it is recommended that the endTimestamp include some amount of buffer to allow for slight discrepancies. If tolerable, the same is recommended for the startTimestamp, although it is used differently and is less vulnerable to discrepancies, since a server-side arrival timestamp should never be earlier than the client-side request timestamp.

Field                          | Type   | Required         | Description
type                           | string | required         | Currently only accepts kinesis.
kinesisStream                  | string | for type kinesis | Any valid Kinesis stream name (not ARN)
kinesisStreamCreationTimestamp | *      | optional         | Any input valid for a JS Date constructor. For reasons to use this field see AWS documentation on StreamCreationTimestamp.
endTimestamp                   | *      | optional         | Any input valid for a JS Date constructor. Messages newer than this timestamp will be skipped.
startTimestamp                 | *      | optional         | Any input valid for a JS Date constructor. Messages will be fetched from the Kinesis stream starting at this timestamp. Ignored if it is further in the past than the stream's retention period.
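For example, a minimal sketch of replaying a time slice of messages; the host, token, stream name, and timestamps are placeholders:

$ curl --request POST https://example.com/replays \
  --header 'Authorization: Bearer ReplaceWithTheToken' \
  --header 'Content-Type: application/json' \
  --data '{
    "type": "kinesis",
    "kinesisStream": "my-ingest-stream",
    "startTimestamp": "2022-01-01T00:00:00.000Z",
    "endTimestamp": "2022-01-02T00:00:00.000Z"
  }'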

    Status tracking

    A successful response from the /replays endpoint will contain an asyncOperationId field. Use this ID with the /asyncOperations endpoint to track the status.

diff --git a/docs/v10.0.0/features/reports/index.html b/docs/v10.0.0/features/reports/index.html

Reconciliation Reports (excerpt): … report generation. The data buckets will include any buckets in your Cumulus buckets configuration that have type public, protected or private.
diff --git a/docs/v10.0.0/getting-started/index.html b/docs/v10.0.0/getting-started/index.html
    Version: v10.0.0

    Getting Started

    Overview | Quick Tutorials | Helpful Tips

    Overview

    This serves as a guide for new Cumulus users to deploy and learn how to use Cumulus. Here you will learn what you need in order to complete any prerequisites, what Cumulus is and how it works, and how to successfully navigate and deploy a Cumulus environment.

    What is Cumulus

Cumulus is an open source set of components for creating cloud-based data ingest, archive, distribution, and management systems, designed for NASA's future Earth Science data streams.

    Who uses Cumulus

Data integrators/developers and operators across projects, not limited to NASA, use Cumulus in their daily work.

    Cumulus Roles

    Integrator/Developer

    Cumulus integrators/developers are those who work within Cumulus and AWS for deployments and to manage workflows.

    Operator

    Cumulus operators are those who work within Cumulus to ingest/archive data and manage collections.

    Role Guides

As a developer, integrator, or operator, you will need to set up your environments to work in Cumulus. The following docs can get you started in your role-specific activities.

    What is a Cumulus Data Type

    In Cumulus, we have the following types of data that you can create and manage:

    • Collections
    • Granules
    • Providers
    • Rules
    • Workflows
    • Executions
    • Reports

    For details on how to create or manage data types go to Data Management Types.


    Quick Tutorials

    Deployment & Configuration

    Cumulus is deployed to an AWS account, so you must have access to deploy resources to an AWS account to get started.

    1. Deploy Cumulus and Cumulus Dashboard to AWS

    Follow the deployment instructions to deploy Cumulus to your AWS account.

    2. Configure and Run the HelloWorld Workflow

    If you have deployed using the cumulus-template-deploy repository, you have a HelloWorld workflow deployed to your Cumulus backend.

    You can see your deployed workflows on the Workflows page of your Cumulus dashboard.

    Configure a collection and provider using the setup guidance on the Cumulus dashboard.

    Then create a rule to trigger your HelloWorld workflow. You can select a rule type of one time.

    Navigate to the Executions page of the dashboard to check the status of your workflow execution.

    3. Configure a Custom Workflow

    See Developing a custom workflow documentation for adding a new workflow to your deployment.

    There are plenty of workflow examples using Cumulus tasks here. The Data Cookbooks provide a more in-depth look at some of these more advanced workflows and their configurations.

    There is a list of Cumulus tasks already included in your deployment here.

    After configuring your workflow and redeploying, you can configure and run your workflow using the same steps as in step 2.


    Helpful Tips

    Here are some useful tips to keep in mind when deploying or working in Cumulus.

    Integrator/Developer

    • Versioning and Releases: This documentation gives information on our global versioning approach. We suggest upgrading to the supported version for Cumulus, Cumulus dashboard, and Thin Egress App (TEA).
    • Cumulus Developer Documentation: We suggest that you read through and reference this resource for development best practices in Cumulus.
    • Cumulus Deployment: We will guide you on how to manually deploy a new instance of Cumulus. In this reference, you will learn how to install Terraform, create an AWS S3 bucket, configure a compatible database, and create a Lambda layer.
    • Terraform Best Practices: This will help guide you through your Terraform configuration and Cumulus deployment. For an introduction about Terraform go here.
    • Integrator Common Use Cases: Scenarios to help integrators along in the Cumulus environment.

    Operator

    Troubleshooting

    Troubleshooting: Some suggestions to help you troubleshoot and solve issues you may encounter.

    Resources

diff --git a/docs/v10.0.0/glossary/index.html b/docs/v10.0.0/glossary/index.html
    Version: v10.0.0

    Glossary

    AWS Glossary

    For terms/items from Amazon/AWS not mentioned in this glossary, please refer to the AWS Glossary.

    Cumulus Glossary of Terms

    API Gateway

    Refers to AWS's API Gateway. Used by the Cumulus API.

    ARN

    Refers to an AWS "Amazon Resource Name".

    For more info, see the AWS documentation.

    AWS

    See: aws.amazon.com

    AWS Lambda/Lambda Function

AWS's 'serverless' option. Allows the running of code without provisioning servers or managing server/ECS instances/etc.

    For more information, see the AWS Lambda documentation.

    AWS Access Keys

Access credentials that give you access to AWS to act as an IAM user programmatically or from the command line.

    For more information, see the AWS IAM Documentation.

    Bucket

    An Amazon S3 cloud storage resource.

    For more information, see the AWS Bucket Documentation.

    CloudFormation

    An AWS service that allows you to define and manage cloud resources as a preconfigured block.

    For more information, see the AWS CloudFormation User Guide.

    Cloudformation Template

A template that defines an AWS CloudFormation stack.

    For more information, see the AWS intro page.

    Cloudwatch

An AWS service that provides logging and metrics collection for the various cloud resources you have in AWS.

    For more information, see the AWS User Guide.

    Cloud Notification Mechanism (CNM)

    An interface mechanism to support cloud-based ingest messaging. For more information, see PO.DAAC's CNM Schema.

    Common Metadata Repository (CMR)

    "A high-performance, high-quality, continuously evolving metadata system that catalogs Earth Science data and associated service metadata records". For more information, see NASA's CMR page.

    Collection (Cumulus)

    Cumulus Collections are logical sets of data objects of the same data type and version.

    For more information, see cookbook reference page.

    Cumulus Message Adapter (CMA)

    A library designed to help task developers integrate step function tasks into a Cumulus workflow by adapting task input/output into the Cumulus Message format.

    For more information, see CMA workflow reference page.

    Distributed Active Archive Center (DAAC)

Refers to a specific organization that's part of NASA's distributed system of archive centers. For more information, see EOSDIS's DAAC page.

    Dead Letter Queue (DLQ)

This refers to Amazon SQS Dead-Letter Queues - these SQS queues are specifically configured to capture failed messages from other services, SQS queues, etc. so that those messages can be processed later.

    For more on DLQs, see the Amazon Documentation and the Cumulus DLQ feature page.

    Developer

Those who set up deployment and workflow management for Cumulus. Sometimes referred to as an integrator. See Integrator/Developer.

    ECS

    Amazon's Elastic Container Service. Used in Cumulus by workflow steps that require more flexibility than Lambda can provide.

    For more information, see AWS's developer guide.

    ECS Activity

    An ECS instance run via a Step Function.

    Execution (Cumulus)

    A Cumulus execution refers to a single execution of a (Cumulus) Workflow.

    GIBS

    Global Imagery Browse Services

    Granule

    A granule is the smallest aggregation of data that can be independently managed (described, inventoried, and retrieved). Granules are always associated with a collection, which is a grouping of granules. A granule is a grouping of data files.

    IAM

    AWS Identity and Access Management.

    For more information, see AWS IAMs.

    Integrator/Developer

    Those who work within Cumulus and AWS for deployments and to manage workflows.

    Kinesis

    Amazon's platform for streaming data on AWS.

    See AWS Kinesis for more information.

    Lambda

    AWS's cloud service that lets you run code without provisioning or managing servers.

    For more information, see AWS's lambda page.

    Module (Terraform)

    Refers to a terraform module.

    Node

    See node.js.

    Npm

    Node package manager.

    For more information, see npmjs.com.

    Operator

    Those who work within Cumulus to ingest/archive data and manage collections.

    PDR

    "Polling Delivery Mechanism" used in "DAAC Ingest" workflows.

    For more information, see nasa.gov.

    Packages (NPM)

NPM-hosted node.js packages. Cumulus packages can be found on NPM's site here.

    Provider

    Data source that generates and/or distributes data for Cumulus workflows to act upon.

    For more information, see the Cumulus documentation.

    Rule

    Rules are configurable scheduled events that trigger workflows based on various criteria.

    For more information, see the Cumulus Rules documentation.

    S3

    Amazon's Simple Storage Service provides data object storage in the cloud. Used in Cumulus to store configuration, data and more.

    For more information, see AWS's s3 page.

    SIPS

    Science Investigator-led Processing Systems. In the context of DAAC ingest, this refers to data producers/providers.

    For more information, see nasa.gov.

    SNS

    Amazon's Simple Notification Service provides a messaging service that allows publication of and subscription to events. Used in Cumulus to trigger workflow events, track event failures, and others.

    For more information, see AWS's SNS page.

    SQS

    Amazon's Simple Queue Service.

    For more information, see AWS's SQS page.

    Stack

    A collection of AWS resources you can manage as a single unit.

In the context of Cumulus, this refers to a deployment of the cumulus and data-persistence modules that is managed by Terraform.

    Step Function

    AWS's web service that allows you to compose complex workflows as a state machine comprised of tasks (Lambdas, activities hosted on EC2/ECS, some AWS service APIs, etc). See AWS's Step Function Documentation for more information. In the context of Cumulus these are the underlying AWS service used to create Workflows.

    Terraform

    Terraform is the tool that you will use for deployment and configuration of your Cumulus environment.

    Workflows

Workflows are composed of one or more AWS Lambda Functions and ECS Activities to discover, ingest, process, manage, and archive data.

diff --git a/docs/v10.0.0/index.html b/docs/v10.0.0/index.html
    Version: v10.0.0

    Introduction

    This Cumulus project seeks to address the existing need for a “native” cloud-based data ingest, archive, distribution, and management system that can be used for all future Earth Observing System Data and Information System (EOSDIS) data streams via the development and implementation of Cumulus. The term “native” implies that the system will leverage all components of a cloud infrastructure provided by the vendor for efficiency (in terms of both processing time and cost). Additionally, Cumulus will operate on future data streams involving satellite missions, aircraft missions, and field campaigns.

This documentation includes guidelines, examples, and source code docs. It is accessible at https://nasa.github.io/cumulus.


    Get To Know Cumulus

    • Getting Started - here - If you are new to Cumulus we suggest that you begin with this section to help you understand and work in the environment.
    • General Cumulus Documentation - here <- you're here

    Cumulus Reference Docs

    • Cumulus API Documentation - here
    • Cumulus Developer Documentation - here - READMEs throughout the main repository.
    • Data Cookbooks - here

    Auxiliary Guides

    • Integrator Guide - here
    • Operator Docs - here

    Contributing

    Please refer to: https://github.com/nasa/cumulus/blob/master/CONTRIBUTING.md for information. We thank you in advance.

diff --git a/docs/v10.0.0/integrator-guide/about-int-guide/index.html b/docs/v10.0.0/integrator-guide/about-int-guide/index.html
    Version: v10.0.0

    About Integrator Guide

    Purpose

The Integrator Guide is meant to supplement the Cumulus documentation and Data Cookbooks. This content is for Cumulus integrators who are either new to the project or need a step-by-step resource to help them along.

    What Is A Cumulus Integrator

    Cumulus integrators are those who work within Cumulus and AWS for deployments and to manage workflows. They may perform the following functions:

    • Configure and deploy Cumulus to the AWS environment
    • Configure Cumulus workflows
    • Write custom workflow tasks
diff --git a/docs/v10.0.0/integrator-guide/int-common-use-cases/index.html b/docs/v10.0.0/integrator-guide/int-common-use-cases/index.html

diff --git a/docs/v10.0.0/integrator-guide/workflow-add-new-lambda/index.html b/docs/v10.0.0/integrator-guide/workflow-add-new-lambda/index.html
    Version: v10.0.0

    Workflow - Add New Lambda

    You can develop a workflow task in AWS Lambda or Elastic Container Service (ECS). AWS ECS requires Docker. For a list of tasks to use go to our Cumulus Tasks page.

The following steps are to help you along as you write a new Lambda that integrates with a Cumulus workflow. This will also aid your understanding of the Cumulus Message Adapter (CMA) process.

    Steps

    1. Define New Lambda in Terraform

    2. Add Task in JSON Object

      For details on how to set up a workflow via CMA go to the CMA Tasks: Message Flow.

      You will need to assign input and output for the new task and follow the CMA contract here. This contract defines how libraries should call the cumulus-message-adapter to integrate a task into an existing Cumulus Workflow.

    3. Verify New Task

      Check the updated workflow in AWS and in Cumulus.

diff --git a/docs/v10.0.0/integrator-guide/workflow-ts-failed-step/index.html b/docs/v10.0.0/integrator-guide/workflow-ts-failed-step/index.html
    Version: v10.0.0

    Workflow - Troubleshoot Failed Step(s)

    Steps

1. Locate Step
• Go to the Cumulus dashboard
• Find the granule
• Go to Executions to determine the failed step
2. Investigate in CloudWatch
• Go to CloudWatch
• Locate the Lambda for the failed step
• Search the CloudWatch logs (see the example command after these steps)
3. Recreate Error

  In your sandbox environment, try to recreate the error.

4. Resolution
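For step 2, the following is a minimal sketch of searching a Lambda's CloudWatch logs from the command line; the log group name and filter pattern are placeholders that depend on your deployment prefix and the failing step:

# Search the last hour of a (hypothetical) task Lambda's logs for errors
aws logs filter-log-events \
  --log-group-name /aws/lambda/my-prefix-SyncGranule \
  --filter-pattern "error" \
  --start-time "$(($(date +%s) - 3600))000"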

diff --git a/docs/v10.0.0/interfaces/index.html b/docs/v10.0.0/interfaces/index.html
    Version: v10.0.0

    Interfaces

    Cumulus has multiple interfaces that allow interaction with discrete components of the system, such as starting workflows via SNS/Kinesis/SQS, manually queueing workflow start messages, submitting SNS notifications for completed workflows, and the many operations allowed by the Cumulus API.

    The diagram below illustrates the workflow process in detail and the various interfaces that allow starting of workflows, reporting of workflow information, and database create operations that occur when a workflow reporting message is processed. For interfaces with expected input or output schemas, details are provided below.

Note: This diagram is current as of v1.18.0.

    Architecture diagram showing the interfaces for triggering and reporting of Cumulus workflow executions

    Workflow triggers and queuing

    Kinesis stream

    As a Kinesis stream is consumed by the messageConsumer Lambda to queue workflow executions, the incoming event is validated against this consumer schema by the ajv package.

    SQS queue for executions

    The messages put into the SQS queue for executions should conform to the Cumulus message format.

    Workflow executions

    See the documentation on Cumulus workflows.

    Workflow reporting

    SNS reporting topics

    For granule and PDR reporting, the topics will only receive data if the Cumulus workflow execution message meets the following criteria:

    • Granules - workflow message contains granule data in payload.granules
    • PDRs - workflow message contains PDR data in payload.pdr

    The messages published to the SNS reporting topics for executions and PDRs and the record property in the messages published to the granules SNS topic should conform to the model schema for each data type.

    Further detail on workflow reporting and how to interact with these interfaces can be found in the workflow notifications data cookbook.

    Cumulus API

    See the Cumulus API documentation.

diff --git a/docs/v10.0.0/operator-docs/about-operator-docs/index.html b/docs/v10.0.0/operator-docs/about-operator-docs/index.html
    Version: v10.0.0

    About Operator Docs

    Purpose

    Operator Docs are an augmentation to Cumulus documentation and Data Cookbooks. These documents will walk step-by-step through common Cumulus activities (that aren't necessarily as use-case directed as what you'd see in Data Cookbooks).

    What Is A Cumulus Operator

    Cumulus operators are those who work within Cumulus to ingest/archive data and manage collections. They may perform the following functions via the operator dashboard or API:

    • Configure providers and collections
    • Configure rules and monitor workflow executions
    • Monitor granule ingestion
    • Monitor system metrics
diff --git a/docs/v10.0.0/operator-docs/bulk-operations/index.html b/docs/v10.0.0/operator-docs/bulk-operations/index.html
    Version: v10.0.0

    Bulk Operations

    Cumulus implements bulk operations through the use of AsyncOperations, which are long-running processes executed on an AWS ECS cluster.

    Submitting a bulk API request

    Bulk operations are generally submitted via the endpoint for the relevant data type, e.g. granules. For a list of supported API requests, refer to the Cumulus API documentation. Bulk operations are denoted with the keyword 'bulk'.
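As a minimal sketch, a bulk granule operation can also be submitted directly to the API's /granules/bulk endpoint; the host, token, workflow name, index, and query below are placeholders, and the payload mirrors the query, index, and workflowName fields described in the dashboard steps that follow:

$ curl --request POST https://example.com/granules/bulk \
  --header 'Authorization: Bearer ReplaceWithTheToken' \
  --header 'Content-Type: application/json' \
  --data '{
    "workflowName": "MyBulkWorkflow",
    "index": "my-granules-index",
    "query": {
      "query": { "match": { "collectionId": "MOD09GQ___006" } },
      "size": 100
    }
  }'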

    Starting bulk operations from the Cumulus dashboard

    Using a Kibana query

    Note: You must have configured your dashboard build with a KIBANAROOT environment variable in order for the Kibana link to render in the bulk granules modal

    1. From the Granules dashboard page, click on the "Run Bulk Granules" button, then select what type of action you would like to perform

      • Note: the rest of the process is the same regardless of what type of bulk action you perform
    2. From the bulk granules modal, click the "Open Kibana" link:

      Screenshot of Cumulus dashboard showing modal window for triggering bulk granule operations

    3. Once you have accessed Kibana, navigate to the "Discover" page. If this is your first time using Kibana, you may see a message like this at the top of the page:

      In order to visualize and explore data in Kibana, you'll need to create an index pattern to retrieve data from Elasticsearch.

      In that case, see the docs for creating an index pattern for Kibana

      Screenshot of Kibana user interface showing the &quot;Discover&quot; page for running queries

    4. Enter a query that returns the granule records that you want to use for bulk operations:

      Screenshot of Kibana user interface showing an example Kibana query and results

    5. Once the Kibana query is returning the results you want, click the "Inspect" link near the top of the page. A slide out tab with request details will appear on the right side of the page:

      Screenshot of Kibana user interface showing details of an example request

    6. In the slide out tab that appears on the right side of the page, click the "Request" link near the top and scroll down until you see the query property:

      Screenshot of Kibana user interface showing the Elasticsearch data request made for a given Kibana query

    7. Highlight and copy the query contents from Kibana. Go back to the Cumulus dashboard and paste the query contents from Kibana inside of the query property in the bulk granules request payload. It is expected that you should have a property of query nested inside of the existing query property:

      Screenshot of Cumulus dashboard showing modal window for triggering bulk granule operations with query information populated

    8. Add values for the index and workflowName to the bulk granules request payload. The value for index will vary based on your Elasticsearch setup, but it is good to target an index specifically for granule data if possible:

      Screenshot of Cumulus dashboard showing modal window for triggering bulk granule operations with query, index, and workflow information populated

    9. Click the "Run Bulk Operations" button. You should see a confirmation message, including an ID for the async operation that was started to handle your bulk action. You can track the status of this async operation on the Operations dashboard page, which can be visited by clicking the "Go To Operations" button:

      Screenshot of Cumulus dashboard showing confirmation message with async operation ID for bulk granules request

    Creating an index pattern for Kibana

    1. Define the index pattern for the indices that your Kibana queries should use. A wildcard character, *, will match across multiple indices. Once you are satisfied with your index pattern, click the "Next step" button:

      Screenshot of Kibana user interface for defining an index pattern

    2. Choose whether to use a Time Filter for your data, which is not required. Then click the "Create index pattern" button:

      Screenshot of Kibana user interface for configuring the settings of an index pattern

    Status Tracking

    All bulk operations return an AsyncOperationId which can be submitted to the /asyncOperations endpoint.

    The /asyncOperations endpoint allows listing of AsyncOperation records as well as record retrieval for individual records, which will contain the status. The Cumulus API documentation shows sample requests for these actions.

    The Cumulus Dashboard also includes an Operations monitoring page, where operations and their status are visible:

    Screenshot of Cumulus Dashboard Operations Page showing 5 operations and their status, ID, description, type and creation timestamp

diff --git a/docs/v10.0.0/operator-docs/cmr-operations/index.html b/docs/v10.0.0/operator-docs/cmr-operations/index.html

CMR Operations (excerpt): … UpdateCmrAccessConstraints will update CMR metadata file contents on S3, and PostToCmr will push the updates to CMR. The rest of this section will assume you have created this workflow under the name UpdateCmrAccessConstraints.

Once created and deployed, the workflow is available in the Cumulus dashboard's Execute workflow selector. However, note that additional configuration is required for this request: you must supply an access constraint integer value and an optional description to the UpdateCmrAccessConstraints workflow by clicking the Add Custom Workflow Meta option in the Execute popup, as shown below:

    Screenshot showing granule execute popup with &#39;updateCmrAccessConstraints&#39; selected and configuration values shown in a collapsible JSON field

    An example invocation of the API to perform this action is:

$ curl --request PUT https://example.com/granules/MOD11A1.A2017137.h19v16.006.2017138085750 \
  --header 'Authorization: Bearer ReplaceWithTheToken' \
  --header 'Content-Type: application/json' \
  --data '{
    "action": "applyWorkflow",
    "workflow": "updateCmrAccessConstraints",
    "meta": {
      "accessConstraints": {
        "value": 5,
        "description": "sample access constraint"
      }
    }
  }'

    Supported CMR metadata formats for the above operation are Echo10XML and UMMG-JSON, which will populate the RestrictionFlag and RestrictionComment fields in Echo10XML, or the AccessConstraints values in UMMG-JSON.

    Additional Operations

    At this time Cumulus does not, out of the box, support additional operations on CMR metadata. However, given the examples shown above, we recommend working with your integrators to develop additional workflows that perform any required operations.

    Bulk CMR operations

    In order to perform the above operations in bulk, Cumulus supports the use of ApplyWorkflow in an AsyncOperation. These are accessed via the Bulk Operation button on the dashboard, or the /granules/bulk endpoint on the Cumulus API.

    More information on bulk operations are in the bulk operations operator doc.

diff --git a/docs/v10.0.0/operator-docs/create-rule-in-cumulus/index.html b/docs/v10.0.0/operator-docs/create-rule-in-cumulus/index.html
    Version: v10.0.0

    Create Rule In Cumulus

    Once the above files are in place and the entries created in CMR and Cumulus, we are ready to begin ingesting data. Depending on the type of ingestion (FTP/Kinesis, etc) the values below will change, but for the most part they are all similar. Rules tell Cumulus how to associate providers and collections, and when/how to start processing a workflow.

    Steps

    1. Go To Rules Page
    • Go to the Cumulus dashboard, click on Rules in the navigation.
    • Click Add Rule.

    Screenshot of Rules page

2. Complete Form
    • Fill out the template form.

    Screenshot of a Rules template for adding a new rule

    For more details regarding the field definitions and required information go to Data Cookbooks.

    Note: If the state field is left blank, it defaults to false.

    Examples

    • A rule form with completed required fields:

    Screenshot of a completed rule form

    • A successfully added Rule:

    Screenshot of created rule
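For operators who prefer the API, the following is a hypothetical sketch of creating a similar one-time rule via the Cumulus API's /rules endpoint; the host, token, and all field values are placeholders, and the exact fields required will depend on your providers, collections, and workflows:

$ curl --request POST https://example.com/rules \
  --header 'Authorization: Bearer ReplaceWithTheToken' \
  --header 'Content-Type: application/json' \
  --data '{
    "name": "my_onetime_ingest_rule",
    "workflow": "DiscoverGranules",
    "provider": "my-provider-id",
    "collection": { "name": "MOD09GQ", "version": "006" },
    "rule": { "type": "onetime" },
    "state": "ENABLED"
  }'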

diff --git a/docs/v10.0.0/operator-docs/discovery-filtering/index.html b/docs/v10.0.0/operator-docs/discovery-filtering/index.html

Discovery Filtering (excerpt): … directly list the provider_path. If the path contains regular expression components, this may fail.

    It is recommended that operators diagnose any failures by checking error logs and ensuring that permissions on the remote file system allow reading of the default directory and any subdirectories that match the filter.

    Supported protocols

Currently, support for this feature is limited to the following protocols:

    • ftp
    • sftp
diff --git a/docs/v10.0.0/operator-docs/granule-workflows/index.html b/docs/v10.0.0/operator-docs/granule-workflows/index.html
    Version: v10.0.0

    Granule Workflows

    Failed Granule

    Delete and Ingest

    1. Delete Granule

    Note: Granules published to CMR will need to be removed from CMR via the dashboard prior to deletion

2. Ingest Granule via Ingest Rule
• Re-triggering a one-time, Kinesis, SQS, or SNS rule, or a scheduled rule, will re-discover and reingest the deleted granule.

    Reingest

    1. Select Failed Granule
    • In the Cumulus dashboard, go to the Collections page.
    • Use search field to find the granule.
2. Re-ingest Granule
    • Go to the Collections page.
    • Click on Reingest and a modal will pop up for your confirmation.

    Screenshot of the Reingest modal workflow

    Delete and Ingest

    1. Bulk Delete Granules
    • Go to the Granules page.
    • Use the Bulk Delete button to bulk delete selected granules or select via a Kibana query

    Note: You can optionally force deletion from CMR

2. Ingest Granules via Ingest Rule
• Re-triggering one-time, Kinesis, SQS, or SNS rules, or scheduled rules, will re-discover and reingest the deleted granules.

    Multiple Failed Granules

    1. Select Failed Granules
    • In the Cumulus dashboard, go to the Collections page.
    • Click on Failed Granules.
    • Select multiple granules.

    Screenshot of selected multiple granules

2. Bulk Re-ingest Granules
    • Click on Reingest and a modal will pop up for your confirmation.

    Screenshot of Bulk Reingest modal workflow

diff --git a/docs/v10.0.0/operator-docs/kinesis-stream-for-ingest/index.html b/docs/v10.0.0/operator-docs/kinesis-stream-for-ingest/index.html
    Version: v10.0.0

    Setup Kinesis Stream & CNM Message

Note: Keep in mind that you should only have to set this up once per ingest stream. Kinesis pricing is based on the shard value and not on the amount of Kinesis usage.

    1. Create a Kinesis Stream

      • In your AWS console, go to the Kinesis service and click Create Data Stream.
      • Assign a name to the stream.
      • Apply a shard value of 1.
      • Click on Create Kinesis Stream.
• A status page with stream details displays. Once the status is active, the stream is ready to use. Be sure to record the streamName and StreamARN for later use.

      Screenshot of AWS console page for creating a Kinesis stream

    2. Create a Rule

    3. Send a message

  • Send a message that matches your schema using Python or the command line (see the sketch after these steps).
  • The streamName and Collection must match the kinesisArn+collection defined in the rule that you created in Step 2.
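A minimal command-line sketch for publishing a test message follows; the stream name, partition key, and message file are placeholders, and the message contents must conform to the CNM schema and reference the collection configured in your rule:

# Publish the contents of a local CNM message file to the Kinesis stream
# (--cli-binary-format is needed with AWS CLI v2 so the JSON is sent as-is)
aws kinesis put-record \
  --stream-name my-ingest-stream \
  --partition-key my-partition-key \
  --cli-binary-format raw-in-base64-out \
  --data file://cnm-message.json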
diff --git a/docs/v10.0.0/operator-docs/locating-access-logs/index.html b/docs/v10.0.0/operator-docs/locating-access-logs/index.html
    Version: v10.0.0

    Locating S3 Access Logs

    When enabling S3 Access Logs for EMS Reporting you configured a TargetBucket and TargetPrefix. Inside the TargetBucket at the TargetPrefix is where you will find the raw S3 access logs.

    In a standard deployment, this will be your stack's <internal bucket name> and a key prefix of <stack>/ems-distribution/s3-server-access-logs/
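For example, a minimal sketch of listing the raw logs with the AWS CLI, where the bucket name and stack prefix are placeholders for your deployment's values:

aws s3 ls s3://my-internal-bucket/my-stack/ems-distribution/s3-server-access-logs/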

diff --git a/docs/v10.0.0/operator-docs/naming-executions/index.html b/docs/v10.0.0/operator-docs/naming-executions/index.html

Naming Executions (excerpt): … QueuePdrs step.

    In the following excerpt, the QueueGranules config.executionNamePrefix property is set using the value configured in the workflow's meta.executionNamePrefix.

    Please note: This meta.executionNamePrefix property should not be confused with the optional rule executionNamePrefix property from the previous section. Setting executionNamePrefix as a root property of the rule will set a prefix for the names of any workflows triggered by the rule. Setting meta.executionNamePrefix on the rule will set meta.executionNamePrefix in the workflow messages generated for this rule, allowing workflow steps like QueueGranules to read from the message meta.executionNamePrefix for their config. Then, workflows scheduled by QueueGranules would use the configured execution name prefix.

    Setting executionNamePrefix config for QueueGranules using rule.meta

    If you wanted to use a prefix of "my-prefix", you would create a rule with a meta property similar to the following Rule snippet:

{
  ...other rule keys here...
  "meta": {
    "executionNamePrefix": "my-prefix"
  }
}

    The value of meta.executionNamePrefix from the rule will be set as meta.executionNamePrefix in the workflow message.

    Then, the workflow could contain a "QueueGranules" step with the following state, which uses meta.executionNamePrefix from the message as the value for the executionNamePrefix config to the "QueueGranules" step:

{
  "QueueGranules": {
    "Parameters": {
      "cma": {
        "event.$": "$",
        "ReplaceConfig": {
          "FullMessage": true
        },
        "task_config": {
          "queueUrl": "${start_sf_queue_url}",
          "provider": "{$.meta.provider}",
          "internalBucket": "{$.meta.buckets.internal.name}",
          "stackName": "{$.meta.stack}",
          "granuleIngestWorkflow": "${ingest_granule_workflow_name}",
          "executionNamePrefix": "{$.meta.executionNamePrefix}"
        }
      }
    },
    "Type": "Task",
    "Resource": "${queue_granules_task_arn}",
    "Retry": [
      {
        "ErrorEquals": [
          "Lambda.ServiceException",
          "Lambda.AWSLambdaException",
          "Lambda.SdkClientException"
        ],
        "IntervalSeconds": 2,
        "MaxAttempts": 6,
        "BackoffRate": 2
      }
    ],
    "Catch": [
      {
        "ErrorEquals": [
          "States.ALL"
        ],
        "ResultPath": "$.exception",
        "Next": "WorkflowFailed"
      }
    ],
    "End": true
  }
}
diff --git a/docs/v10.0.0/operator-docs/ops-common-use-cases/index.html b/docs/v10.0.0/operator-docs/ops-common-use-cases/index.html

diff --git a/docs/v10.0.0/operator-docs/trigger-workflow/index.html b/docs/v10.0.0/operator-docs/trigger-workflow/index.html
    Version: v10.0.0

    Trigger a Workflow Execution

    To trigger a workflow, you need to create a rule. To trigger an ingest workflow, one that requires discovering and ingesting data, you will also need to configure the collection and provider and associate those to a rule.

    Trigger a HelloWorld Workflow

    To trigger a HelloWorld workflow that does not need to discover or archive data, you just need to create a rule.

You can leave the provider and collection blank and do not need any additional metadata. If you create a onetime rule, the workflow execution will start shortly and you can view its status on the Executions page.

    Trigger an Ingest Workflow

    To ingest data, you will need a provider and collection configured to tell your workflow where to discover data and where to archive the data respectively.

    Follow the instructions to create a provider and create a collection and configure their fields for your data ingest.

    In the rule's additional metadata you can specify a provider_path from which to get the data from the provider.

    Example: Ingest data from S3

    Setup

    Assume there are 2 files to be ingested in an S3 bucket called discovery-bucket, located in the test-data folder:

    • GRANULE.A2017025.jpg
    • GRANULE.A2017025.hdf

    Archive buckets should already be created and mapped to public / private / protected in the Cumulus deployment.

    For example:

buckets = {
  private = {
    name = "discovery-bucket"
    type = "private"
  },
  protected = {
    name = "archive-protected"
    type = "protected"
  }
  public = {
    name = "archive-public"
    type = "public"
  }
}

    Create a provider

    Create a new provider. Set protocol to S3 and Host to discovery-bucket.

    Screenshot of adding a sample S3 provider

    Create a collection

    Create a new collection. Configure the collection to extract the granule id from the filenames and configure where to store the granule files.

The configuration below will store hdf files in the protected bucket and jpg files in the public bucket. The bucket types correspond to the types defined in the buckets configuration shown above.

{
  "name": "test-collection",
  "version": "001",
  "granuleId": "^GRANULE\\.A[\\d]{7}$",
  "granuleIdExtraction": "(GRANULE\\..*)(\\.hdf|\\.jpg)",
  "reportToEms": false,
  "sampleFileName": "GRANULE.A2017025.hdf",
  "files": [
    {
      "bucket": "protected",
      "regex": "^GRANULE\\.A[\\d]{7}\\.hdf$",
      "sampleFileName": "GRANULE.A2017025.hdf"
    },
    {
      "bucket": "public",
      "regex": "^GRANULE\\.A[\\d]{7}\\.jpg$",
      "sampleFileName": "GRANULE.A2017025.jpg"
    }
  ]
}

    Create a rule

    Create a rule to trigger the workflow to discover your granule data and ingest your granule.

    Select the previously created provider and collection. See the Cumulus Discover Granules workflow for a workflow example of using Cumulus tasks to discover and queue data for ingest.

    In the rule meta, set the provider_path to test-data, so the test-data folder will be used to discover new granules.

    Screenshot of adding a Discover Granules rule

    A onetime rule will run your workflow on-demand and you can view it on the dashboard Executions page. The Cumulus Discover Granules workflow will trigger an ingest workflow and your ingested granules will be visible on the dashboard Granules page.

diff --git a/docs/v10.0.0/tasks/index.html b/docs/v10.0.0/tasks/index.html
    Version: v10.0.0

    Cumulus Tasks

    A list of reusable Cumulus tasks. Add your own.

    Tasks

    @cumulus/add-missing-file-checksums

    Add checksums to files in S3 which don't have one


    @cumulus/discover-granules

    Discover Granules in FTP/HTTP/HTTPS/SFTP/S3 endpoints


    @cumulus/discover-pdrs

    Discover PDRs in FTP and HTTP endpoints


    @cumulus/files-to-granules

    Converts array-of-files input into a granules object by extracting granuleId from filename


    @cumulus/hello-world

    Example task


    @cumulus/hyrax-metadata-updates

    Update granule metadata with hooks to OPeNDAP URL


    @cumulus/lzards-backup

    Run LZARDS backup


    @cumulus/move-granules

    Move granule files from staging to final location


    @cumulus/parse-pdr

    Download and Parse a given PDR


    @cumulus/pdr-status-check

    Checks execution status of granules in a PDR


    @cumulus/post-to-cmr

    Post a given granule to CMR


    @cumulus/queue-granules

    Add discovered granules to the queue


    @cumulus/queue-pdrs

    Add discovered PDRs to a queue


    @cumulus/queue-workflow

    Add workflow to the queue


    @cumulus/sf-sqs-report

    Sends an incoming Cumulus message to SQS


    @cumulus/sync-granule

    Download a given granule


    @cumulus/test-processing

    Fake processing task used for integration tests


    @cumulus/update-cmr-access-constraints

    Updates CMR metadata to set access constraints


@cumulus/update-granules-cmr-metadata-file-links

Update CMR metadata files with correct online access urls and etags and transfer etag info to granules' CMR files

diff --git a/docs/v10.0.0/team/index.html b/docs/v10.0.0/team/index.html

diff --git a/docs/v10.0.0/troubleshooting/index.html b/docs/v10.0.0/troubleshooting/index.html
    Version: v10.0.0

    How to Troubleshoot and Fix Issues

    While Cumulus is a complex system, there is a focus on maintaining the integrity and availability of the system and data. Should you encounter errors or issues while using this system, this section will help troubleshoot and solve those issues.

    Backup and Restore

    Cumulus has backup and restore functionality built-in to protect Cumulus data and allow recovery of a Cumulus stack. This is currently limited to Cumulus data and not full S3 archive data. Backup and restore is not enabled by default and must be enabled and configured to take advantage of this feature.

    For more information, read the Backup and Restore documentation.

    Elasticsearch reindexing

    If you run into issues with your Elasticsearch index, a reindex operation is available via the Cumulus API. See the Reindexing Guide.

    Information on how to reindex Elasticsearch is in the Cumulus API documentation.

    Troubleshooting Workflows

Workflows are state machines composed of tasks and services, and each component logs to CloudWatch. The CloudWatch logs for all steps in an execution are displayed in the Cumulus dashboard, or you can find them by going to CloudWatch and navigating to the logs for that particular task.

    Workflow Errors

    Visual representations of executed workflows can be found in the Cumulus dashboard or the AWS Step Functions console for that particular execution.

    If a workflow errors, the error will be handled according to the error handling configuration. The task that fails will have the exception field populated in the output, giving information about the error. Further information can be found in the CloudWatch logs for the task.

    Graph of AWS Step Function execution showing a failing workflow

    Workflow Did Not Start

    Generally, first check your rule configuration. If that is satisfactory, the answer will likely be in the CloudWatch logs for the schedule SF or SF starter lambda functions. See the workflow triggers page for more information on how workflows start.

For Kinesis and SNS rules specifically, if an error occurs during the message consumer process, the fallback consumer lambda will be called; if the message continues to error, it will be placed on the dead letter queue. Check the dead letter queue for a failure message. Errors can be traced back to the CloudWatch logs for the message consumer and the fallback consumer. Additionally, check that the collection name and version match those configured in your rule, as rules are filtered by the notification's collection name and version before executions are scheduled.

More information on Kinesis error handling is available here.

    Operator API Errors

    All operator API calls are funneled through the ApiEndpoints lambda. Each API call is logged to the ApiEndpoints CloudWatch log for your deployment.
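
    To tail that log from the command line, a minimal sketch (assuming AWS CLI v2 and the conventional <prefix>-ApiEndpoints function name; substitute your deployment prefix):

    # Follow the operator API log; the log group name is an assumption based on
    # the standard <prefix>-ApiEndpoints lambda naming
    aws logs tail "/aws/lambda/<prefix>-ApiEndpoints" --follow --since 1h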

    Lambda Errors

    KMS Exception: AccessDeniedException

    KMS Exception: AccessDeniedExceptionKMS Message: The ciphertext refers to a customer master key that does not exist, does not exist in this region, or you are not allowed to access.

The above error has been observed when invoking Cumulus Lambda functions. The KMS key is the encryption key used to encrypt Lambda environment variables. The root cause of this error is unknown, but it is suspected to be caused by deleting and recreating, with the same name, the IAM role the Lambda uses.

    This error can be resolved by switching the lambda's execution role to a different one and then back through the Lambda management console. Unfortunately, this approach doesn't scale well.
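
    The same role swap can also be scripted with the AWS CLI, which may help if several lambdas are affected. A minimal sketch, where the function name and role ARNs are placeholders for values in your own account:

    # Temporarily switch the lambda to a different (existing) execution role...
    aws lambda update-function-configuration \
      --function-name "<prefix>-SomeAffectedLambda" \
      --role "arn:aws:iam::123456789012:role/some-other-existing-role"

    # ...then switch it back to its original role
    aws lambda update-function-configuration \
      --function-name "<prefix>-SomeAffectedLambda" \
      --role "arn:aws:iam::123456789012:role/original-lambda-role"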

    The other resolution (that scales but takes some time) that was found is as follows:

    1. Comment out all lambda definitions (and dependent resources) in your Terraform configuration.
    2. terraform apply to delete the lambdas.
    3. Un-comment the definitions.
    4. terraform apply to recreate the lambdas.

If this problem occurs with Core lambdas and you are using the terraform-aws-cumulus.zip file source distributed in our release, we recommend the non-scaling approach, as the number of lambdas we distribute is in the low teens and they are likely to be easier and faster to reconfigure one-by-one than by editing our configs.

    Error: Unable to import module 'index': Error

    This error is shown in the CloudWatch logs for a Lambda function.

    One possible cause is that the Lambda definition in the .tf file defining the lambda is not pointing to the correct packaged lambda source file. In order to resolve this issue, update the lambda definition to point directly to the packaged (e.g. .zip) lambda source file.

    resource "aws_lambda_function" "discover_granules_task" {
    function_name = "${var.prefix}-DiscoverGranules"
    filename = "${path.module}/../../tasks/discover-granules/dist/lambda.zip"
    handler = "index.handler"
    }

    If you are seeing this error when using the Lambda as a step in a Cumulus workflow, then inspect the output for this Lambda step in the AWS Step Function console. If you see the error Cannot find module 'node_modules/@cumulus/cumulus-message-adapter-js', then you need to ensure the lambda's packaged dependencies include cumulus-message-adapter-js.
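
    A quick way to check whether the dependency made it into the packaged lambda is to list the archive contents. A minimal sketch, assuming a hypothetical path to your built zip:

    # List the packaged lambda contents and look for the CMA client library
    unzip -l path/to/your/lambda.zip | grep cumulus-message-adapter-js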

Reindexing Elasticsearch Guide

    ...current index, or the mappings for an index have been updated (they do not update automatically). Any reindexing that will be required when upgrading Cumulus will be in the Migration Steps section of the changelog.

    Switch to a new index and Reindex

    There are two operations needed: reindex and change-index to switch over to the new index. A Change Index/Reindex can be done in either order, but both have their trade-offs.

If you decide to point Cumulus to a new (empty) index first (with a change index operation), and then reindex the data to the new index, data ingested while reindexing will automatically be sent to the new index. As reindexing operations can take a while, not all the data will show up on the Cumulus Dashboard right away. The advantage is you do not have to turn off any ingest operations. This way is recommended.

    If you decide to Reindex data to a new index first, and then point Cumulus to that new index, it is not guaranteed that data that is sent to the old index while reindexing will show up in the new index. If you prefer this way, it is recommended to turn off any ingest operations. This order will keep your dashboard data from seeing any interruption.

    Change Index

    This will point Cumulus to the index in Elasticsearch that will be used when retrieving data. Performing a change index operation to an index that does not exist yet will create the index for you. The change index operation can be found here.
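
    As a rough illustration, the operation can be invoked with an authenticated HTTP request to the Cumulus API. The endpoint path and body fields below are assumptions to confirm against the Cumulus API documentation for your version; $CUMULUS_API is your archive API root URL and $TOKEN a valid access token:

    # Point Cumulus at a new index (created automatically if it does not exist)
    curl -X POST "$CUMULUS_API/elasticsearch/change-index" \
      -H "Authorization: Bearer $TOKEN" \
      -H "Content-Type: application/json" \
      -d '{"currentIndex": "cumulus-2020-11-3", "newIndex": "cumulus-2021-3-4"}'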

    Reindex from the old index to the new index

The reindex operation will take the data from one index and copy it into another index. The reindex operation can be found here.
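
    A comparable hedged sketch for triggering the reindex via the Cumulus API, again using assumed endpoint paths and the $CUMULUS_API/$TOKEN variables introduced above:

    # Kick off a reindex; optional source/destination index names can be supplied
    # in the request body (field names should be confirmed in the API docs)
    curl -X POST "$CUMULUS_API/elasticsearch/reindex" \
      -H "Authorization: Bearer $TOKEN" \
      -H "Content-Type: application/json" \
      -d '{}'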

    Reindex status

    Reindexing is a long-running operation. The reindex-status endpoint can be used to monitor the progress of the operation.
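
    For example, a status poll might look like the following (endpoint path assumed, as above):

    # Poll reindex progress and index stats
    curl -H "Authorization: Bearer $TOKEN" "$CUMULUS_API/elasticsearch/reindex-status"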

    Index from database

    If you want to just grab the data straight from the database you can perform an Index from Database Operation. After the data is indexed from the database, a Change Index operation will need to be performed to ensure Cumulus is pointing to the right index. It is strongly recommended to turn off workflow rules when performing this operation so any data ingested to the database is not lost.
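
    A hedged sketch of that call, with the endpoint path and the optional target index field treated as assumptions to verify against the Cumulus API documentation:

    # Rebuild an index directly from the database records
    curl -X POST "$CUMULUS_API/elasticsearch/index-from-database" \
      -H "Authorization: Bearer $TOKEN" \
      -H "Content-Type: application/json" \
      -d '{"indexName": "cumulus-2021-3-4"}'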

    Validate reindex

    To validate the reindex, use the reindex-status endpoint. The doc count can be used to verify that the reindex was successful. In the below example the reindex from cumulus-2020-11-3 to cumulus-2021-3-4 was not fully successful as they show different doc counts.

    "indices": {
    "cumulus-2020-11-3": {
    "primaries": {
    "docs": {
    "count": 21096512,
    "deleted": 176895
    }
    },
    "total": {
    "docs": {
    "count": 21096512,
    "deleted": 176895
    }
    }
    },
    "cumulus-2021-3-4": {
    "primaries": {
    "docs": {
    "count": 715949,
    "deleted": 140191
    }
    },
    "total": {
    "docs": {
    "count": 715949,
    "deleted": 140191
    }
    }
    }
    }

    To further drill down into what is missing, log in to the Kibana instance (found in the Elasticsearch section of the AWS console) and run the following command replacing <index> with your index name.

    GET <index>/_search
    {
    "aggs": {
    "count_by_type": {
    "terms": {
    "field": "_type"
    }
    }
    },
    "size": 0
    }

    which will produce a result like

    "aggregations": {
    "count_by_type": {
    "doc_count_error_upper_bound": 0,
    "sum_other_doc_count": 0,
    "buckets": [
    {
    "key": "logs",
    "doc_count": 483955
    },
    {
    "key": "execution",
    "doc_count": 4966
    },
    {
    "key": "deletedgranule",
    "doc_count": 4715
    },
    {
    "key": "pdr",
    "doc_count": 1822
    },
    {
    "key": "granule",
    "doc_count": 740
    },
    {
    "key": "asyncOperation",
    "doc_count": 616
    },
    {
    "key": "provider",
    "doc_count": 108
    },
    {
    "key": "collection",
    "doc_count": 87
    },
    {
    "key": "reconciliationReport",
    "doc_count": 48
    },
    {
    "key": "rule",
    "doc_count": 7
    }
    ]
    }
    }

    Resuming a reindex

If a reindex operation did not fully complete, it can be resumed using the following command, run from the Kibana instance.

    POST _reindex?wait_for_completion=false
    {
    "conflicts": "proceed",
    "source": {
    "index": "cumulus-2020-11-3"
    },
    "dest": {
    "index": "cumulus-2021-3-4",
    "op_type": "create"
    }
    }

    The Cumulus API reindex-status endpoint can be used to monitor completion of this operation.

    Version: v10.0.0

    Re-running workflow executions

    To re-run a Cumulus workflow execution from the AWS console:

    1. Visit the page for an individual workflow execution

    2. Click the "New execution" button at the top right of the screen

      Screenshot of the AWS console for a Step Function execution highlighting the &quot;New execution&quot; button at the top right of the screen

    3. In the "New execution" modal that appears, replace the cumulus_meta.execution_name value in the default input with the value of the new execution ID as seen in the screenshot below

      Screenshot of the AWS console showing the modal window for entering input when running a new Step Function execution

    4. Click the "Start execution" button
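
    The same re-run can be scripted with the AWS CLI instead of the console. A minimal sketch, with placeholder ARNs and names, that copies the old execution's input, sets a fresh cumulus_meta.execution_name, and starts a new execution (requires jq):

    NEW_NAME="MyExecution-rerun-$(date +%s)"

    # Grab the input of the execution to re-run
    OLD_INPUT=$(aws stepfunctions describe-execution \
      --execution-arn "arn:aws:states:us-east-1:111122223333:execution:MySfn:MyExecution" \
      --query 'input' --output text)

    # Start a new execution with the execution name replaced
    aws stepfunctions start-execution \
      --state-machine-arn "arn:aws:states:us-east-1:111122223333:stateMachine:MySfn" \
      --name "$NEW_NAME" \
      --input "$(echo "$OLD_INPUT" | jq --arg n "$NEW_NAME" '.cumulus_meta.execution_name = $n')"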

Troubleshooting Deployment

    ...data-persistence modules, but your config is only creating one Elasticsearch instance. To fix the issue, update the elasticsearch_config variable for your data-persistence module to increase the number of instances:

    {
    domain_name = "es"
    instance_count = 2
    instance_type = "t2.small.elasticsearch"
    version = "5.3"
    volume_size = 10
    }

    Install dashboard

    Dashboard configuration

    Issues:

• Problem clearing the cache: EACCES: permission denied, rmdir '/tmp/gulp-cache/default'. This probably means the files at that location, and/or the folder, are owned by someone else (or some other factor prevents you from writing there).

It's possible to work around this by editing the file cumulus-dashboard/node_modules/gulp-cache/index.js and altering the value of the line var fileCache = new Cache({cacheDirName: 'gulp-cache'}); to something like var fileCache = new Cache({cacheDirName: '<prefix>-cache'});. Now gulp-cache will be able to write to /tmp/<prefix>-cache/default, and the error should resolve.

    Dashboard deployment

    Issues:

• If the dashboard sends you to an Earthdata Login page that shows the error "Invalid request, please verify the client status or redirect_uri before resubmitting", this means you've either forgotten to update one or more of your EARTHDATA_CLIENT_ID and EARTHDATA_CLIENT_PASSWORD environment variables (from your app/.env file) and re-deploy Cumulus, you haven't placed the correct values in them, or you've forgotten to add both the "redirect" and "token" URLs to the Earthdata Application.
    • There is odd caching behavior associated with the dashboard and Earthdata Login at this point in time that can cause the above error to reappear on the Earthdata Login page loaded by the dashboard even after fixing the cause of the error. If you experience this, attempt to access the dashboard in a new browser window, and it should work.
    Version: v10.0.0

    Migrate from TEA deployment to Cumulus Distribution

    Background

    The Cumulus Distribution API is configured to use the AWS Cognito OAuth client. This API can be used instead of the Thin Egress App, which is the default distribution API if using the Deployment Template.

    Configuring a Cumulus Distribution deployment

    See these instructions for deploying the Cumulus Distribution API.

    Important note if migrating from TEA to Cumulus Distribution

    If you already have a deployment using the TEA distribution and want to switch to Cumulus Distribution, there will be an API Gateway change. This means that there will be downtime while you update your CloudFront endpoint to use the new API gateway.

    Version: v10.0.0

    Migrate TEA deployment to standalone module

    Background

    This document is only relevant for upgrades of Cumulus from versions < 3.x.x to versions > 3.x.x

Previous versions of Cumulus included deployment of the Thin Egress App (TEA) by default in the distribution module. As a result, Cumulus users who wanted to deploy a new version of TEA had to wait for a new release of Cumulus that incorporated that version.

In order to give Cumulus users the flexibility to deploy newer versions of TEA whenever they want, deployment of TEA has been removed from the distribution module and Cumulus users must now add the TEA module to their deployment. Guidance on integrating the TEA module into your deployment is provided, or you can refer to the Cumulus core example deployment code for the thin_egress_app module.

By default, when upgrading Cumulus and moving from TEA deployed via the distribution module to TEA deployed as a separate module, your API gateway for TEA would be destroyed and re-created, which could cause outages for any CloudFront endpoints pointing at that API gateway.

    These instructions outline how to modify your state to preserve your existing Thin Egress App (TEA) API gateway when upgrading Cumulus and moving deployment of TEA to a standalone module. If you do not care about preserving your API gateway for TEA when upgrading your Cumulus deployment, you can skip these instructions.

    Prerequisites

    Notes about state management

    These instructions will involve manipulating your Terraform state via terraform state mv commands. These operations are extremely dangerous, since a mistake in editing your Terraform state can leave your stack in a corrupted state where deployment may be impossible or may result in unanticipated resource deletion.

    Since bucket versioning preserves a separate version of your state file each time it is written, and the Terraform state modification commands overwrite the state file, we can mitigate the risk of these operations by downloading the most recent state file before starting the upgrade process. Then, if anything goes wrong during the upgrade, we can restore that previous state version. Guidance on how to perform both operations is provided below.

    Download your most recent state version

    Run this command to download the most recent cumulus deployment state file, replacing BUCKET and KEY with the correct values from cumulus-tf/terraform.tf:

     aws s3 cp s3://BUCKET/KEY /path/to/terraform.tfstate
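
    If you later need an older version of the state file (for the restore step below), the versioned state bucket can be queried directly. A hedged sketch using the same BUCKET/KEY placeholders:

    # List available versions of the state file
    aws s3api list-object-versions --bucket BUCKET --prefix KEY \
      --query 'Versions[].{VersionId:VersionId,LastModified:LastModified,IsLatest:IsLatest}'

    # Download a specific earlier version
    aws s3api get-object --bucket BUCKET --key KEY \
      --version-id SOME_VERSION_ID /path/to/terraform.tfstate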

    Restore a previous state version

    Upload the state file that was previously downloaded to the bucket/key for your state file, replacing BUCKET and KEY with the correct values from cumulus-tf/terraform.tf:

     aws s3 cp /path/to/terraform.tfstate s3://BUCKET/KEY

    Then run terraform plan, which will give an error because we manually overwrote the state file and it is now out of sync with the lock table Terraform uses to track your state file:

    Error: Error loading state: state data in S3 does not have the expected content.

    This may be caused by unusually long delays in S3 processing a previous state
    update. Please wait for a minute or two and try again. If this problem
    persists, and neither S3 nor DynamoDB are experiencing an outage, you may need
    to manually verify the remote state and update the Digest value stored in the
    DynamoDB table to the following value: <some-digest-value>

    To resolve this error, run this command and replace DYNAMO_LOCK_TABLE, BUCKET and KEY with the correct values from cumulus-tf/terraform.tf, and use the digest value from the previous error output:

     aws dynamodb put-item \
    --table-name DYNAMO_LOCK_TABLE \
    --item '{
    "LockID": {"S": "BUCKET/KEY-md5"},
    "Digest": {"S": "some-digest-value"}
    }'

    Now, if you re-run terraform plan, it should work as expected.

    Migration instructions

    Please note: These instructions assume that you are deploying the thin_egress_app module as shown in the Cumulus core example deployment code

    1. Ensure that you have downloaded the latest version of your state file for your cumulus deployment

    2. Find the URL for your <prefix>-thin-egress-app-EgressGateway API gateway. Confirm that you can access it in the browser and that it is functional.

    3. Run terraform plan. You should see output like (edited for readability):

      # module.thin_egress_app.aws_cloudformation_stack.thin_egress_app will be created
      + resource "aws_cloudformation_stack" "thin_egress_app" {

      # module.thin_egress_app.aws_s3_bucket.lambda_source will be created
      + resource "aws_s3_bucket" "lambda_source" {

      # module.thin_egress_app.aws_s3_bucket_object.cloudformation_template will be created
      + resource "aws_s3_bucket_object" "cloudformation_template" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive will be created
      + resource "aws_s3_bucket_object" "lambda_code_dependency_archive" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_source will be created
      + resource "aws_s3_bucket_object" "lambda_source" {

      # module.thin_egress_app.aws_security_group.egress_lambda[0] will be created
      + resource "aws_security_group" "egress_lambda" {

      ...

      # module.cumulus.module.distribution.module.thin_egress_app.aws_cloudformation_stack.thin_egress_app will be destroyed
      - resource "aws_cloudformation_stack" "thin_egress_app" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket.lambda_source will be destroyed
      - resource "aws_s3_bucket" "lambda_source" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.cloudformation_template will be destroyed
      - resource "aws_s3_bucket_object" "cloudformation_template" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive will be destroyed
      - resource "aws_s3_bucket_object" "lambda_code_dependency_archive" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_source will be destroyed
      - resource "aws_s3_bucket_object" "lambda_source" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_security_group.egress_lambda[0] will be destroyed
      - resource "aws_security_group" "egress_lambda" {
    4. Run the state modification commands. The commands must be run in exactly this order:

       # Move security group
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_security_group.egress_lambda module.thin_egress_app.aws_security_group.egress_lambda

      # Move TEA storage bucket
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket.lambda_source module.thin_egress_app.aws_s3_bucket.lambda_source

      # Move TEA lambda source code
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_source module.thin_egress_app.aws_s3_bucket_object.lambda_source

      # Move TEA lambda dependency code
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive

      # Move TEA Cloudformation template
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.cloudformation_template module.thin_egress_app.aws_s3_bucket_object.cloudformation_template

      # Move URS creds secret version
      terraform state mv module.cumulus.module.distribution.aws_secretsmanager_secret_version.thin_egress_urs_creds aws_secretsmanager_secret_version.thin_egress_urs_creds

      # Move URS creds secret
      terraform state mv module.cumulus.module.distribution.aws_secretsmanager_secret.thin_egress_urs_creds aws_secretsmanager_secret.thin_egress_urs_creds

      # Move TEA Cloudformation stack
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_cloudformation_stack.thin_egress_app module.thin_egress_app.aws_cloudformation_stack.thin_egress_app

      Depending on how you were supplying a bucket map to TEA, there may be an additional step. If you were specifying the bucket_map_key variable to the cumulus module to use a custom bucket map, then you can ignore this step and just ensure that the bucket_map_file variable to the TEA module uses that same S3 key. Otherwise, if you were letting Cumulus generate a bucket map for you, then you need to take this step to migrate that bucket map:

      # Move bucket map
      terraform state mv module.cumulus.module.distribution.aws_s3_bucket_object.bucket_map_yaml[0] aws_s3_bucket_object.bucket_map_yaml
    5. Run terraform plan again. You may still see a few additions/modifications pending like below, but you should not see any deletion of Thin Egress App resources pending:

      # module.thin_egress_app.aws_cloudformation_stack.thin_egress_app will be updated in-place
      ~ resource "aws_cloudformation_stack" "thin_egress_app" {

      # module.thin_egress_app.aws_s3_bucket_object.cloudformation_template will be updated in-place
      ~ resource "aws_s3_bucket_object" "cloudformation_template" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive will be updated in-place
      ~ resource "aws_s3_bucket_object" "lambda_code_dependency_archive" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_source will be updated in-place
      ~ resource "aws_s3_bucket_object" "lambda_source" {

      If you still see deletion of module.cumulus.module.distribution.module.thin_egress_app.aws_cloudformation_stack.thin_egress_app pending, then something went wrong and you should restore the previously downloaded state file version and start over from step 1. Otherwise, proceed to step 6.

    6. Once you have confirmed that everything looks as expected, run terraform apply.

    7. Visit the same API gateway from step 1 and confirm that it still works.

    Your TEA deployment has now been migrated to a standalone module, which gives you the ability to upgrade the deployed version of TEA independently of Cumulus releases.

    Version: v10.0.0

    Upgrade to CMA 2.0.2

    Updating a Cumulus Deployment to CMA 2.0.2

    Background

The Cumulus Message Adapter has been updated in release 2.0.2 to no longer utilize the AWS Step Functions API to look up the defined name of a Step Function task for population in meta.workflow_tasks, but to instead use an incrementing integer field.

Additionally, a bugfix was released in the form of v2.0.1/v2.0.2 following the initial 2.0.0 release, so all users should update to release 2.0.2.

The update is not tied to a particular version of Core; however, the update should be done across all task components in order to ensure consistent execution records.

    Changes

    Execution Record Update

This update functionally means that Cumulus tasks/activities using the CMA will now record an entry that looks like the following in meta.workflow_tasks, and more importantly in the tasks column for an execution record:

    Original

          "DiscoverGranules": {
    "name": "jk-tf-DiscoverGranules",
    "version": "$LATEST",
    "arn": "arn:aws:lambda:us-east-1:xxxxx:function:jk-tf-DiscoverGranules"
    },
    "QueueGranules": {
    "name": "jk-tf-QueueGranules",
    "version": "$LATEST",
    "arn": "arn:aws:lambda:us-east-1:xxxx:function:jk-tf-QueueGranules"
    }

    New

          "0": {
    "name": "jk-tf-DiscoverGranules",
    "version": "$LATEST",
    "arn": "arn:aws:lambda:us-east-1:xxxxx:function:jk-tf-DiscoverGranules"
    },
    "1": {
    "name": "jk-tf-QueueGranules",
    "version": "$LATEST",
    "arn": "arn:aws:lambda:us-east-1:xxxx:function:jk-tf-QueueGranules"
    }

    Actions Required

    The following should be done as part of a Cumulus stack update to utilize cumulus message adapter > 2.0.2:

    • Python tasks that utilize cumulus-message-adapter-python should be updated to use > 2.0.0, their lambdas rebuilt and Cumulus workflows reconfigured to use the updated version.

    • Python activities that utilize cumulus-process-py should be rebuilt using > 1.0.0 with updated dependencies, and have their images deployed/Cumulus configured to use the new version.

• The cumulus-message-adapter v2.0.2 lambda layer should be made available in the deployment account, and the Cumulus deployment should be reconfigured to use it (via the cumulus_message_adapter_lambda_layer_version_arn variable in the cumulus module). This should address all Core Node.js tasks that utilize the CMA, and many contributed Node.js/Java components.

    Once the above have been done, redeploy Cumulus to apply the configuration and the updates should be live.
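
    One way to spot-check the result after redeploying is to inspect a task lambda's configured layers. A minimal sketch with a placeholder function name; the layer ARN returned should reference the 2.0.2 CMA layer you deployed:

    # Confirm the task lambda is configured with the expected CMA layer version
    aws lambda get-function-configuration \
      --function-name "<prefix>-DiscoverGranules" \
      --query 'Layers[].Arn'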

    Version: v10.0.0

    Updates to task granule file schemas

    Background

    Most Cumulus workflow tasks expect as input a payload of granule(s) which contain the files for each granule. Most tasks also return this same granule structure as output.

    However, up to this point, there was inconsistency in the schemas for the granule files objects expected by each task. Furthermore, there was no guarantee of consistency between granule files objects as stored in the database and the expectations of any given workflow task.

    Thus, when performing bulk granule operations which pass granules from the database into a Cumulus workflow, it was possible for there to be schema validation failures depending on which task was used to start the workflow and its particular schema.

    In order to rectify this situation, CUMULUS-2388 was filed and addressed to create a common granule files schema between nearly all of the Cumulus tasks (exceptions discussed below) and the Cumulus database. The following documentation explains the manual changes you need to make to your deployment in order to be compatible with the updated files schema.

    Updated files schema

    The updated granule files schema can be found here.

    These former properties were deprecated (with notes about how to derive the same information from the updated schema, if possible):

    • filename - concatenate the bucket and key values with a directory separator (/)
    • name - use fileName property
    • etag - ETags are no longer provided as an individual file property. Instead, a separate etags object mapping S3 URIs to ETag values is provided as output from the following workflow tasks (guidance on how to integrate this output with your workflows is provided in the Upgrading your workflows section below):
      • update-granules-cmr-metadata-file-links
      • hyrax-metadata-updates
    • fileStagingDir - no longer supported
    • url_path - no longer supported
    • duplicate_found - This property is no longer supported, however sync-granule and move-granules now produce a separate granuleDuplicates object as part of their output. The granuleDuplicates object is a map of granules by granule ID which includes the files that encountered duplicates during processing. Guidance on how to integrate granuleDuplicates information into your workflow configuration is provided below.

    Exceptions

    These workflow tasks did not have their schema for granule files updated:

    • discover-granules - no updates
    • queue-granules - no updates
    • parse-pdr - no updates
    • sync-granule - input schema not updated, output schema was updated

The reason that these task schemas were not updated is that all of these tasks start before the files have been ingested to S3; thus, much of the information required by the updated files schema, such as bucket, key, or checksum, is not yet known.

    Bulk granule operations

    Since the input schema for the above tasks was not updated, that means you cannot run bulk granule operations against workflows if they start with any of those tasks. Bulk granule operations work by loading the specified granules from the database and sending them as input to a specified workflow, so if the specified workflow begins with a task whose input schema does not conform to what is coming out of the database, there will be schema errors.

    Upgrading your deployment

    Upgrading your workflows

    For any workflows using the update-granules-cmr-metadata-file-links task before the hyrax-metadata-updates and/or post-to-cmr tasks, update the step definition for update-granules-cmr-metadata-file-links as follows:

        "UpdateGranulesCmrMetadataFileLinksStep": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "buckets": "{$.meta.buckets}",
    "distribution_endpoint": "{$.meta.distribution_endpoint}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.etags}",
    "destination": "{$.meta.file_etags}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    ...more configuration...

    hyrax-metadata-updates

    For any workflows using the hyrax-metadata-updates task before a post-to-cmr task, update the definition of the hyrax-metadata-updates step as follows:

        "HyraxMetadataUpdatesTask": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "bucket": "{$.meta.buckets.internal.name}",
    "stack": "{$.meta.stack}",
    "cmr": "{$.meta.cmr}",
    "launchpad": "{$.meta.launchpad}",
    "etags": "{$.meta.file_etags}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.etags}",
    "destination": "{$.meta.file_etags}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    ...more configuration...

    post-to-cmr

    For any workflows using post-to-cmr task after the update-granules-cmr-metadata-file-links or hyrax-metadata-updates tasks, update the post-to-cmr step definition as follows:

        "CmrStep": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "bucket": "{$.meta.buckets.internal.name}",
    "stack": "{$.meta.stack}",
    "cmr": "{$.meta.cmr}",
    "launchpad": "{$.meta.launchpad}",
    "etags": "{$.meta.file_etags}"
    }
    }
    },
    ...more configuration...

    Example workflow

    For an example workflow integrating all of these changes, please see our example ingest and publish workflow.

    Optional - Integrate granuleDuplicates information

    Please note that the granuleDuplicates output is purely informational and does not have any bearing on the separate configuration for how duplicates should be handled.

    You can include granuleDuplicates output from the sync-granule or move-granules tasks in your workflow messages like so:

        "SyncGranule": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    ...other config...
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.granuleDuplicates}",
    "destination": "{$.meta.sync_granule.granule_duplicates}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    }
    ...more configuration...

The result of this configuration is that the granuleDuplicates output from sync-granule would be placed in meta.sync_granule.granule_duplicates on the workflow message and remain there throughout the rest of the workflow. The same configuration could be replicated for the move-granules task, but be sure to use a different destination in the workflow message for the granuleDuplicates output.

    Updating collection URL path templates

    Collections can specify url_path templates to dynamically generate the final location of files. As part of url_path templates, file object properties can be interpolated to generate the file path. Thus, these url_path templates need to be updated to ensure that they are compatible with the updated files schema and the properties that will actually be available on file objects.

    See the notes on the updated files schema to know which properties are available and which previously existing properties were deprecated.

    As an example, you will want to update any url_path properties in your collections to remove references to file.name and replace them with references to file.fileName like so:

    - "url_path": "{cmrMetadata.CollectionReference.ShortName}___{cmrMetadata.CollectionReference.Version}/{substring(file.name, 0, 3)}",
    + "url_path": "{cmrMetadata.CollectionReference.ShortName}___{cmrMetadata.CollectionReference.Version}/{substring(file.fileName, 0, 3)}",
Upgrade to RDS release

    | Parameter | Type | Description | Default |
    | --- | --- | --- | --- |
    | cutoffSeconds | number | Number of seconds prior to this execution to 'cutoff' reconciliation queries. This allows in-progress/other in-flight operations time to complete and propagate to Elasticsearch/Dynamo/postgres. | 3600 |
    | dbConcurrency | number | Sets max number of parallel collections reports the script will run at a time. | 20 |
    | dbMaxPool | number | Sets the maximum number of connections the database pool has available. Modifying this may result in unexpected failures. | 20 |

    Version: v10.0.0

    Upgrade to TF version 0.13.6

    Background

Cumulus pins its support to a specific version of Terraform (see the deployment documentation). The reason for only supporting one specific Terraform version at a time is to avoid deployment errors that can be caused by deploying to the same target with different Terraform versions.

    Cumulus is upgrading its supported version of Terraform from 0.12.12 to 0.13.6. This document contains instructions on how to perform the upgrade for your deployments.

    Prerequisites

    • Follow the Terraform guidance for what to do before upgrading, notably ensuring that you have no pending changes to your Cumulus deployments before proceeding.
      • You should do a terraform plan to see if you have any pending changes for your deployment (for both the data-persistence-tf and cumulus-tf modules), and if so, run a terraform apply before doing the upgrade to Terraform 0.13.6
    • Review the Terraform v0.13 release notes to prepare for any breaking changes that may affect your custom deployment code. Cumulus' deployment code has already been updated for compatibility with version 0.13.
• Install Terraform version 0.13.6. We recommend using Terraform Version Manager tfenv to manage your installed versions of Terraform, but this is not required.

    Upgrade your deployment code

    Terraform 0.13 does not support some of the syntax from previous Terraform versions, so you need to upgrade your deployment code for compatibility.

    Terraform provides a 0.13upgrade command as part of version 0.13 to handle automatically upgrading your code. Make sure to check out the documentation on batch usage of 0.13upgrade, which will allow you to upgrade all of your Terraform code with one command.

    Run the 0.13upgrade command until you have no more necessary updates to your deployment code.
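
    For reference, a minimal sketch of running the tool against both deployment directories (directory names assume the standard template layout; adjust for your repository):

    # Upgrade each module directory in place; -yes skips the confirmation prompt
    terraform 0.13upgrade -yes data-persistence-tf
    terraform 0.13upgrade -yes cumulus-tf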

    Upgrade your deployment

    1. Ensure that you are running Terraform 0.13.6 by running terraform --version. If you are using tfenv, you can switch versions by running tfenv use 0.13.6.

    2. For the data-persistence-tf and cumulus-tf directories, take the following steps:

      1. Run terraform init --reconfigure. The --reconfigure flag is required, otherwise you might see an error like:

        Error: Failed to decode current backend config

        The backend configuration created by the most recent run of "terraform init"
        could not be decoded: unsupported attribute "lock_table". The configuration
        may have been initialized by an earlier version that used an incompatible
        configuration structure. Run "terraform init -reconfigure" to force
        re-initialization of the backend.
      2. Run terraform apply to perform a deployment.

        WARNING: Even if Terraform says that no resource changes are pending, running the apply using Terraform version 0.13.6 will modify your backend state from version 0.12.12 to version 0.13.6 without requiring approval. Updating the backend state is a necessary part of the version 0.13.6 upgrade, but it is not completely transparent.

Discover Granules

    ...included in a granule's file list. That is, no such filtering based on filename occurs as described above.

    When set on the task configuration, the value applies to all collections during discovery. Otherwise, this property may be set on individual collections.

    Concurrency

    A number property that determines the level of concurrency with which granule duplicate checks are performed when duplicateGranuleHandling is skip or error.

    Limiting concurrency helps to avoid throttling by the AWS Lambda API and helps to avoid encountering account Lambda concurrency limitations.

    We do not recommend increasing this value unless you are seeing Lambda.Timeout errors when discover-granules discovers a large number of granules with skip or error duplicate handling. However, as increasing the concurrency may lead to Lambda API or Lambda concurrency throttling errors, you may wish to consider converting the discover-granules task to an ECS activity, which does not face similar runtime constraints.

    The default value is 3.

    Task Outputs

    This task outputs an assembled array of Cumulus granule objects as the payload for the next task, and returns only the expected payload for the next task.

    Version: v10.0.0

    Files To Granules

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions and more can be found on the auto-generated Cumulus Tasks page.

    Summary

This task utilizes the incoming config.inputGranules and the task input list of S3 URIs, along with the rest of the configuration objects, to sort the list of incoming files into a list of granule objects.

Please note: files passed in without metadata previously defined in config.inputGranules will be added with the following keys:

    • size
    • bucket
    • key
    • fileName

    It is primarily intended to support compatibility with the standard output of a processing task, and convert that output into a granule object accepted as input by the majority of other Cumulus tasks.

    Task Inputs

    Input

    This task expects an incoming input that contains an array of 'staged' S3 URIs to move to their final archive location.

    For the specifics, see the Cumulus Tasks page entry for the schema.

    Configuration

    This task does expect values to be set in the workflow_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Below are expanded descriptions of selected config keys:

    inputGranules

    An array of Cumulus granule objects.

    This object will be used to define metadata values for the move granules task, and is the basis for the updated object that will be added to the output.

    Task Outputs

    This task outputs an assembled array of Cumulus granule objects as the payload for the next task, and returns only the expected payload for the next task.

    Version: v10.0.0

    LZARDS Backup

    The LZARDS backup task takes an array of granules and initiates backup requests to the LZARDS API, which will be handled asynchronously by LZARDS.

    Deployment

    The LZARDS backup task is not automatically deployed with Cumulus. To deploy the task through the Cumulus module, first you must specify a lzards_launchpad_passphrase in your terraform variables (e.g. variables.tf) like so:

    variable "lzards_launchpad_passphrase" {
    type = string
    default = ""
    }

    Then you can specify a value for your lzards_launchpad_passphrase in terraform.tfvars like so:

lzards_launchpad_passphrase = "your-passphrase"

    Lastly, you need to make sure that the lzards_launchpad_passphrase is passed into the Cumulus module (in main.tf) like so:

    lzards_launchpad_passphrase  = var.lzards_launchpad_passphrase

    In short, deploying the LZARDS task requires configuring a passphrase variable and ensuring that your TF configuration passes that variable into the Cumulus module.

Additional terraform configuration for the LZARDS task can be found in the cumulus module's variables.tf file, where the relevant variables are prefixed with lzards_. You can add these variables to your deployment using the same process outlined above for lzards_launchpad_passphrase.

    Task Inputs

    Input

    This task expects an array of granules as input.

    For the specifics, see the Cumulus Tasks page entry for the schema.

    Configuration

    This task does expect values to be set in the workflow_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Task Outputs

    Output

    The LZARDS task outputs a composite object containing:

    • the input granules array, and
    • a backupResults object that describes the results of LZARDS backup attempts.

    For the specifics, see the Cumulus Tasks page entry for the schema.

    Version: v10.0.0

    Move Granules

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions and more can be found on the auto-generated Cumulus Tasks page.

    Summary

    This task utilizes the incoming event.input array of Cumulus granule objects to do the following:

    • Move granules from their 'staging' location to the final location (as configured in the Sync Granules task)

    • Update the event.input object with the new file locations.

• If the granule has an ECHO10/UMM CMR file (.cmr.xml or .cmr.json) included in the event.input:

      • Update that file's access locations

      • Add it to the appropriate access URL category for the CMR filetype as defined by granule CNM filetype.

      • Set the CMR file to 'metadata' in the output granules object and add it to the granule files if it's not already present.

        Please note: Granules without a valid CNM type set in the granule file type field in event.input will be treated as "data" in the updated CMR metadata file

• The task then outputs an updated list of granule objects.

    Task Inputs

    Input

    This task expects an incoming input that contains a list of 'staged' S3 URIs to move to their final archive location. If CMR metadata is to be updated for a granule, it must also be included in the input.

    For the specifics, see the Cumulus Tasks page entry for the schema.

    Configuration

    This task does expect values to be set in the workflow_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Input

    This task expects event.input to provide an array of Cumulus granule objects. The files listed for each granule represent the files to be acted upon as described in summary.

    Task Outputs

    This task outputs an assembled array of Cumulus granule objects with post-move file locations as the payload for the next task, and returns only the expected payload for the next task. If a CMR file has been specified for a granule object, the CMR resources related to the granule files will be updated according to the updated granule file metadata.

    Examples

    See the SIPS workflow cookbook for an example of this task in a workflow

    Version: v10.0.0

    Parse PDR

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions and more can be found on the auto-generated Cumulus Tasks page.

    Summary

    The purpose of this task is to do the following with the incoming PDR object:

    • Stage it to an internal S3 bucket

    • Parse the PDR

    • Archive the PDR and remove the staged file if successful

    • Outputs a payload object containing metadata about the parsed PDR (e.g. total size of all files, files counts, etc) and a granules object

The constructed granules object is created using PDR metadata to determine values like data type and version, and collection definitions to determine the file storage location based on the extracted data type and version number.

    Granule file types are converted from the PDR spec types to CNM types according to the following translation table:

      HDF: 'data',
    HDF-EOS: 'data',
    SCIENCE: 'data',
    BROWSE: 'browse',
    METADATA: 'metadata',
    BROWSE_METADATA: 'metadata',
    QA_METADATA: 'metadata',
    PRODHIST: 'qa',
    QA: 'metadata',
    TGZ: 'data',
    LINKAGE: 'data'

Files missing file types will have none assigned; files with invalid types will result in a PDR parse failure.

    Task Inputs

    Input

    This task expects an incoming input that contains name and path information about the PDR to be parsed. For the specifics, see the Cumulus Tasks page entry for the schema.

    Configuration

    This task does expect values to be set in the workflow_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Below are expanded descriptions of selected config keys:

    Provider

    A Cumulus provider object. Used to define connection information for retrieving the PDR.

    Bucket

    Defines the bucket where the 'pdrs' folder for parsed PDRs will be stored.

    Collection

    A Cumulus collection object. Used to define granule file groupings and granule metadata for discovered files.

    Task Outputs

This task outputs a single payload output object containing metadata about the parsed PDR (e.g. filesCount, totalSize, etc.), a pdr object with information for later steps, and the generated array of granule objects.

    Examples

    See the SIPS workflow cookbook for an example of this task in a workflow

    Version: v10.0.0

    Queue Granules

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions, and more can be found on the auto-generated Cumulus Tasks page.

    Summary

    The purpose of this task is to schedule ingest of granules that were discovered on a remote host, whether via the DiscoverGranules task or the ParsePDR task.

    The task utilizes a defined collection in concert with a defined provider, either on each granule, or passed in via config to queue up ingest executions for each granule, or for batches of granules.

    The constructed granules object is defined by the collection passed in the configuration, and has impacts to other provided core Cumulus Tasks.

    Users of this task in a workflow are encouraged to carefully consider their configuration in context of downstream tasks and workflows.

    Task Inputs

    Each of the following sections are a high-level discussion of the intent of the various input/output/config values.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Input

    This task expects an incoming input that contains granules and information about them and their files. For the specifics, see the Cumulus Tasks page entry for the schema.

    This input is most commonly the output from a preceding DiscoverGranules or ParsePDR task.

    Cumulus Configuration

    This task does expect values to be set in the task_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Below are expanded descriptions of selected config keys:

    provider

    A Cumulus provider object for the originating provider. Will be passed along to the ingest workflow. This will be overruled by more specific provider information that may exist on a granule.

    internalBucket

    The Cumulus internal system bucket.

    granuleIngestWorkflow

    A string property that denotes the name of the ingest workflow into which granules should be queued.

    queueUrl

    A string property that denotes the URL of the queue to which scheduled execution messages are sent.

    preferredQueueBatchSize

    A number property that sets an upper bound on the size of each batch of granules queued into the payload of an ingest execution. Setting this property to a value higher than 1 allows queueing of multiple granules per ingest workflow.

    As ingest executions typically expect granules in the payload to have a common collection and common provider, this property only sets an upper bound within which batches will be created based on common collection and provider information.

    This means batches may be smaller than the preferred size if collection or provider information diverge, but never larger.

    The default value if none is specified is 1, which will queue one ingest execution per granule.

    concurrency

    A number property that determines the level of concurrency with which ingest executions are scheduled. Granules or batches of granules will be queued up into executions at this level of concurrency.

    This property is also used to limit concurrency when updating granule status to queued.

    Limiting concurrency helps to avoid throttling by the AWS Lambda API and helps to avoid encountering account Lambda concurrency limitations.

    We do not recommend increasing this value unless you are seeing Lambda.Timeout errors when queue-granules receives a large number of granules as input. Even then, because increasing the concurrency may lead to Lambda API or Lambda concurrency throttling errors, you may wish to consider converting the queue-granules task to an ECS activity, which does not face the same runtime constraints.

    The default value is 3.

    executionNamePrefix

    A string property that will prefix the names of scheduled executions.

    childWorkflowMeta

    An object property that will be merged into the scheduled execution input's meta field.
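
    As an illustration only, a workflow step configuration for this task might look roughly like the following sketch. The workflow name, queue URL path, and childWorkflowMeta contents are placeholder values, and the config.json schema referenced above remains the authoritative list of keys:

    "QueueGranules": {
      "Parameters": {
        "cma": {
          "event.$": "$",
          "task_config": {
            "provider": "{$.meta.provider}",
            "internalBucket": "{$.meta.buckets.internal.name}",
            "granuleIngestWorkflow": "IngestGranule",
            "queueUrl": "{$.meta.queues.startSF}",
            "preferredQueueBatchSize": 1,
            "concurrency": 3,
            "executionNamePrefix": "queued",
            "childWorkflowMeta": { "note": "merged into the child execution's meta" }
          }
        }
      }
    }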

    Task Outputs

    This task outputs an assembled array of workflow execution ARNs for all scheduled workflow executions within the payload's running object.
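
    As a sketch of that shape (the ARNs below are illustrative placeholders; consult the output schema on the Cumulus Tasks page for the authoritative definition), the output payload might resemble:

    {
      "running": [
        "arn:aws:states:us-east-1:1234:execution:IngestGranuleStateMachine:execution-1",
        "arn:aws:states:us-east-1:1234:execution:IngestGranuleStateMachine:execution-2"
      ]
    }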

    - + \ No newline at end of file diff --git a/docs/v10.0.0/workflows/cumulus-task-message-flow/index.html b/docs/v10.0.0/workflows/cumulus-task-message-flow/index.html index 900cae36c41..b2718cf55da 100644 --- a/docs/v10.0.0/workflows/cumulus-task-message-flow/index.html +++ b/docs/v10.0.0/workflows/cumulus-task-message-flow/index.html @@ -5,14 +5,14 @@ Cumulus Tasks: Message Flow | Cumulus Documentation - +
    Version: v10.0.0

    Cumulus Tasks: Message Flow

    Cumulus Workflows are composed of Cumulus Tasks, which are either AWS Lambda tasks or AWS Elastic Container Service (ECS) activities. Cumulus Tasks accept a payload as input to the main task application code. The task payload is additionally wrapped by the Cumulus Message Adapter, which supplies additional information supporting message templating and metadata management of these workflows.

    Diagram showing how incoming and outgoing Cumulus messages for workflow steps are handled by the Cumulus Message Adapter

    The steps in this flow are detailed in sections below.

    Cumulus Message Format

    A full Cumulus Message has the following keys:

    • cumulus_meta: System runtime information that should generally not be touched outside of Cumulus library code or the Cumulus Message Adapter. Stores meta information about the workflow such as the state machine name and the current workflow execution's name. This information is used to look up the current active task. The name of the current active task is used to look up the corresponding task's config in task_config.
    • meta: Runtime information captured by the workflow operators. Stores execution-agnostic variables.
    • payload: Runtime information for the tasks.

    In addition to the above keys, it may contain the following keys:

    • replace: A key generated in conjunction with the Cumulus Message Adapter. It contains the S3 location of a message payload and a target JSON path in the message to extract it to.
    • exception: A key used to track workflow exceptions; it should not be modified outside of Cumulus library code.

    Here's a simple example of a Cumulus Message:

    {
      "task_config": {
        "inlinestr": "prefix{meta.foo}suffix",
        "array": "{[$.meta.foo]}",
        "object": "{$.meta}"
      },
      "cumulus_meta": {
        "message_source": "sfn",
        "state_machine": "arn:aws:states:us-east-1:1234:stateMachine:MySfn",
        "execution_name": "MyExecution__id-1234",
        "id": "id-1234"
      },
      "meta": {
        "foo": "bar"
      },
      "payload": {
        "anykey": "anyvalue"
      }
    }

    A message utilizing the Cumulus Remote Message functionality must have at least the keys replace and cumulus_meta. Depending on configuration, other portions of the message may be present; however, the cumulus_meta, meta, and payload keys must be present once extraction is complete.

    {
    "replace": {
    "Bucket": "cumulus-bucket",
    "Key": "my-large-event.json",
    "TargetPath": "$"
    },
    "cumulus_meta": {}
    }

    Cumulus Message Preparation

    The event coming into a Cumulus Task is assumed to be a Cumulus Message and should first be handled by the functions described below before being passed to the task application code.

    Preparation Step 1: Fetch remote event

    Fetch remote event will fetch the full event from S3 if the cumulus message includes a replace key.

    Once "my-large-event.json" is fetched from S3, it's returned from the fetch remote event function. If no "replace" key is present, the event passed to the fetch remote event function is assumed to be a complete Cumulus Message and returned as-is.

    Preparation Step 2: Parse step function config from CMA configuration parameters

    This step determines which task is currently being executed. Note that this is different from which Lambda or activity is being executed, because the same Lambda or activity can be used for different tasks. The current task name is used to load the appropriate configuration from the Cumulus Message's task_config configuration parameter.

    Preparation Step 3: Load nested event

    Using the config returned from the previous step, load nested event resolves templates for the final config and input to send to the task's application code.

    Task Application Code

    After message prep, the message passed to the task application code is of the form:

    {
    "input": {},
    "config": {}
    }

    Create Next Message functions

    Whatever comes out of the task application code is used to construct an outgoing Cumulus Message.

    Create Next Message Step 1: Assign outputs

    The config loaded from the Fetch step function config step may have a cumulus_message key. This can be used to "dispatch" fields from the task's application output to a destination in the final event output (via URL templating). Here's an example where the value of input.anykey would be dispatched as the value of payload.out in the final cumulus message:

    {
      "task_config": {
        "bar": "baz",
        "cumulus_message": {
          "input": "{$.payload.input}",
          "outputs": [
            {
              "source": "{$.input.anykey}",
              "destination": "{$.payload.out}"
            }
          ]
        }
      },
      "cumulus_meta": {
        "task": "Example",
        "message_source": "local",
        "id": "id-1234"
      },
      "meta": {
        "foo": "bar"
      },
      "payload": {
        "input": {
          "anykey": "anyvalue"
        }
      }
    }

    Create Next Message Step 2: Store remote event

    If the ReplaceConfiguration parameter is set, the configured key's value will be stored in S3 and the final output of the task will include a replace key that contains configuration for a future step to extract the payload on S3 back into the Cumulus Message. The replace key identifies where the large event node has been stored in S3.

    - + \ No newline at end of file diff --git a/docs/v10.0.0/workflows/developing-a-cumulus-workflow/index.html b/docs/v10.0.0/workflows/developing-a-cumulus-workflow/index.html index d4dda738625..44e39caa01a 100644 --- a/docs/v10.0.0/workflows/developing-a-cumulus-workflow/index.html +++ b/docs/v10.0.0/workflows/developing-a-cumulus-workflow/index.html @@ -5,13 +5,13 @@ Creating a Cumulus Workflow | Cumulus Documentation - +
    Version: v10.0.0

    Creating a Cumulus Workflow

    The Cumulus workflow module

    To facilitate adding workflows to your deployment, Cumulus provides a workflow module.

    In combination with the Cumulus message, the workflow module provides a way to easily turn a Step Function definition into a Cumulus workflow, complete with the supporting resources and configuration Cumulus requires.

    Using the module also ensures that your workflows will continue to be compatible with future versions of Cumulus.

    For more on the full set of current available options for the module, please consult the module README.

    Adding a new Cumulus workflow to your deployment

    To add a new Cumulus workflow to your deployment that is using the cumulus module, add a new workflow resource to your deployment directory, either in a new .tf file, or to an existing file.

    The workflow should follow a syntax similar to:

    module "my_workflow" {
    source = "https://github.com/nasa/cumulus/releases/download/vx.x.x/terraform-aws-cumulus-workflow.zip"

    prefix = "my-prefix"
    name = "MyWorkflowName"
    system_bucket = "my-internal-bucket"

    workflow_config = module.cumulus.workflow_config

    tags = { Deployment = var.prefix }

    state_machine_definition = <<JSON
    {}
    JSON
    }

    In the above example, you would add your state_machine_definition written in the Amazon States Language, using tasks you've developed and Cumulus core tasks that are made available as part of the cumulus terraform module.
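
    As a purely illustrative sketch, a minimal state_machine_definition with a single task state might look like the following; the HelloWorld state name and the module.cumulus.hello_world_task_lambda_function_arn output are assumptions used for illustration, not guaranteed outputs of the cumulus module:

    state_machine_definition = <<JSON
    {
      "Comment": "Minimal example workflow",
      "StartAt": "HelloWorld",
      "States": {
        "HelloWorld": {
          "Type": "Task",
          "Resource": "${module.cumulus.hello_world_task_lambda_function_arn}",
          "Parameters": {
            "cma": {
              "event.$": "$",
              "task_config": {}
            }
          },
          "End": true
        }
      }
    }
    JSON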

    Please note: Cumulus follows the convention of tagging resources with the prefix variable { Deployment = var.prefix } that you pass to the cumulus module. For resources defined outside of Core, it's recommended that you adopt this convention as it makes resources and/or deployment recovery scenarios much easier to manage.

    Examples

    For a functional example of a basic workflow, please take a look at the hello_world_workflow.

    For more complete/advanced examples, please read the following cookbook entries/topics:

    - + \ No newline at end of file diff --git a/docs/v10.0.0/workflows/developing-workflow-tasks/index.html b/docs/v10.0.0/workflows/developing-workflow-tasks/index.html index d582087b3eb..71e08651d24 100644 --- a/docs/v10.0.0/workflows/developing-workflow-tasks/index.html +++ b/docs/v10.0.0/workflows/developing-workflow-tasks/index.html @@ -5,13 +5,13 @@ Developing Workflow Tasks | Cumulus Documentation - +
    Version: v10.0.0

    Developing Workflow Tasks

    Workflow tasks can be either AWS Lambda Functions or ECS Activities.

    Lambda functions

    The full set of available core Lambda functions can be found in the deployed cumulus module zipfile at /tasks, as well as reference documentation here. These Lambdas can be referenced in workflows via the outputs from that module (see the cumulus-template-deploy repo for an example).

    The tasks source is located in the Cumulus repository at cumulus/tasks.

    You can also develop your own Lambda function. See the Lambda Functions page to learn more.

    ECS Activities

    ECS activities are supported via the cumulus_ecs_module available from the Cumulus release page.

    Please read the module README for configuration details.

    For assistance in creating a task definition within the module read the AWS Task Definition Docs.

    For a step-by-step example of using the cumulus_ecs_module, please see the related cookbook entry.

    Cumulus Docker Image

    ECS activities require a Docker image. Cumulus provides a Docker image (source) for Node.js 12.x+ Lambdas on Docker Hub: cumuluss/cumulus-ecs-task.

    Alternate Docker Images

    Custom docker images/runtimes are supported as are private registries. For details on configuring a private registry/image see the AWS documentation on Private Registry Authentication for Tasks.

    - + \ No newline at end of file diff --git a/docs/v10.0.0/workflows/docker/index.html b/docs/v10.0.0/workflows/docker/index.html index 895de0adca1..4df1b943b68 100644 --- a/docs/v10.0.0/workflows/docker/index.html +++ b/docs/v10.0.0/workflows/docker/index.html @@ -5,7 +5,7 @@ Dockerizing Data Processing | Cumulus Documentation - + @@ -14,7 +14,7 @@ 2) validate the output (in this case just check for existence) 3) use 'ncatted' to update the resulting file to be CF-compliant 4) write out metadata generated for this file

    Process Testing

    It is important to have tests for data processing; however, in many cases data files can be large, so it is not practical to store the test data in the repository. Instead, test data is currently stored on AWS S3 and can be retrieved using the AWS CLI.

    aws s3 sync s3://cumulus-ghrc-logs/sample-data/collection-name data

    Where collection-name is the name of the data collection, such as 'avaps', or 'cpl'. For example, an abridged version of the data for CPL includes:

    ├── cpl
    │   ├── input
    │   │   ├── HS3_CPL_ATB_12203a_20120906.hdf5
    │   │   ├── HS3_CPL_OP_12203a_20120906.hdf5
    │   └── output
    │       ├── HS3_CPL_ATB_12203a_20120906.nc
    │       ├── HS3_CPL_ATB_12203a_20120906.nc.meta.xml
    │       ├── HS3_CPL_OP_12203a_20120906.nc
    │       ├── HS3_CPL_OP_12203a_20120906.nc.meta.xml

    Contained in the input directory are all possible sets of data files, while the output directory is the expected result of processing. In this case the hdf5 files are converted to NetCDF files and XML metadata files are generated.

    The docker image for a process can be used on the retrieved test data. First create a test-output directory in the newly created data directory.

    mkdir data/test-output

    Then run the docker image using docker-compose.

    docker-compose run test

    This will process the data in the data/input directory and put the output into data/test-output. Repositories also include Python-based tests which will validate this newly created output against the contents of data/output. Use Python's Nose tool to run the included tests.

    nosetests

    If the contents of data/test-output validate against the contents of data/output, the tests will be successful; otherwise, an error will be reported.
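
    For context, a hypothetical docker-compose.yml backing the test service above might look roughly like this; the image name and volume layout are assumptions for illustration, not the actual repository contents:

    version: "3"
    services:
      test:
        image: cumuluss/example-process:latest   # placeholder image name
        volumes:
          - ./data/input:/work/input:ro          # retrieved test inputs
          - ./data/test-output:/work/output      # processing results to validate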

    - + \ No newline at end of file diff --git a/docs/v10.0.0/workflows/index.html b/docs/v10.0.0/workflows/index.html index 597be318c1c..12f97ec666b 100644 --- a/docs/v10.0.0/workflows/index.html +++ b/docs/v10.0.0/workflows/index.html @@ -5,13 +5,13 @@ Workflows | Cumulus Documentation - +
    Version: v10.0.0

    Workflows

    Workflows are composed of one or more AWS Lambda Functions and ECS Activities to discover, ingest, process, manage, and archive data.

    Provider data ingest and GIBS have a set of common needs in getting data from a source system and into the cloud where they can be distributed to end users. These common needs are:

    • Data Discovery - Crawling, polling, or detecting changes from a variety of sources.
    • Data Transformation - Taking data files in their original format and extracting and transforming them into another desired format such as visible browse images.
    • Archival - Storage of the files in a location that's accessible to end users.

    The high-level view of the architecture and many of the individual steps are the same, but the details of ingesting each type of collection differ. Different collection types and different providers have different needs. Not only are the individual boxes of a workflow different; the branching, error handling, and multiplicity of the arrows connecting the boxes are also different. Some need visible images rendered from component data files from multiple collections. Some need to contact the CMR with updated metadata. Some will have different retry strategies to handle availability issues with source data systems.

    AWS and other cloud vendors provide an ideal solution for parts of these problems but there needs to be a higher level solution to allow the composition of AWS components into a full featured solution. The Ingest Workflow Architecture is designed to meet the needs for Earth Science data ingest and transformation.

    Goals

    Flexibility and Composability

    The steps to ingest and process data are different for each collection within a provider. Ingest should be as flexible as possible in the rearranging of steps and configuration.

    We want to use lego-like individual steps that can be composed by an operator.

    Individual steps should ...

    • Be as ignorant as possible of the overall flow. They should not be aware of previous steps.
    • Be runnable on their own.
    • Define their input and output in simple data structures.
    • Be domain agnostic.
    • Not make assumptions about the specifics of, for example, what goes into a granule.

    Scalable

    The ingest architecture needs to be scalable, both to handle ingesting hundreds of millions of granules and to interpret dozens of different workflows.

    Data Provenance

    • We should have traceability for how data was produced and where it comes from.
    • Use immutable representations of data. Data once received is not overwritten. Data can be removed for cleanup.
    • All software is versioned. We can trace transformation of data by tracking the immutable source data and the versioned software applied to it.

    Operator Visibility and Control

    • Operators should be able to see and understand everything that is happening in the system.
    • It should be obvious why things are happening and straightforward to diagnose problems.
    • We generally assume that the operators know best in terms of the limits on a provider's infrastructure, how often things need to be done, and the details of a collection. The architecture should defer to their decisions and knowledge while providing safety nets to prevent problems.

    A Reconfigurable Workflow Architecture

    The Ingest Workflow Architecture is defined by two entity types, Workflows and Tasks. A Workflow is a set of composed Tasks to complete an objective such as ingesting a granule. Tasks are the individual steps of a Workflow that perform one job. The workflow is responsible for executing the right task based on the current state and response from the last task executed. Tasks are completely decoupled in that they don't call each other or even need to know about the presence of other tasks.

    Workflows and tasks are configured as Terraform resources, which are triggered via configured rules within Cumulus.

    Diagram showing the Step Function execution path through workflow tasks for a collection ingest

    See the Example GIBS Ingest Architecture showing how workflows and tasks are used to define the GIBS Ingest Architecture.

    Workflows

    A workflow is a provider-configured set of steps that describe the process to ingest data. Workflows are defined using AWS Step Functions.

    Benefits of AWS Step Functions

    AWS Step Functions are described in detail in the AWS documentation, but they provide several benefits which are applicable to this architecture.

    • Prebuilt solution
    • Operations Visibility
      • Visual diagram
      • Every execution is recorded with both inputs and output for every step.
    • Composability
      • Allow composing AWS Lambdas and code running in other steps. Code can be run in EC2 to interface with it or even on premise if desired.
      • Step functions allow specifying when steps run in parallel or choices between steps based on data from the previous step.
    • Flexibility
      • Step Functions are designed to make it easy to build new applications and to reconfigure them. We're exposing that flexibility directly to the provider.
    • Reliability and Error Handling
      • Step functions allow configuration of retries and adding handling of error conditions.
    • Described via data
      • This makes it easy to save the step function in configuration management solutions.
      • We can build simple interfaces on top of the flexibility provided.

    Workflow Scheduler

    The scheduler is responsible for initiating a step function and passing in the relevant data for a collection. This is currently configured as an interval for each collection. The scheduler service creates the initial event by combining the collection configuration with the AWS execution context defined via the cumulus terraform module.

    Tasks

    A workflow is composed of tasks. Each task is responsible for performing a discrete step of the ingest process. These can be activities like:

    • Crawling a provider website for new data.
    • Uploading data from a provider to S3.
    • Executing a process to transform data.

    AWS Step Functions permit tasks to be code running anywhere, even on premise. We expect most tasks will be written as Lambda functions in order to take advantage of the easy deployment, scalability, and cost benefits provided by AWS Lambda.

    • Leverages Existing Work
      • The design leverages the existing work of Amazon by defining workflows using the AWS Step Function State Language. This is the language that was created for describing the state machines used in AWS Step Functions.
    • Open for Extension
      • Both meta and task_config, which are used for configuration at the collection and task levels, do not dictate the fields and structure of the configuration. Additional task-specific JSON schemas can be used for extending the validation of individual steps.
    • Data-centric Configuration
      • The use of a single JSON configuration file allows this to be added to a workflow. We build additional support on top of the configuration file for simpler domain specific configuration or interactive GUIs.

    For more details on Task Messages and Configuration, visit Cumulus configuration and message protocol documentation.

    Ingest Deploy

    To view deployment documentation, please see the Cumulus deployment documentation.

    Tradeoffs, and Benefits

    This section documents various tradeoffs and benefits of the Ingest Workflow Architecture.

    Tradeoffs

    Workflow execution is handled completely by AWS

    This means we can't add our own code into the orchestration of the workflow. We can't add new features not supported by Step Functions. We can't do things like enforce that the responses from tasks always conform to a schema or extract the configuration for a task ahead of its execution.

    If we implemented our own orchestration, we'd be able to add all of these. We save significant amounts of development effort and gain all the features of Step Functions for this trade-off. One workaround is to provide a library of common task capabilities. These would optionally be available to tasks that can be implemented with Node.js and are able to include the library.

    Workflow Configuration is specified in AWS Step Function States Language

    The current design combines the states language defined by AWS with ingest-specific configuration. This means our representation has a tight coupling with their standard. If they make backwards-incompatible changes in the future, we will have to deal with existing projects written against that.

    We avoid having to develop our own standard and code to process it. The design can support new features in AWS Step Functions without needing changes to the Ingest library code. It is unlikely they will make a backwards-incompatible change at this point. One mitigation, if that were to happen, is writing data transformations to a new format.

    Collection Configuration Flexibility vs Complexity

    The Collections Configuration File is very flexible but requires more knowledge of AWS Step Functions to configure. A person modifying this file directly would need to be comfortable editing a JSON file and configuring AWS Step Functions state transitions which address AWS resources.

    The configuration file itself is not necessarily meant to be edited by a human directly. Since we are developing a reconfigurable, composable architecture that is specified entirely in data, additional tools can be developed on top of it. The existing recipes.json files can be mapped to this format. Operational tools like a GUI can be built that provide a usable interface for customizing workflows, but it will take time to develop these tools.

    Benefits

    This section describes benefits of the Ingest Workflow Architecture.

    Simplicity

    The concepts of Workflows and Tasks are simple ones that should make sense to providers. Additionally, the implementation will only consist of a few components because the design leverages existing services and capabilities of AWS. The Ingest implementation will only consist of some reusable task code to make task implementation easier, Ingest deployment, and the Workflow Scheduler.

    Composability

    The design aims to satisfy the need to integrate different ingest workflows for different providers. It's flexible in terms of the ability to arrange tasks to meet the needs of a collection. Providers have developed and incorporated open source tools over the years, and all of these are easily integrable into the workflows as tasks.

    There is low coupling between task steps. Failures of one component don't bring the whole system down. Individual tasks can be deployed separately.

    Scalability

    AWS Step Functions scale up as needed and aren't limited by a set number of servers. They also easily allow you to leverage the inherent scalability of serverless functions.

    Monitoring and Auditing

    • Every execution is captured.
    • Every task run has captured input and outputs.
    • CloudWatch Metrics can be used for monitoring many of the events with the StepFunctions. It can also generate alarms for the whole process.
    • Visual report of the entire configuration.
      • Errors and success states are highlighted visually in the flow.

    Data Provenance

    • Monitoring and auditing ensures we know the data that was given to a task.
    • Workflows are versioned and the state machines stored in AWS Step Functions are immutable. Once created they cannot change.
    • Versioning of data in S3 or using immutable records in S3 will mean we always know what data was created as the result of a step or fed into a step.

    Appendix

    Example GIBS Ingest Architecture

    This shows the GIBS Ingest Architecture as an example of the use of the Ingest Workflow Architecture.

    • The GIBS Ingest Architecture consists of two workflows per collection type. There is one for discovery and one for ingest. The final stage of discovery triggers multiple ingest workflows for each MRF granule that needs to be generated.
    • It demonstrates both lambdas as tasks and a container used for MRF generation.

    GIBS Ingest Workflows

    Diagram showing the AWS Step Function execution path for a GIBS ingest workflow

    GIBS Ingest Granules Workflow

    This shows a visualization of an execution of the ingest granules workflow in Step Functions. The steps highlighted in green are the ones that executed and completed successfully.

    Diagram showing the AWS Step Function execution path for a GIBS ingest granules workflow

    - + \ No newline at end of file diff --git a/docs/v10.0.0/workflows/input_output/index.html b/docs/v10.0.0/workflows/input_output/index.html index 2cc4859e9c7..eaf0bcff5e3 100644 --- a/docs/v10.0.0/workflows/input_output/index.html +++ b/docs/v10.0.0/workflows/input_output/index.html @@ -5,14 +5,14 @@ Workflow Inputs & Outputs | Cumulus Documentation - +
    Version: v10.0.0

    Workflow Inputs & Outputs

    General Structure

    Cumulus uses a common format for all inputs and outputs to workflows. The same format is used for input and output from workflow steps. The common format consists of a JSON object which holds all necessary information about the task execution and AWS environment. Tasks return objects identical in format to their input with the exception of a task-specific payload field. Tasks may also augment their execution metadata.

    Cumulus Message Adapter

    The Cumulus Message Adapter and Cumulus Message Adapter libraries help task developers integrate their tasks into a Cumulus workflow. These libraries adapt input and outputs from tasks into the Cumulus Message format. The Scheduler service creates the initial event message by combining the collection configuration, external resource configuration, workflow configuration, and deployment environment settings. The subsequent workflow messages between tasks must conform to the message schema. By using the Cumulus Message Adapter, individual task Lambda functions only receive the input and output specifically configured for the task, and not non-task-related message fields.

    The Cumulus Message Adapter libraries are called by the tasks with a callback function containing the business logic of the task as a parameter. They first adapt the incoming message to a format more easily consumable by Cumulus tasks, then invoke the task, and then adapt the task response back to the Cumulus message protocol to be sent to the next task.

    A task's Lambda function can be configured to include a Cumulus Message Adapter library which constructs input/output messages and resolves task configurations. The CMA can then be included in one of several ways:

    Lambda Layer

    In order to make use of this configuration, a Lambda layer must be uploaded to your account. Due to platform restrictions, Core cannot currently support shareable public layers; however, you can deploy the appropriate version from the release page yourself.

    Once you've deployed the layer, integrate the CMA layer with your Lambdas:

    • If using the cumulus module, set the cumulus_message_adapter_lambda_layer_version_arn in your .tfvars file to integrate the CMA layer with all core Cumulus lambdas.
    • If including your own Lambda or ECS task Terraform modules, specify the CMA layer ARN in the Terraform resource definitions. Also, make sure to set the CUMULUS_MESSAGE_ADAPTER_DIR environment variable for the task to /opt for the CMA integration to work properly (see the sketch below).
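
    For illustration only, a minimal Terraform sketch of the second approach might look like the following; the function name, zip path, and runtime are placeholder assumptions, and var.cumulus_message_adapter_lambda_layer_version_arn is assumed to hold the ARN of the deployed CMA layer:

    resource "aws_lambda_function" "my_cma_task" {
      function_name = "${var.prefix}-my-cma-task"   # placeholder name
      filename      = "/path/to/zip/lambda.zip"     # placeholder artifact path
      handler       = "index.handler"
      role          = module.cumulus.lambda_processing_role_arn
      runtime       = "nodejs12.x"                  # illustrative runtime
      layers        = [var.cumulus_message_adapter_lambda_layer_version_arn]

      environment {
        variables = {
          CUMULUS_MESSAGE_ADAPTER_DIR = "/opt/"
        }
      }
    }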

    In the future if you wish to update/change the CMA version you will need to update the deployed CMA, and update the layer configuration for the impacted Lambdas as needed.

    Please Note: Updating/removing a layer does not change a deployed Lambda, so to update the CMA you should deploy a new version of the CMA layer, update the associated Lambda configuration to reference the new CMA version, and re-deploy your Lambdas.

    Manual Addition

    You can include the CMA package in the Lambda code, in the cumulus-message-adapter sub-directory of your Lambda .zip, for any Lambda runtime that includes a Python runtime. Python 2 is included in Lambda runtimes that use Amazon Linux; however, Amazon Linux 2 will not support this directly.

    Please note: It is expected that upcoming Cumulus releases will update the CMA layer to include a python runtime.

    If you are manually adding the message adapter to your source and utilizing the CMA, you should set the Lambda's CUMULUS_MESSAGE_ADAPTER_DIR environment variable to target the installation path for the CMA.
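
    As a rough sketch of this manual approach (the archive and directory names below are placeholders, not official release artifact names):

    # unpack a CMA release into the expected sub-directory of your Lambda source
    mkdir -p my-lambda/cumulus-message-adapter
    unzip cumulus-message-adapter.zip -d my-lambda/cumulus-message-adapter

    # package the Lambda; CUMULUS_MESSAGE_ADAPTER_DIR should point at this directory once deployed
    (cd my-lambda && zip -r ../lambda.zip .)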

    CMA Input/Output

    Input to the task application code is a json object with keys:

    • input: By default, the incoming payload is the payload output from the previous task, or it can be a portion of the payload as configured for the task in the corresponding .tf workflow definition file.
    • config: Task-specific configuration object with URL templates resolved.

    Output from the task application code is, by default, placed in the payload key; however, the configuration can also be used to return just a portion of the task output.

    CMA configuration

    As of Cumulus > 1.15 and CMA > v1.1.1, configuration of the CMA is expected to be driven by AWS Step Function Parameters.

    Using the CMA package with the Lambda by any of the above mentioned methods (Lambda Layers, manual) requires configuration for its various features via a specific Step Function Parameters configuration format (see sample workflows in the examples cumulus-tf source for more examples):

    {
      "cma": {
        "event.$": "$",
        "ReplaceConfig": "{some config}",
        "task_config": "{some config}"
      }
    }

    The "event.$": "$" parameter is required as it passes the entire incoming message to the CMA client library for parsing, and the CMA itself to convert the incoming message into a Cumulus message for use in the function.

    The following are the CMA's current configuration settings:

    ReplaceConfig (Cumulus Remote Message)

    Because of the potential size of a Cumulus message, mainly the payload field, a task can be configured to store a portion of its output on S3, with a Remote Message key in the message that defines how to retrieve it and an empty JSON object {} in its place. If the portion of the message targeted exceeds the configured MaxSize (defaults to 0 bytes), it will be written to S3.

    The CMA remote message functionality can be configured using parameters in several ways:

    Partial Message

    Setting the Path/Target path in the ReplaceConfig parameter (and optionally a non-default MaxSize)

    {
      "DiscoverGranules": {
        "Parameters": {
          "cma": {
            "event.$": "$",
            "ReplaceConfig": {
              "MaxSize": 1,
              "Path": "$.payload",
              "TargetPath": "$.payload"
            }
          }
        }
      }
    }

    will result in any payload output larger than the MaxSize (in bytes) being written to S3. The CMA will then mark that the key has been replaced via a replace key on the event. When the CMA picks up the replace key in future steps, it will attempt to retrieve the output from S3 and write it back to payload.

    Note that you can optionally use a different TargetPath than Path, however as the target is a JSON path there must be a key to target for replacement in the output of that step. Also note that the JSON path specified must target one node, otherwise the CMA will error, as it does not support multiple replacement targets.

    If TargetPath is omitted, it will default to the value for Path.

    Full Message

    Setting the following parameters for a lambda:

    DiscoverGranules:
      Parameters:
        cma:
          event.$: '$'
          ReplaceConfig:
            FullMessage: true

    will result in the CMA assuming the entire inbound message should be stored to S3 if it exceeds the default max size.

    This is effectively the same as doing:

    {
      "DiscoverGranules": {
        "Parameters": {
          "cma": {
            "event.$": "$",
            "ReplaceConfig": {
              "MaxSize": 0,
              "Path": "$",
              "TargetPath": "$"
            }
          }
        }
      }
    }

    Cumulus Message example

    {
      "task_config": {
        "inlinestr": "prefix{meta.foo}suffix",
        "array": "{[$.meta.foo]}",
        "object": "{$.meta}"
      },
      "cumulus_meta": {
        "message_source": "sfn",
        "state_machine": "arn:aws:states:us-east-1:1234:stateMachine:MySfn",
        "execution_name": "MyExecution__id-1234",
        "id": "id-1234"
      },
      "meta": {
        "foo": "bar"
      },
      "payload": {
        "anykey": "anyvalue"
      }
    }

    Cumulus Remote Message example

    The message may contain a reference to an S3 Bucket, Key and TargetPath as follows:

    {
    "replace": {
    "Bucket": "cumulus-bucket",
    "Key": "my-large-event.json",
    "TargetPath": "$"
    },
    "cumulus_meta": {}
    }

    task_config

    This configuration key contains the input/output configuration values for definition of inputs/outputs via URL paths. Important: These values are all relative to the JSON object configured for event.$.

    This configuration's behavior is outlined in the CMA step description below.

    The configuration should follow the format:

    {
      "FunctionName": {
        "Parameters": {
          "cma": {
            "event.$": "$",
            "other_cma_configuration": "<config object>",
            "task_config": "<task config>"
          }
        }
      }
    }

    Example:

    {
      "StepFunction": {
        "Parameters": {
          "cma": {
            "event.$": "$",
            "task_config": {
              "sfnEnd": true,
              "stack": "{$.meta.stack}",
              "bucket": "{$.meta.buckets.internal.name}",
              "stateMachine": "{$.cumulus_meta.state_machine}",
              "executionName": "{$.cumulus_meta.execution_name}",
              "cumulus_message": {
                "input": "{$}"
              }
            }
          }
        }
      }
    }

    Cumulus Message Adapter Steps

    1. Reformat AWS Step Function message into Cumulus Message

    Due to the way AWS handles Parameterized messages, when Parameters are used the CMA takes an inbound message:

    {
      "resource": "arn:aws:lambda:us-east-1:<lambda arn values>",
      "input": {
        "Other Parameter": {},
        "cma": {
          "ConfigKey": {
            "config values": "some config values"
          },
          "event": {
            "cumulus_meta": {},
            "payload": {},
            "meta": {},
            "exception": {}
          }
        }
      }
    }

    and takes the following actions:

    • Takes the object at input.cma.event and makes it the full input
    • Merges all of the keys except event under input.cma into the parent input object

    This results in the incoming message (presumably a Cumulus message), with any cma configuration parameters merged in, being passed to the CMA. All other parameterized values defined outside of the cma key are ignored.

    2. Resolve Remote Messages

    If the incoming Cumulus message has a replace key value, the CMA will attempt to pull the payload from S3.

    For example, if the incoming message contains the following:

      "meta": {
    "foo": {}
    },
    "replace": {
    "TargetPath": "$.meta.foo",
    "Bucket": "some_bucket",
    "Key": "events/some-event-id"
    }

    The CMA will attempt to pull the file stored at Bucket/Key and replace the value at TargetPath, then remove the replace object entirely and continue.

    3. Resolve URL templates in the task configuration

    In the workflow configuration (defined under the task_config key), each task has its own configuration, and it can use URL templates as values to achieve simplicity or for values only available at execution time. The Cumulus Message Adapter resolves the URL templates (relative to the event configuration key) and then passes the message to the next task. For example, given a task which has the following configuration:

    {
      "Parameters": {
        "cma": {
          "event.$": "$",
          "task_config": {
            "provider": "{$.meta.provider}",
            "inlinestr": "prefix{meta.foo}suffix",
            "array": "{[$.meta.foo]}",
            "object": "{$.meta}"
          }
        }
      }
    }

    and an incoming message that contains:

    {
    "meta": {
    "foo": "bar",
    "provider": {
    "id": "FOO_DAAC",
    "anykey": "anyvalue"
    }
    }
    }

    The corresponding Cumulus Message would contain:

    "meta": {
    "foo": "bar",
    "provider": {
    "id": "FOO_DAAC",
    "anykey": "anyvalue"
    }
    },
    "task_config": {
    "provider": "{$.meta.provider}",
    "inlinestr": "prefix{meta.foo}suffix",
    "array": "{[$.meta.foo]}",
    "object": "{$.meta}"
    }

    The message sent to the task would be:

    "config" : {
    "provider": {
    "id": "FOO_DAAC",
    "anykey": "anyvalue"
    },
    "inlinestr": "prefixbarsuffix",
    "array": ["bar"],
    "object": {
    "foo": "bar",
    "provider": {
    "id": "FOO_DAAC",
    "anykey": "anyvalue"
    }
    },
    },
    "input": "{...}"

    URL template variables replace dotted paths inside curly brackets with their corresponding value. If the Cumulus Message Adapter cannot resolve a value, it will ignore the template, leaving it verbatim in the string. While seemingly complex, this allows significant decoupling of Tasks from one another and the data that drives them. Tasks are able to easily receive runtime configuration produced by previously run tasks and domain data.

    4. Resolve task input

    By default, the incoming payload is the payload from the previous task. The task can also be configured to use a portion of the payload as its input message. For example, given a task that specifies cma.task_config.cumulus_message.input:

    ExampleTask:
      Parameters:
        cma:
          event.$: '$'
          task_config:
            cumulus_message:
              input: '{$.payload.foo}'

    The task configuration in the message would be:

    {
      "task_config": {
        "cumulus_message": {
          "input": "{$.payload.foo}"
        }
      },
      "payload": {
        "foo": {
          "anykey": "anyvalue"
        }
      }
    }

    The Cumulus Message Adapter will resolve the task input; instead of sending the whole payload as task input, the task input would be:

        {
    "input" : {
    "anykey": "anyvalue"
    },
    "config": {...}
    }

    5. Resolve task output

    By default, the task's return value is the next payload. However, the workflow task configuration can specify a portion of the return value as the next payload, and can also augment values to other fields. Based on the task configuration under cma.task_config.cumulus_message.outputs, the Message Adapter uses a task's return value to output a message as configured by the task-specific config defined under cma.task_config. The Message Adapter dispatches a "source" to a "destination" as defined by URL templates stored in the task-specific cumulus_message.outputs. The value of the task's return value at the "source" URL is used to create or replace the value of the task's return value at the "destination" URL. For example, given a task that specifies cumulus_message.outputs in its workflow configuration as follows:

    {
      "ExampleTask": {
        "Parameters": {
          "cma": {
            "event.$": "$",
            "task_config": {
              "cumulus_message": {
                "outputs": [
                  {
                    "source": "{$}",
                    "destination": "{$.payload}"
                  },
                  {
                    "source": "{$.output.anykey}",
                    "destination": "{$.meta.baz}"
                  }
                ]
              }
            }
          }
        }
      }
    }

    The corresponding Cumulus Message would be:

    {
      "task_config": {
        "cumulus_message": {
          "outputs": [
            {
              "source": "{$}",
              "destination": "{$.payload}"
            },
            {
              "source": "{$.output.anykey}",
              "destination": "{$.meta.baz}"
            }
          ]
        }
      },
      "meta": {
        "foo": "bar"
      },
      "payload": {
        "anykey": "anyvalue"
      }
    }

    Given the response from the task is:

        {
    "output": {
    "anykey": "boo"
    }
    }

    The Cumulus Message Adapter would output the following Cumulus Message:

    {
      "task_config": {
        "cumulus_message": {
          "outputs": [
            {
              "source": "{$}",
              "destination": "{$.payload}"
            },
            {
              "source": "{$.output.anykey}",
              "destination": "{$.meta.baz}"
            }
          ]
        }
      },
      "meta": {
        "foo": "bar",
        "baz": "boo"
      },
      "payload": {
        "output": {
          "anykey": "boo"
        }
      }
    }

    6. Apply Remote Message Configuration

    If the ReplaceConfig configuration parameter is defined, the CMA will evaluate the configuration options provided, and if required write a portion of the Cumulus Message to S3, and add a replace key to the message for future steps to utilize.

    Please Note: the non-user-modifiable field cumulus_meta will always be retained, regardless of the configuration.

    For example, if the output message (post output configuration) from a Cumulus task looks like:

    {
      "cumulus_meta": {
        "some_key": "some_value"
      },
      "ReplaceConfig": {
        "FullMessage": true
      },
      "task_config": {
        "cumulus_message": {
          "outputs": [
            {
              "source": "{$}",
              "destination": "{$.payload}"
            },
            {
              "source": "{$.output.anykey}",
              "destination": "{$.meta.baz}"
            }
          ]
        }
      },
      "meta": {
        "foo": "bar",
        "baz": "boo"
      },
      "payload": {
        "output": {
          "anykey": "boo"
        }
      }
    }

    the resultant output would look like:

    {
      "cumulus_meta": {
        "some_key": "some_value"
      },
      "replace": {
        "TargetPath": "$",
        "Bucket": "some-internal-bucket",
        "Key": "events/some-event-id"
      }
    }

    Additional features

    Validate task input, output and configuration messages against the schemas provided

    The Cumulus Message Adapter has the capability to validate task input, output and configuration messages against their schemas. The default location of the schemas is the schemas folder in the top level of the task and the default filenames are input.json, output.json, and config.json. The task can also configure a different schema location. If no schema can be found, the Cumulus Message Adapter will not validate the messages.
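
    For illustration, assuming a hypothetical task whose configuration requires only a bucket name, a schemas/config.json could be as simple as the following sketch:

    {
      "title": "ExampleTaskConfig",
      "description": "Configuration schema for a hypothetical task",
      "type": "object",
      "required": ["bucket"],
      "properties": {
        "bucket": {
          "type": "string",
          "description": "Bucket the task writes output to"
        }
      }
    }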

    - + \ No newline at end of file diff --git a/docs/v10.0.0/workflows/lambda/index.html b/docs/v10.0.0/workflows/lambda/index.html index 5a583f43816..64f64a0ce52 100644 --- a/docs/v10.0.0/workflows/lambda/index.html +++ b/docs/v10.0.0/workflows/lambda/index.html @@ -5,13 +5,13 @@ Develop Lambda Functions | Cumulus Documentation - +
    Version: v10.0.0

    Develop Lambda Functions

    Develop a new Cumulus Lambda

    AWS provides a great Getting Started guide for building Lambdas in the developer guide.

    Cumulus currently supports Node.js, Java, and Python environments for Cumulus Message Adapter enabled functions.

    Additionally, you may choose to include any of the other languages AWS supports as a resource, with reduced feature support.

    Deploy a Lambda

    Node.js Lambda

    For a new Node.js Lambda, create a new function and add an aws_lambda_function resource to your Cumulus deployment (for examples, see example/lambdas.tf and ingest/lambda-functions.tf in the source), either in a new .tf file or in an existing .tf file:

    resource "aws_lambda_function" "myfunction" {
    function_name = "${var.prefix}-function"
    filename = "/path/to/zip/lambda.zip"
    source_code_hash = filebase64sha256("/path/to/zip/lambda.zip")
    handler = "index.handler"
    role = module.cumulus.lambda_processing_role_arn
    runtime = "nodejs10.x"

    vpc_config {
    subnet_ids = var.subnet_ids
    security_group_ids = var.security_group_ids
    }
    }

    Please note: This example contains the minimum set of required configuration.

    Make sure to include a vpc_config that matches the information you've provided the cumulus module if intending to integrate the lambda with a Cumulus deployment.

    Java Lambda

    Java Lambdas are created in much the same way as the Node.js example above.

    The source points to a folder with the compiled .class files and dependency libraries in the Lambda Java zip folder structure (details here), not an uber-jar.

    The deploy folder referenced here would contain a folder 'test_task/task/' which contains Task.class and TaskLogic.class as well as a lib folder containing dependency jars.
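
    A hedged Terraform sketch for such a Java Lambda follows; the handler method name, runtime, and memory settings are assumptions for illustration only:

    resource "aws_lambda_function" "my_java_function" {
      function_name    = "${var.prefix}-java-function"
      filename         = "/path/to/zip/deploy.zip"                 # zip of the deploy folder described above
      source_code_hash = filebase64sha256("/path/to/zip/deploy.zip")
      handler          = "test_task.task.Task::handleRequest"      # assumed handler class and method
      role             = module.cumulus.lambda_processing_role_arn
      runtime          = "java11"
      memory_size      = 512
      timeout          = 60
    }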

    Python Lambda

    Python Lambdas are created the same way as the Node.js example above.

    Cumulus Message Adapter

    For Lambdas wishing to utilize the Cumulus Message Adapter(CMA), you should define a layers key on your Lambda resource with the CMA you wish to include. See the input_output docs for more on how to create/use the CMA.

    Other Lambda Options

    Cumulus supports all of the options available to you via the aws_lambda_function Terraform resource. For more information on what's available, check out the Terraform resource docs.

    Cloudwatch log groups

    If you want to enable Cloudwatch logging for your Lambda resource, you'll need to add an aws_cloudwatch_log_group resource to your Lambda definition:

    resource "aws_cloudwatch_log_group" "myfunction_log_group" {
    name = "/aws/lambda/${aws_lambda_function.myfunction.function_name}"
    retention_in_days = 30
    tags = { Deployment = var.prefix }
    }
    - + \ No newline at end of file diff --git a/docs/v10.0.0/workflows/protocol/index.html b/docs/v10.0.0/workflows/protocol/index.html index f2331074e53..46ae882be0b 100644 --- a/docs/v10.0.0/workflows/protocol/index.html +++ b/docs/v10.0.0/workflows/protocol/index.html @@ -5,13 +5,13 @@ Workflow Protocol | Cumulus Documentation - +
    Version: v10.0.0

    Workflow Protocol

    Configuration and Message Use Diagram

    A diagram showing at which point in a workflow the Cumulus message is checked for conformity with the message schema and where the configuration is checked for conformity with the configuration schema

    • Configuration - The Cumulus workflow configuration defines everything needed to describe an instance of Cumulus.
    • Scheduler - This starts ingest of a collection on configured intervals.
    • Input to Step Functions - The Scheduler uses the Configuration as source data to construct the input to the Workflow.
    • AWS Step Functions - Run the workflows as kicked off by the scheduler or other processes.
    • Input to Task - The input for each task is a JSON document that conforms to the message schema.
    • Output from Task - The output of each task must conform to the message schemas as well and is used as the input for the subsequent task.
    - + \ No newline at end of file diff --git a/docs/v10.0.0/workflows/workflow-configuration-how-to/index.html b/docs/v10.0.0/workflows/workflow-configuration-how-to/index.html index 12a16c9e7c2..faf4e77865a 100644 --- a/docs/v10.0.0/workflows/workflow-configuration-how-to/index.html +++ b/docs/v10.0.0/workflows/workflow-configuration-how-to/index.html @@ -5,7 +5,7 @@ Workflow Configuration How To's | Cumulus Documentation - + @@ -24,7 +24,7 @@ To take a subset of any given metadata, use the option substring.

    "url_path": "{cmrMetadata.Granule.Collection.ShortName}/{substring(file.fileName, 0, 3)}"

    This example will populate to "MOD09GQ/MOD"

    In addition to substring, several datetime-specific functions are available, which can parse a datetime string in the metadata and extract a certain part of it:

    "url_path": "{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}"

    or

     "url_path": "{dateFormat(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime, YYYY-MM-DD[T]HH[:]mm[:]ss)}"

    The following functions are implemented:

    • extractYear - returns the year, formatted as YYYY
    • extractMonth - returns the month, formatted as MM
    • extractDate - returns the day of the month, formatted as DD
    • extractHour - returns the hour in 24-hour format, with no leading zero
    • dateFormat - takes a second argument describing how to format the date, and passes the metadata date string and the format argument to moment().format()

    Note: the move-granules step needs to be in the workflow for this template to be populated and the file moved. This cmrMetadata or CMR granule XML needs to have been generated and stored on S3. From there any field could be retrieved and used for a url_path.

    Adding Metadata dates and times to the URL Path

    There are a number of options to pull dates from the CMR file metadata. With this metadata:

    <Granule>
      <Temporal>
        <RangeDateTime>
          <BeginningDateTime>2003-02-19T00:00:00Z</BeginningDateTime>
          <EndingDateTime>2003-02-19T23:59:59Z</EndingDateTime>
        </RangeDateTime>
      </Temporal>
    </Granule>

    The following examples of url_path could be used.

    {extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the year from the full date: 2003.

    {extractMonth(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the month: 2.

    {extractDate(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the day: 19.

    {extractHour(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the hour: 0.

    Different values can be combined to create the url_path. For example

    {
      "bucket": "sample-protected-bucket",
      "name": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
      "url_path": "{cmrMetadata.Granule.Collection.ShortName}/{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}/{extractDate(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}"
    }

    The final file location for the above would be s3://sample-protected-bucket/MOD09GQ/2003/19/MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.

    - + \ No newline at end of file diff --git a/docs/v10.0.0/workflows/workflow-triggers/index.html b/docs/v10.0.0/workflows/workflow-triggers/index.html index fc33eae4647..2a0774fa1b8 100644 --- a/docs/v10.0.0/workflows/workflow-triggers/index.html +++ b/docs/v10.0.0/workflows/workflow-triggers/index.html @@ -5,13 +5,13 @@ Workflow Triggers | Cumulus Documentation - +
    Version: v10.0.0

    Workflow Triggers

    For a workflow to run, it needs to be associated with a rule (see rule configuration). The rule configuration determines how and when a workflow execution is triggered. Rules can be triggered one time, on a schedule, or by new data written to a kinesis stream.

    There are three lambda functions in the API package responsible for scheduling and starting workflows: SF scheduler, message consumer, and SF starter. Each Cumulus instance comes with a Start SF SQS queue.

    The SF scheduler lambda puts a message onto the Start SF queue. This message is picked up by the Start SF lambda, and an execution is started with the body of the message as the input.

    When a one time rule is created, the schedule SF lambda is triggered. Rules that are not one time are associated with a CloudWatch event which will manage the trigger of the lambdas that trigger the workflows.

    For a scheduled rule, the Cloudwatch event is triggered on the given schedule which calls directly to the schedule SF lambda.

    For a kinesis rule, when data is added to the kinesis stream, the Cloudwatch event is triggered, which calls the message consumer lambda. The message consumer lambda parses the kinesis message and finds all of the rules associated with that message. For each rule (which corresponds to one workflow), the schedule SF lambda is triggered to queue a message to start the workflow.

    For an sns rule, when a message is published to the SNS topic, the message consumer receives the SNS message (JSON expected), parses it into an object, starts a new execution of the workflow associated with the rule and passes the object in the payload field of the Cumulus message.
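
    For illustration, a scheduled rule that triggers a workflow hourly might look roughly like the following; the names and values are placeholders, and the rule configuration documentation linked above remains the authoritative reference:

    {
      "name": "my_hourly_rule",
      "workflow": "IngestGranule",
      "provider": "MY_PROVIDER",
      "collection": {
        "name": "MY_COLLECTION",
        "version": "001"
      },
      "rule": {
        "type": "scheduled",
        "value": "rate(1 hour)"
      },
      "state": "ENABLED"
    }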

    Diagram showing how workflows are scheduled via rules

    - + \ No newline at end of file diff --git a/docs/v10.1.0/adding-a-task/index.html b/docs/v10.1.0/adding-a-task/index.html index 8295f894c27..0336618058d 100644 --- a/docs/v10.1.0/adding-a-task/index.html +++ b/docs/v10.1.0/adding-a-task/index.html @@ -5,13 +5,13 @@ Contributing a Task | Cumulus Documentation - +
    Version: v10.1.0

    Contributing a Task

    We're tracking reusable Cumulus tasks in this list and, if you've got one you'd like to share with others, you can add it!

    Right now we're focused on tasks distributed via npm, but are open to including others. For now the script that pulls all the data for each package only supports npm.

    The tasks.md file is generated in the build process

    The tasks list in docs/tasks.md is generated from the list of task package names from the tasks folder.

    Do not edit the docs/tasks.md file directly.

    - + \ No newline at end of file diff --git a/docs/v10.1.0/api/index.html b/docs/v10.1.0/api/index.html index 850533a6d4b..0071fad3945 100644 --- a/docs/v10.1.0/api/index.html +++ b/docs/v10.1.0/api/index.html @@ -5,13 +5,13 @@ Cumulus API | Cumulus Documentation - + - + \ No newline at end of file diff --git a/docs/v10.1.0/architecture/index.html b/docs/v10.1.0/architecture/index.html index 17c3f0a084a..6e916532040 100644 --- a/docs/v10.1.0/architecture/index.html +++ b/docs/v10.1.0/architecture/index.html @@ -5,14 +5,14 @@ Architecture | Cumulus Documentation - +
    Version: v10.1.0

    Architecture

    Architecture

    Below, find a diagram with the components that comprise an instance of Cumulus.

    Architecture diagram of a Cumulus deployment

    This diagram details all of the major architectural components of a Cumulus deployment.

    While the diagram can feel complex, it can easily be digested in several major components:

    Data Distribution

    End users can access data via Cumulus's distribution submodule, which includes ASF's Thin Egress Application; this provides authenticated data egress, temporary S3 links, and other statistics features.

    End user exposure of Cumulus's holdings is expected to be provided by an external service.

    For NASA use, this is assumed to be CMR in this diagram.

    Data ingest

    Workflows

    The core of the ingest and processing capabilities in Cumulus is built into the deployed AWS Step Function workflows. Cumulus rules trigger workflows via either CloudWatch rules, Kinesis streams, SNS topics, or SQS queues. The workflows then run with a configured Cumulus message, utilizing built-in processes to report the status of granules, PDRs, executions, etc. to the Data Persistence components.

    Workflows can optionally report granule metadata to CMR, and workflow steps can report metrics information to a shared SNS topic, which could be subscribed to for near real time granule, execution, and PDR status. This could be used for metrics reporting using an external ELK stack, for example.

    Data persistence

    Cumulus entity state data is stored in a set of DynamoDB database tables and is exported to an Elasticsearch instance to provide non-authoritative query/state data for the API and other applications that require more complex queries.

    Data discovery

    Discovering data for ingest is handled via workflow step components using Cumulus provider and collection configurations and various triggers. Data can be ingested from AWS S3, FTP, HTTPS and more.

    Database

    Cumulus utilizes a user-provided PostgreSQL database backend. For improved API search query efficiency, Cumulus provides data replication to an Elasticsearch instance. For legacy reasons, Cumulus currently also deploys a DynamoDB datastore, and writes are replicated in parallel with the PostgreSQL database writes. The DynamoDB replicated tables and parallel writes will be removed in future releases.

    PostgreSQL Database Schema Diagram

    ERD of the Cumulus Database

    Maintenance

    System maintenance personnel have access to manage ingest and various portions of Cumulus via an AWS API gateway, as well as the operator dashboard.

    Deployment Structure

    Cumulus is deployed via Terraform and is organized internally into two separate top-level modules, as well as several external modules.

    Cumulus

    The Cumulus module, which contains multiple internal submodules, deploys all of the Cumulus components that are not part of the Data Persistence portion of this diagram.

    Data persistence

    The data persistence module provides the Data Persistence portion of the diagram.

    Other modules

    Other modules are provided as artifacts on the release page for use by users configuring their own deployments; they contain extracted subcomponents of the cumulus module. For more on these components, see the components documentation.

    For more on the specific structure, examples of use, and how to deploy, please see the deployment docs as well as the cumulus-template-deploy repo.

    - + \ No newline at end of file diff --git a/docs/v10.1.0/configuration/cloudwatch-retention/index.html b/docs/v10.1.0/configuration/cloudwatch-retention/index.html index 80a93d7d49a..dd615fa17d6 100644 --- a/docs/v10.1.0/configuration/cloudwatch-retention/index.html +++ b/docs/v10.1.0/configuration/cloudwatch-retention/index.html @@ -5,13 +5,13 @@ Cloudwatch Retention | Cumulus Documentation - +
    Version: v10.1.0

    Cloudwatch Retention

    Our Lambdas write their logs to AWS CloudWatch. By default, these logs are retained indefinitely. However, there are ways to specify a retention duration for each log group.

    aws-cli

    In addition to getting your aws-cli set-up, there are two values you'll need to acquire.

    1. log-group-name: the name of the log group whose retention policy (retention time) you'd like to change. We'll use /aws/lambda/KinesisInboundLogger in our examples.
    2. retention-in-days: the number of days you'd like to retain the logs in the specified log group. There is a list of possible values available in the aws logs documentation.

    For example, if we wanted to set log retention to 30 days on our KinesisInboundLogger lambda, we would write:

    aws logs put-retention-policy --log-group-name "/aws/lambda/KinesisInboundLogger" --retention-in-days 30

    Note: The aws-cli log command that we're using is explained in detail here.
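    To confirm the setting took effect, you can describe the log group and check its retentionInDays. This is a minimal check using the same example log group as above; the --query expression is just one way to trim the output:

    # Confirm the retention setting; the output should show retentionInDays: 30
    aws logs describe-log-groups \
      --log-group-name-prefix "/aws/lambda/KinesisInboundLogger" \
      --query 'logGroups[].{name:logGroupName,retentionInDays:retentionInDays}'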

    AWS Management Console

    Changing the log retention policy in the AWS Management Console is a fairly simple process:

    1. Navigate to the CloudWatch service in the AWS Management Console.
    2. Click on the Logs entry on the sidebar.
    3. Find the Log Group whose retention policy you're interested in changing.
    4. Click on the value in the Expire Events After column.
    5. Enter/Select the number of days you'd like to retain logs in that log group for.

    Screenshot of AWS console showing how to configure the retention period for Cloudwatch logs

    - + \ No newline at end of file diff --git a/docs/v10.1.0/configuration/collection-storage-best-practices/index.html b/docs/v10.1.0/configuration/collection-storage-best-practices/index.html index a741b2de997..d547c9597d4 100644 --- a/docs/v10.1.0/configuration/collection-storage-best-practices/index.html +++ b/docs/v10.1.0/configuration/collection-storage-best-practices/index.html @@ -5,13 +5,13 @@ Collection Cost Tracking and Storage Best Practices | Cumulus Documentation - +
    Version: v10.1.0

    Collection Cost Tracking and Storage Best Practices

    Organizing your data is important for metrics you may want to collect. AWS S3 storage and cost metrics are calculated at the bucket level, so it is easy to get metrics by bucket. You can get storage metrics at the key prefix level, but that is done through the CLI, which can be very slow for large buckets. It is very difficult to estimate costs at the prefix level.

    Calculating Storage By Collection

    By bucket

    Usage by bucket can be obtained in your AWS Billing Dashboard via an S3 Usage Report. You can download your usage report for a period of time and review your storage and requests at the bucket level.

    Bucket metrics can also be found in the AWS CloudWatch Metrics Console (also see Using Amazon CloudWatch Metrics).

    Navigate to Storage Metrics and select the BucketName for all buckets you are interested in. The available metrics are BucketSizeInBytes and NumberOfObjects.

    In the Graphed metrics tab, you can select the type of statistic (e.g. average, minimum, maximum) and the period for the stats. At the top, it's useful to select from the dropdown to view the metrics as a number. You can also select the time period for which you want to see stats.

    Alternatively you can query CloudWatch using the CLI.

    This command will return the average number of bytes in the bucket test-bucket for 7/31/2019:

    aws cloudwatch get-metric-statistics --namespace AWS/S3 --start-time 2019-07-31T00:00:00 --end-time 2019-08-01T00:00:00 --period 86400 --statistics Average --region us-east-1 --metric-name BucketSizeBytes --dimensions Name=BucketName,Value=test-bucket Name=StorageType,Value=StandardStorage

    The result looks like:

    {
    "Datapoints": [
    {
    "Timestamp": "2019-07-31T00:00:00Z",
    "Average": 150996467959.0,
    "Unit": "Bytes"
    }
    ],
    "Label": "BucketSizeBytes"
    }

    By key prefix

    AWS does not offer storage and usage statistics at a key prefix level. Via the AWS CLI, you can get the total storage for a bucket or folder. The following command would get the storage for folder example-folder in bucket sample-bucket:

    aws s3 ls --summarize --human-readable --recursive s3://sample-bucket/example-folder | grep 'Total'

    Note that this can be a long-running operation for large buckets.

    Calculating Cost By Collection

    NASA NGAP Environment

    If using an NGAP account, the cost per bucket can be found in your CloudTamer console, in the Financials section of your account information. This is calculated on a monthly basis.

    There is no easy way to get the cost by folder in the buckets. You could calculate an estimate using the storage per prefix vs. the storage of the bucket.
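    As a rough sketch of that estimate (the numbers below are purely illustrative placeholders, not real values):

    # Estimated monthly cost of a prefix ~= (prefix bytes / bucket bytes) * monthly bucket cost
    PREFIX_BYTES=2000000000000    # e.g. from: aws s3 ls --summarize --recursive s3://sample-bucket/example-folder
    BUCKET_BYTES=10000000000000   # e.g. from the CloudWatch BucketSizeBytes metric shown above
    BUCKET_MONTHLY_COST=230       # e.g. from your billing console, in your billing currency
    echo "scale=2; $PREFIX_BYTES / $BUCKET_BYTES * $BUCKET_MONTHLY_COST" | bc
    # -> 46.00 (estimated monthly cost attributable to the prefix)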

    Outside of NGAP

    You can enable S3 Cost Allocation Tags and tag your buckets. From there, you can view the cost breakdown in your AWS Billing Dashboard via the Cost Explorer. Cost allocation tagging is available at the bucket level.
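    For example, a bucket could be tagged via the CLI before activating the tag as a cost allocation tag in the Billing console (the bucket name and tag key/value below are placeholders):

    # Note: put-bucket-tagging replaces any existing tag set on the bucket
    aws s3api put-bucket-tagging \
      --bucket sample-bucket \
      --tagging 'TagSet=[{Key=Collection,Value=MOD09GQ-006}]'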

    There is no easy way to get the cost by folder in the buckets. You could calculate an estimate using the storage per prefix vs. the storage of the bucket.

    Storage Configuration

    Cumulus allows for the configuration of many buckets for your files. Buckets are created and added to your deployment as part of the deployment process.

    In your Cumulus collection configuration, you specify where you want the files to be stored post-processing. This is done by matching a regular expression on the file with the configured bucket.

    Note that in the collection configuration, the bucket field is the key to the buckets variable in the deployment's .tfvars file.

    Organizing By Bucket

    You can specify separate groups of buckets for each collection, which could look like the example below.

    {
      "name": "MOD09GQ",
      "version": "006",
      "granuleId": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$",
      "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
      "files": [
        {
          "bucket": "MOD09GQ-006-protected",
          "regex": "^.*\\.hdf$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf"
        },
        {
          "bucket": "MOD09GQ-006-private",
          "regex": "^.*\\.hdf\\.met$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.met"
        },
        {
          "bucket": "MOD09GQ-006-protected",
          "regex": "^.*\\.cmr\\.xml$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.cmr.xml"
        },
        {
          "bucket": "MOD09GQ-006-public",
          "regex": "^.*\\.jpg$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_ndvi.jpg"
        }
      ]
    }

    Additional collections would go to different buckets.

    Organizing by Key Prefix

    Different collections can be organized into different folders in the same bucket, using the key prefix, which is specified as the url_path in the collection configuration. In this simplified collection configuration example, the url_path field is set at the top level so that all files go to a path prefixed with the collection name and version.

    {
      "name": "MOD09GQ",
      "version": "006",
      "granuleId": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$",
      "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}",
      "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
      "files": [
        {
          "bucket": "protected",
          "regex": "^.*\\.hdf$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf"
        },
        {
          "bucket": "private",
          "regex": "^.*\\.hdf\\.met$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.met"
        },
        {
          "bucket": "protected",
          "regex": "^.*\\.cmr\\.xml$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.cmr.xml"
        },
        {
          "bucket": "public",
          "regex": "^.*\\.jpg$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_ndvi.jpg"
        }
      ]
    }

    In this case, the path to all the files would be: MOD09GQ___006/<filename> in their respective buckets.

    The url_path can be overridden directly in the file configuration. The example below produces the same result.

    {
      "name": "MOD09GQ",
      "version": "006",
      "granuleId": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$",
      "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
      "files": [
        {
          "bucket": "protected",
          "regex": "^.*\\.hdf$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
          "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
        },
        {
          "bucket": "private",
          "regex": "^.*\\.hdf\\.met$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.met",
          "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
        },
        {
          "bucket": "protected-2",
          "regex": "^.*\\.cmr\\.xml$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.cmr.xml",
          "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
        },
        {
          "bucket": "public",
          "regex": "^.*\\.jpg$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_ndvi.jpg",
          "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
        }
      ]
    }
    - + \ No newline at end of file diff --git a/docs/v10.1.0/configuration/data-management-types/index.html b/docs/v10.1.0/configuration/data-management-types/index.html index 60d79d1658f..21035db1a51 100644 --- a/docs/v10.1.0/configuration/data-management-types/index.html +++ b/docs/v10.1.0/configuration/data-management-types/index.html @@ -5,13 +5,13 @@ Cumulus Data Management Types | Cumulus Documentation - +
    Version: v10.1.0

    Cumulus Data Management Types

    What Are The Cumulus Data Management Types

    • Collections: Collections are logical sets of data objects of the same data type and version. They provide contextual information used by Cumulus ingest.
    • Granules: Granules are the smallest aggregation of data that can be independently managed. They are always associated with a collection, which is a grouping of granules.
    • Providers: Providers generate and distribute input data that Cumulus obtains and sends to workflows.
    • Rules: Rules tell Cumulus how to associate providers and collections and when/how to start processing a workflow.
    • Workflows: Workflows are composed of one or more AWS Lambda Functions and ECS Activities to discover, ingest, process, manage, and archive data.
    • Executions: Executions are records of a workflow.
    • Reconciliation Reports: Reports are a comparison of data sets to check whether they are in agreement, helping Cumulus users detect conflicts.

    Interaction

    • Providers tell Cumulus where to get new data - e.g. S3, HTTPS
    • Collections tell Cumulus where to store the data files
    • Rules tell Cumulus when to trigger a workflow execution and tie providers and collections together

    Managing Data Management Types

    The following are created via the dashboard or API:

    • Providers
    • Collections
    • Rules
    • Reconciliation reports

    Granules are created by workflow executions and then can be managed via the dashboard or API.

    An execution record is created for each triggered workflow execution; it can be viewed in the dashboard, and its data can be retrieved via the API.

    Workflows are created and managed via the Cumulus deployment.

    Configuration Fields

    Schemas

    Looking at our API schema definitions can provide us with some insight into collections, providers, rules, and their attributes (and whether those are required or not). The schemas for the different concepts will be referenced throughout this document.

    The schemas are extremely useful for understanding which attributes are configurable and which of those are required. Cumulus uses these schemas for validation.

    Providers

    Please note:

    • While connection configuration is defined here, things that are specific to a particular ingest setup (e.g. 'What target directory should we be pulling from?' or 'How is duplicate handling configured?') are generally defined in a Rule or Collection, not the Provider.
    • There is some provider behavior which is controlled by task-specific configuration and not the provider definition. This configuration has to be set on a per-workflow basis. For example, see the httpListTimeout configuration on the discover-granules task.

    Provider Configuration

    The Provider configuration is defined by a JSON object that takes different configuration keys depending on the provider type. The following are definitions of typical configuration values relevant for the various providers:

    Configuration by provider type
    S3
    Key | Type | Required | Description
    id | string | Yes | Unique identifier for the provider
    globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited
    protocol | string | Yes | The protocol for this provider. Must be s3 for this provider type.
    host | string | Yes | S3 Bucket to pull data from

    http

    Key | Type | Required | Description
    id | string | Yes | Unique identifier for the provider
    globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited
    protocol | string | Yes | The protocol for this provider. Must be http for this provider type
    host | string | Yes | The host to pull data from (e.g. nasa.gov)
    username | string | No | Configured username for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication
    password | string | Only if username is specified | Configured password for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication
    port | integer | No | Port to connect to the provider on. Defaults to 80
    allowedRedirects | string[] | No | Only hosts in this list will have the provider username/password forwarded for authentication. Entries should be specified as host.com or host.com:7000 if redirect port is different than the provider port.
    certificateUri | string | No | SSL Certificate S3 URI for custom or self-signed SSL (TLS) certificate

    https

    Key | Type | Required | Description
    id | string | Yes | Unique identifier for the provider
    globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited
    protocol | string | Yes | The protocol for this provider. Must be https for this provider type
    host | string | Yes | The host to pull data from (e.g. nasa.gov)
    username | string | No | Configured username for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication
    password | string | Only if username is specified | Configured password for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication
    port | integer | No | Port to connect to the provider on. Defaults to 443
    allowedRedirects | string[] | No | Only hosts in this list will have the provider username/password forwarded for authentication. Entries should be specified as host.com or host.com:7000 if redirect port is different than the provider port.
    certificateUri | string | No | SSL Certificate S3 URI for custom or self-signed SSL (TLS) certificate

    ftp

    Key | Type | Required | Description
    id | string | Yes | Unique identifier for the provider
    globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited
    protocol | string | Yes | The protocol for this provider. Must be ftp for this provider type
    host | string | Yes | The ftp host to pull data from (e.g. nasa.gov)
    username | string | No | Username to use to connect to the ftp server. Cumulus encrypts this using KMS. Defaults to anonymous if not defined
    password | string | No | Password to use to connect to the ftp server. Cumulus encrypts this using KMS. Defaults to password if not defined
    port | integer | No | Port to connect to the provider on. Defaults to 21

    sftp

    Key | Type | Required | Description
    id | string | Yes | Unique identifier for the provider
    globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited
    protocol | string | Yes | The protocol for this provider. Must be sftp for this provider type
    host | string | Yes | The sftp host to pull data from (e.g. nasa.gov)
    username | string | No | Username to use to connect to the sftp server.
    password | string | No | Password to use to connect to the sftp server.
    port | integer | No | Port to connect to the provider on. Defaults to 22
    privateKey | string | No | filename assumed to be in s3://bucketInternal/stackName/crypto
    cmKeyId | string | No | AWS KMS Customer Master Key arn or alias

    Collections

    Breakdown of [s3_MOD09GQ_006.json](https://github.com/nasa/cumulus/blob/master/example/data/collections/s3_MOD09GQ_006/s3_MOD09GQ_006.json)

    Key | Value | Required | Description
    name | "MOD09GQ" | Yes | The name attribute designates the name of the collection. This is the name under which the collection will be displayed on the dashboard
    version | "006" | Yes | A version tag for the collection
    granuleId | "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$" | Yes | The regular expression used to validate the granule ID extracted from filenames according to the granuleIdExtraction
    granuleIdExtraction | "(MOD09GQ\..*)(\.hdf|\.cmr|_ndvi\.jpg)" | Yes | The regular expression used to extract the granule ID from filenames. The first capturing group extracted from the filename by the regex will be used as the granule ID.
    sampleFileName | "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf" | Yes | An example filename belonging to this collection
    files | <JSON Object> of files defined here | Yes | Describe the individual files that will exist for each granule in this collection (size, browse, meta, etc.)
    dataType | "MOD09GQ" | No | Can be specified, but this value will default to the collection_name if not
    duplicateHandling | "replace" | No | ("replace"|"version"|"skip") determines granule duplicate handling scheme
    ignoreFilesConfigForDiscovery | false (default) | No | By default, during discovery only files that match one of the regular expressions in this collection's files attribute (see above) are ingested. Setting this to true will ignore the files attribute during discovery, meaning that all files for a granule (i.e., all files with filenames matching granuleIdExtraction) will be ingested even when they don't match a regular expression in the files attribute at discovery time. (NOTE: this attribute does not appear in the example file, but is listed here for completeness.)
    process | "modis" | No | Example options for this are found in the ChooseProcess step definition in the IngestAndPublish workflow definition
    meta | <JSON Object> of MetaData for the collection | No | MetaData for the collection. This metadata will be available to workflows for this collection via the Cumulus Message Adapter.
    url_path | "{cmrMetadata.Granule.Collection.ShortName}/{substring(file.fileName, 0, 3)}" | No | Filename without extension

    files-object

    Key | Value | Required | Description
    regex | "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\.hdf$" | Yes | Regular expression used to identify the file
    sampleFileName | "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf" | Yes | Filename used to validate the provided regex
    type | "data" | No | Value to be assigned to the Granule File Type. CNM types are used by Cumulus CMR steps, non-CNM values will be treated as 'data' type. Currently only utilized in DiscoverGranules task
    bucket | "internal" | Yes | Name of the bucket where the file will be stored
    url_path | "${collectionShortName}/{substring(file.fileName, 0, 3)}" | No | Folder used to save the granule in the bucket. Defaults to the collection url_path
    checksumFor | "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\.hdf$" | No | If this is a checksum file, set checksumFor to the regex of the target file.

    Rules

    Rules are used by Cumulus to start processing workflows and the transformation process. Rules can be invoked manually, based on a schedule, or can be configured to be triggered by events in Kinesis, SNS messages, or SQS messages.

    Rule configuration
    Key | Value | Required | Description
    name | "L2_HR_PIXC_kinesisRule" | Yes | Name of the rule. This is the name under which the rule will be listed on the dashboard
    workflow | "CNMExampleWorkflow" | Yes | Name of the workflow to be run. A list of available workflows can be found on the Workflows page
    provider | "PODAAC_SWOT" | No | Configured provider's ID. This can be found on the Providers dashboard page
    collection | <JSON Object> collection object shown below | Yes | Name and version of the collection this rule will moderate. Relates to a collection configured and found in the Collections page
    payload | <JSON Object or Array> | No | The payload to be passed to the workflow
    meta | <JSON Object> of MetaData for the rule | No | MetaData for the rule. This metadata will be available to workflows for this rule via the Cumulus Message Adapter.
    rule | <JSON Object> rule type and associated values - discussed below | Yes | Object defining the type and subsequent attributes of the rule
    state | "ENABLED" | No | ("ENABLED"|"DISABLED") whether or not the rule will be active. Defaults to "ENABLED".
    queueUrl | https://sqs.us-east-1.amazonaws.com/1234567890/queue-name | No | URL for SQS queue that will be used to schedule workflows for this rule
    tags | ["kinesis", "podaac"] | No | An array of strings that can be used to simplify search

    collection-object

    Key | Value | Required | Description
    name | "L2_HR_PIXC" | Yes | Name of a collection defined/configured in the Collections dashboard page
    version | "000" | Yes | Version number of a collection defined/configured in the Collections dashboard page

    meta-object

    Key | Value | Required | Description
    retries | 3 | No | Number of retries on errors, for sqs-type rule only. Defaults to 3.
    visibilityTimeout | 900 | No | VisibilityTimeout in seconds for the inflight messages, for sqs-type rule only. Defaults to the visibility timeout of the SQS queue when the rule is created.

    rule-object

    Key | Value | Required | Description
    type | "kinesis" | Yes | ("onetime"|"scheduled"|"kinesis"|"sns"|"sqs") type of scheduling/workflow kick-off desired
    value | <String> Object | Depends | Discussion of valid values is below

    rule-value

    The rule value entry depends on the type of rule:

    • If this is a onetime rule this can be left blank. Example
    • If this is a scheduled rule this field must hold a valid cron-type expression or rate expression.
    • If this is a kinesis rule, this must be a configured ${Kinesis_stream_ARN}. Example
    • If this is an sns rule, this must be an existing ${SNS_Topic_Arn}. Example
    • If this is an sqs rule, this must be an existing ${SQS_QueueUrl} that your account has permissions to access, and also you must configure a dead-letter queue for this SQS queue. Example
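    As a sanity check for the sqs case, you can verify that the queue already has a dead-letter queue configured before creating the rule (the queue URL below is a placeholder); the RedrivePolicy attribute should be present in the output:

    aws sqs get-queue-attributes \
      --queue-url https://sqs.us-east-1.amazonaws.com/1234567890/queue-name \
      --attribute-names RedrivePolicy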

    sqs-type rule features

    • When an SQS rule is triggered, the SQS message remains on the queue.
    • The SQS message is not processed multiple times in parallel when visibility timeout is properly set. You should set the visibility timeout to the maximum expected length of the workflow with padding. Longer is better to avoid parallel processing.
    • The SQS message visibility timeout can be overridden by the rule.
    • Upon successful workflow execution, the SQS message is removed from the queue.
    • Upon failed execution(s), the workflow is run 3 times (or the configured number of times).
    • Upon failed execution(s), the visibility timeout will be set to 5s to allow retries.
    • After configured number of failed retries, the SQS message is moved to the dead-letter queue configured for the SQS queue.

    Configuration Via Cumulus Dashboard

    Create A Provider

    • In the Cumulus dashboard, go to the Provider page.

    Screenshot of Create Provider form

    • Click on Add Provider.
    • Fill in the form and then submit it.

    Screenshot of Create Provider form

    Create A Collection

    • Go to the Collections page.

    Screenshot of the Collections page

    • Click on Add Collection.
    • Copy and paste or fill in the collection JSON object form.

    Screenshot of Add Collection form

    • Once you submit the form, you should be able to verify that your new collection is in the list.

    Create A Rule

    1. Go To Rules Page
    • Go to the Cumulus dashboard, click on Rules in the navigation.
    • Click Add Rule.

    Screenshot of Rules page

    2. Complete Form
    • Fill out the template form.

    Screenshot of a Rules template for adding a new rule

    For more details regarding the field definitions and required information go to Data Cookbooks.

    Note: If the state field is left blank, it defaults to false.

    Rule Examples

    • A rule form with completed required fields:

    Screenshot of a completed rule form

    • A successfully added Rule:

    Screenshot of created rule

    - + \ No newline at end of file diff --git a/docs/v10.1.0/configuration/lifecycle-policies/index.html b/docs/v10.1.0/configuration/lifecycle-policies/index.html index 8d41c7f0207..30f07f26bac 100644 --- a/docs/v10.1.0/configuration/lifecycle-policies/index.html +++ b/docs/v10.1.0/configuration/lifecycle-policies/index.html @@ -5,13 +5,13 @@ Setting S3 Lifecycle Policies | Cumulus Documentation - +
    Version: v10.1.0

    Setting S3 Lifecycle Policies

    This document will outline, in brief, how to set data lifecycle policies so that you are more easily able to control data storage costs while keeping your data accessible. For more information on why you might want to do this, see the 'Additional Information' section at the end of the document.

    Requirements

    • The AWS CLI installed and configured (if you wish to run the CLI example). See AWS's guide to setting up the AWS CLI for more on this. Please ensure the AWS CLI is in your shell path.
    • You will need an S3 bucket on AWS. You are strongly encouraged to use a bucket without voluminous amounts of data in it for experimenting/learning.
    • An AWS user with the appropriate roles to access the target bucket as well as modify bucket policies.

    Examples

    Walk-through on setting time-based S3 Infrequent Access (S3IA) bucket policy

    This example will give step-by-step instructions on updating a bucket's lifecycle policy to move all objects in the bucket from the default storage to S3 Infrequent Access (S3IA) after a period of 90 days. Below are instructions for walking through configuration via the command line and the management console.

    Command Line

    Please ensure you have the AWS CLI installed and configured for access prior to attempting this example.

    Create policy

    From any directory you choose, open an editor and add the following to a file named exampleRule.json:

    {
      "Rules": [
        {
          "Status": "Enabled",
          "Filter": {
            "Prefix": ""
          },
          "Transitions": [
            {
              "Days": 90,
              "StorageClass": "STANDARD_IA"
            }
          ],
          "NoncurrentVersionTransitions": [
            {
              "NoncurrentDays": 90,
              "StorageClass": "STANDARD_IA"
            }
          ],
          "ID": "90DayS3IAExample"
        }
      ]
    }

    Set policy

    On the command line run the following command (with the bucket you're working with substituted in place of yourBucketNameHere).

    aws s3api put-bucket-lifecycle-configuration --bucket yourBucketNameHere --lifecycle-configuration file://exampleRule.json

    Verify policy has been set

    To obtain all of the existing policies for a bucket, run the following command (again substituting the correct bucket name):

     $ aws s3api get-bucket-lifecycle-configuration --bucket yourBucketNameHere
    {
      "Rules": [
        {
          "Status": "Enabled",
          "Filter": {
            "Prefix": ""
          },
          "Transitions": [
            {
              "Days": 90,
              "StorageClass": "STANDARD_IA"
            }
          ],
          "NoncurrentVersionTransitions": [
            {
              "NoncurrentDays": 90,
              "StorageClass": "STANDARD_IA"
            }
          ],
          "ID": "90DayS3IAExample"
        }
      ]
    }

    You have set a policy that transitions any version of an object in the bucket to S3IA after each object version has not been modified for 90 days.

    Management Console

    Create Policy

    To create the example policy on a bucket via the management console, go to the following URL (replacing 'yourBucketHere' with the bucket you intend to update):

    https://s3.console.aws.amazon.com/s3/buckets/yourBucketHere/?tab=overview

    You should see a screen similar to:

    Screenshot of AWS console for an S3 bucket

    Click the "Management" Tab, then lifecycle button and press + Add lifecycle rule:

    Screenshot of &quot;Management&quot; tab of AWS console for an S3 bucket

    Give the rule a name (e.g. '90DayRule'), leaving the filter blank:

    Screenshot of window for configuring the name and scope of a lifecycle rule on an S3 bucket in the AWS console

    Click next, and mark Current Version and Previous Versions.

    Then for each, click + Add transition and select Transition to Standard-IA after for the Object creation field, and set 90 for the Days after creation/Days after objects become noncurrent field. Your screen should look similar to:

    Screenshot of window for configuring the storage class transitions of a lifecycle rule on an S3 bucket in the AWS console

    Click next, then next past the Configure expiration screen (we won't be setting this), and on the fourth page, click Save:

    Screenshot of window for reviewing the configuration of a lifecycle rule on an S3 bucket in the AWS console

    You should now see you have a rule configured for your bucket:

    Screenshot of lifecycle rule appearing in the "Management" tab of AWS console for an S3 bucket

    You have now set a policy that transitions any version of an object in the bucket to S3IA after each object has not been modified for 90 days.

    Additional Information

    This section lists information you may want prior to enacting lifecycle policies. It is not required content for working through the examples.

    Strategy Overview

    For a discussion of overall recommended strategy, please review the Methodology for Data Lifecycle Management on the EarthData wiki.

    AWS Documentation

    The examples shown in this document are fairly basic cases. By using object tags, filters, and other configuration options, you can enact far more complicated policies for various scenarios. For more reading on the topics presented on this page, see:

    - + \ No newline at end of file diff --git a/docs/v10.1.0/configuration/monitoring-readme/index.html b/docs/v10.1.0/configuration/monitoring-readme/index.html index 56d5d993522..0c5efb5def8 100644 --- a/docs/v10.1.0/configuration/monitoring-readme/index.html +++ b/docs/v10.1.0/configuration/monitoring-readme/index.html @@ -5,14 +5,14 @@ Monitoring Best Practices | Cumulus Documentation - +
    Version: v10.1.0

    Monitoring Best Practices

    This document intends to provide a set of recommendations and best practices for monitoring the state of a deployed Cumulus and diagnosing any issues.

    Cumulus-provided resources and integrations for monitoring

    Cumulus provides a number of resources that are useful for monitoring the system and its operation.

    Cumulus Dashboard

    The primary tool for monitoring the Cumulus system is the Cumulus Dashboard. The dashboard is hosted on Github and includes instructions on how to deploy and link it into your core Cumulus deployment.

    The dashboard displays workflow executions, their status, inputs, outputs, and some diagnostic information such as logs. For further information on the dashboard, its usage, and the information it provides, see the documentation.

    Cumulus-provided AWS resources

    Cumulus sets up CloudWatch log groups for all Core-provided tasks.

    Monitoring Lambda Functions

    Logging for each Lambda Function is available in Lambda-specific CloudWatch log groups.

    Monitoring ECS services

    Each deployed cumulus_ecs_service module also includes a CloudWatch log group for the processes running on ECS.

    Monitoring workflows

    For advanced debugging, we also configure dead letter queues on critical system functions. These will allow you to monitor and debug invalid inputs to the functions we use to start workflows, which can be helpful if you find that you are not seeing workflows being started as expected. More information on these can be found in the dead letter queue documentation.

    AWS recommendations

    AWS has a number of recommendations on system monitoring. Rather than reproduce those here and risk providing outdated guidance, we've documented the following links which will take you to available AWS docs on monitoring recommendations and best practices for the services used in Cumulus:

    Example: Setting up email notifications for CloudWatch logs

    Cumulus does not provide out-of-the-box support for email notifications at this time. However, setting up email notifications on AWS is fairly straightforward in that the operative components are an AWS SNS topic and a subscribed email address.
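    For example, the topic and email subscription could be created with the AWS CLI (the topic name and address are placeholders; the recipient must confirm the subscription via the email AWS sends):

    # Create the topic and capture its ARN for later steps
    TOPIC_ARN=$(aws sns create-topic --name cumulus-error-email-reports --query TopicArn --output text)
    # Subscribe an email address to the topic
    aws sns subscribe \
      --topic-arn "$TOPIC_ARN" \
      --protocol email \
      --notification-endpoint ops-team@example.com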

    In terms of Cumulus integration, forwarding CloudWatch logs requires creating a mechanism, most likely a Lambda Function subscribed to the log group that will receive, filter and forward these messages to the SNS topic.

    As a very simple example, we could create a function that filters CloudWatch logs created by the @cumulus/logger package and sends email notifications for error and fatal log levels, adapting the example linked above:

    const zlib = require('zlib');
    const aws = require('aws-sdk');
    const { promisify } = require('util');

    const gunzip = promisify(zlib.gunzip);
    const sns = new aws.SNS();

    exports.handler = async (event) => {
      const payload = Buffer.from(event.awslogs.data, 'base64');
      const decompressedData = await gunzip(payload);
      const logData = JSON.parse(decompressedData.toString('ascii'));
      return await Promise.all(logData.logEvents.map(async (logEvent) => {
        const logMessage = JSON.parse(logEvent.message);
        if (['error', 'fatal'].includes(logMessage.level)) {
          return sns.publish({
            TopicArn: process.env.EmailReportingTopicArn,
            Message: logEvent.message
          }).promise();
        }
        return Promise.resolve();
      }));
    };

    After creating the SNS topic, we can deploy this code as a Lambda function, following the setup steps from Amazon. Make sure to include your SNS topic ARN as an environment variable on the Lambda function by using the --environment option on aws lambda create-function.
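    A minimal sketch of that deployment via the CLI might look like the following (the function name, runtime, role ARN, and zip file are placeholders for your own values, and $TOPIC_ARN reuses the variable from the earlier sketch or can be replaced with your topic's ARN):

    # The execution role must allow sns:Publish to the topic
    aws lambda create-function \
      --function-name cumulus-log-email-forwarder \
      --runtime nodejs14.x \
      --handler index.handler \
      --zip-file fileb://function.zip \
      --role arn:aws:iam::123456789012:role/your-lambda-execution-role \
      --environment "Variables={EmailReportingTopicArn=$TOPIC_ARN}"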

    You will need to create subscription filters for each log group you want to receive emails for. We recommend automating this as much as possible, and you could very well handle this via Terraform, such as using a module to deploy filters alongside log groups, or exporting the log group names to an all-in-one email notification module.
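    If you do want to wire up a single log group by hand first, a sketch of the CLI steps might look like this (names and ARNs are placeholders; repeat the subscription filter per log group, or automate it via Terraform as suggested above):

    # Allow CloudWatch Logs to invoke the function
    aws lambda add-permission \
      --function-name cumulus-log-email-forwarder \
      --statement-id cloudwatch-logs-invoke \
      --principal logs.us-east-1.amazonaws.com \
      --action lambda:InvokeFunction
    # Forward all events from one log group to the function (empty pattern = all events)
    aws logs put-subscription-filter \
      --log-group-name /aws/lambda/KinesisInboundLogger \
      --filter-name error-email-forwarding \
      --filter-pattern "" \
      --destination-arn arn:aws:lambda:us-east-1:123456789012:function:cumulus-log-email-forwarder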

    - + \ No newline at end of file diff --git a/docs/v10.1.0/configuration/server_access_logging/index.html b/docs/v10.1.0/configuration/server_access_logging/index.html index 696a8eb6351..8589646bd60 100644 --- a/docs/v10.1.0/configuration/server_access_logging/index.html +++ b/docs/v10.1.0/configuration/server_access_logging/index.html @@ -5,13 +5,13 @@ S3 Server Access Logging | Cumulus Documentation - +
    Version: v10.1.0

    S3 Server Access Logging

    Via AWS Console

    Enable server access logging for an S3 bucket

    Via AWS Command Line Interface

    1. Create a logging.json file with these contents, replacing <stack-internal-bucket> with your stack's internal bucket name, and <stack> with the name of your cumulus stack.

      {
        "LoggingEnabled": {
          "TargetBucket": "<stack-internal-bucket>",
          "TargetPrefix": "<stack>/ems-distribution/s3-server-access-logs/"
        }
      }
    2. Add the logging policy to each of your protected and public buckets by calling this command on each bucket.

      aws s3api put-bucket-logging --bucket <protected/public-bucket-name> --bucket-logging-status file://logging.json
    3. Verify the logging policy exists on your buckets.

      aws s3api get-bucket-logging --bucket <protected/public-bucket-name>
    - + \ No newline at end of file diff --git a/docs/v10.1.0/configuration/task-configuration/index.html b/docs/v10.1.0/configuration/task-configuration/index.html index 4662a168f12..39770b1cf4d 100644 --- a/docs/v10.1.0/configuration/task-configuration/index.html +++ b/docs/v10.1.0/configuration/task-configuration/index.html @@ -5,13 +5,13 @@ Configuration of Tasks | Cumulus Documentation - +
    Version: v10.1.0

    Configuration of Tasks

    The cumulus module exposes values for configuration for some of the provided archive and ingest tasks. Currently the following are available as configurable variables:

    cmr_search_client_config

    Configuration parameters for CMR search client for cumulus archive module tasks in the form:

    <lambda_identifier>_report_cmr_limit = <maximum number of records that can be returned from a cmr-client search; this should be greater than cmr_page_size>
    <lambda_identifier>_report_cmr_page_size = <number of records for each page returned from CMR>
    type = map(string)

    More information about cmr limit and cmr page_size can be found in the @cumulus/cmr-client documentation and the CMR Search API documentation.

    Currently the following values are supported:

    • create_reconciliation_report_cmr_limit
    • create_reconciliation_report_cmr_page_size

    Example

    cmr_search_client_config = {
    create_reconciliation_report_cmr_limit = 2500
    create_reconciliation_report_cmr_page_size = 250
    }

    elasticsearch_client_config

    Configuration parameters for Elasticsearch client for cumulus archive module tasks in the form:

    <lambda_identifier>_es_scroll_duration = <duration>
    <lambda_identifier>_es_scroll_size = <size>
    type = map(string)

    Currently the following values are supported:

    • create_reconciliation_report_es_scroll_duration
    • create_reconciliation_report_es_scroll_size

    Example

    elasticsearch_client_config = {
    create_reconciliation_report_es_scroll_duration = "15m"
    create_reconciliation_report_es_scroll_size = 2000
    }

    lambda_timeouts

    A configurable map of timeouts (in seconds) for cumulus ingest module task lambdas in the form:

    <lambda_identifier>_timeout: <timeout>
    type = map(string)

    Currently the following values are supported:

    • discover_granules_task_timeout
    • discover_pdrs_task_timeout
    • hyrax_metadata_update_tasks_timeout
    • lzards_backup_task_timeout
    • move_granules_task_timeout
    • parse_pdr_task_timeout
    • pdr_status_check_task_timeout
    • post_to_cmr_task_timeout
    • queue_granules_task_timeout
    • queue_pdrs_task_timeout
    • queue_workflow_task_timeout
    • sync_granule_task_timeout
    • update_granules_cmr_metadata_file_links_task_timeout

    Example

    lambda_timeouts = {
    discover_granules_task_timeout = 300
    }
    - + \ No newline at end of file diff --git a/docs/v10.1.0/data-cookbooks/about-cookbooks/index.html b/docs/v10.1.0/data-cookbooks/about-cookbooks/index.html index 779eb2451a8..8deb1c5750f 100644 --- a/docs/v10.1.0/data-cookbooks/about-cookbooks/index.html +++ b/docs/v10.1.0/data-cookbooks/about-cookbooks/index.html @@ -5,13 +5,13 @@ About Cookbooks | Cumulus Documentation - +
    Version: v10.1.0

    About Cookbooks

    Introduction

    The following data cookbooks are documents containing examples and explanations of workflows in the Cumulus framework. Additionally, they should serve to help unify an institution/user group on a set of terms.

    Setup

    The data cookbooks assume you can configure providers, collections, and rules to run workflows. Visit Cumulus data management types for information on how to configure Cumulus data management types.

    Adding a page

    As shown in detail in the "Add a New Page and Sidebars" section in Cumulus Docs: How To's, you can add a new page to the data cookbook by creating a markdown (.md) file in the docs/data-cookbooks directory. The new page can then be linked to the sidebar by adding it to the Data-Cookbooks object in the website/sidebar.json file as data-cookbooks/${id}.

    More about workflows

    Workflow general information

    Input & Output

    Developing Workflow Tasks

    Workflow Configuration How-to's

    - + \ No newline at end of file diff --git a/docs/v10.1.0/data-cookbooks/browse-generation/index.html b/docs/v10.1.0/data-cookbooks/browse-generation/index.html index 7250ecfdabe..e6e624dd254 100644 --- a/docs/v10.1.0/data-cookbooks/browse-generation/index.html +++ b/docs/v10.1.0/data-cookbooks/browse-generation/index.html @@ -5,7 +5,7 @@ Ingest Browse Generation | Cumulus Documentation - + @@ -15,7 +15,7 @@ provider keys with the previously entered values) Note that you need to set the "provider_path" to the path on your bucket (e.g. "/data") that you've staged your mock/test data.:

    {
      "name": "TestBrowseGeneration",
      "workflow": "DiscoverGranulesBrowseExample",
      "provider": "{{provider_from_previous_step}}",
      "collection": {
        "name": "MOD09GQ",
        "version": "006"
      },
      "meta": {
        "provider_path": "{{path_to_data}}"
      },
      "rule": {
        "type": "onetime"
      },
      "state": "ENABLED",
      "updatedAt": 1553053438767
    }

    Run Workflows

    Once you've configured the Collection and Provider and added a onetime rule, you're ready to trigger your rule, and watch the ingest workflows process.

    Go to the Rules tab, click the rule you just created:

    Screenshot of the Rules overview page with a list of rules in the Cumulus dashboard

    Then click the gear in the upper right corner and click "Rerun":

    Screenshot of clicking the button to rerun a workflow rule from the rule edit page in the Cumulus dashboard

    Tab over to executions and you should see the DiscoverGranulesBrowseExample workflow run, succeed, and then moments later the CookbookBrowseExample should run and succeed.

    Screenshot of page listing executions in the Cumulus dashboard

    Results

    You can verify your data has ingested by clicking the successful workflow entry:

    Screenshot of individual entry from table listing executions in the Cumulus dashboard

    Select "Show Output" on the next page

    Screenshot of &quot;Show output&quot; button from individual execution page in the Cumulus dashboard

    and you should see in the payload from the workflow something similar to:

    "payload": {
    "process": "modis",
    "granules": [
    {
    "files": [
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "key": "MOD09GQ___006/2017/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "type": "data",
    "bucket": "cumulus-test-sandbox-protected",
    "path": "data",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}/{substring(file.fileName, 0, 3)}",
    "size": 1908635
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "key": "MOD09GQ___006/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "type": "metadata",
    "bucket": "cumulus-test-sandbox-private",
    "path": "data",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{substring(file.fileName, 0, 3)}",
    "size": 21708
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
    "key": "MOD09GQ___006/2017/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
    "type": "browse",
    "bucket": "cumulus-test-sandbox-protected",
    "path": "data",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}/{substring(file.fileName, 0, 3)}",
    "size": 1908635
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.cmr.xml",
    "key": "MOD09GQ___006/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.cmr.xml",
    "type": "metadata",
    "bucket": "cumulus-test-sandbox-protected-2",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{substring(file.fileName, 0, 3)}"
    }
    ],
    "cmrLink": "https://cmr.uat.earthdata.nasa.gov/search/granules.json?concept_id=G1222231611-CUMULUS",
    "cmrConceptId": "G1222231611-CUMULUS",
    "granuleId": "MOD09GQ.A2016358.h13v04.006.2016360104606",
    "cmrMetadataFormat": "echo10",
    "dataType": "MOD09GQ",
    "version": "006",
    "published": true
    }
    ]
    }

    You can verify the granules exist within your Cumulus instance (search using the Granules interface, check the S3 buckets, etc.) and validate the CMR entry shown above.
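    For example, using the example buckets and url_path from the payload above, a quick S3 check might look like:

    # Expect to see the .hdf and .jpg files listed for the ingested granule
    aws s3 ls s3://cumulus-test-sandbox-protected/MOD09GQ___006/2017/MOD/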


    Build Processing Lambda

    This section discusses the construction of a custom processing lambda to replace the contrived example from this entry for a real dataset processing task.

    To ingest your own data using this example, you will need to construct your own lambda to replace the source in ProcessingStep that will generate browse imagery and provide or update a CMR metadata export file.

    You will then need to add the lambda to your Cumulus deployment as an aws_lambda_function Terraform resource.

    The discussion below outlines requirements for this lambda.

    Inputs

    The incoming message to the task defined in the ProcessingStep as configured will have the following configuration values (accessible inside event.config courtesy of the message adapter):

    Configuration

    • event.config.bucket -- the name of the bucket configured in terraform.tfvars as your internal bucket.

    • event.config.collection -- The full collection object we will configure in the Configure Ingest section. You can view the expected collection schema in the docs here or in the source code on github. You need this as available input and output so you can update as needed.

    event.config.additionalUrls, generateFakeBrowse and event.config.cmrMetadataFormat from the example can be ignored as they're configuration flags for the provided example script.

    Payload

    The 'payload' from the previous task is accessible via event.input. The expected payload output schema from SyncGranules can be viewed here.

    In our example, the payload would look like the following. Note: The types are set per-file based on what we configured in our collection, and were initially added as part of the DiscoverGranules step in the DiscoverGranulesBrowseExample workflow.

     "payload": {
    "process": "modis",
    "granules": [
    {
    "granuleId": "MOD09GQ.A2016358.h13v04.006.2016360104606",
    "dataType": "MOD09GQ",
    "version": "006",
    "files": [
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "bucket": "cumulus-test-sandbox-internal",
    "key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "size": 1908635
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "bucket": "cumulus-test-sandbox-internal",
    "key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "size": 21708
    }
    ]
    }
    ]
    }

    Generating Browse Imagery

    The provided example script goes through all granules and adds a 'fake' .jpg browse file to the same staging location as the data staged by prior ingest tasks.

    The processing lambda you construct will need to do the following:

    • Create a browse image file based on the input data, and stage it to a location accessible to both this task and the FilesToGranules and MoveGranules tasks in a S3 bucket.
    • Add the browse file to the input granule files, making sure to set the granule file's type to browse.
    • Update meta.input_granules with the updated granules list, as well as provide the files to be integrated by FilesToGranules as output from the task.

    Generating/updating CMR metadata

    If you do not already have a CMR file in the granules list, you will need to generate one for valid export. This example's processing script generates and adds it to the FilesToGranules file list via the payload but it can be present in the InputGranules from the DiscoverGranules task as well if you'd prefer to pre-generate it.

    The downstream tasks MoveGranules, UpdateGranulesCmrMetadataFileLinks, and PostToCmr all expect a valid CMR file to be available if you want to export to CMR.

    Expected Outputs for processing task/tasks

    In the above example, the critical portion of the output to FilesToGranules is the payload and meta.input_granules.

    In the example provided, the processing task is set up to return an object with the keys "files" and "granules". In the cumulus_message configuration, the outputs are mapped so that files goes to the payload and granules to meta.input_granules:

              "task_config": {
    "inputGranules": "{$.meta.input_granules}",
    "granuleIdExtraction": "{$.meta.collection.granuleIdExtraction}"
    }

    Their expected values from the example above may be useful in constructing a processing task:

    payload

    The payload includes a full list of files to be 'moved' into the Cumulus archive. The FilesToGranules task will take this list, merge it with the information from InputGranules, then pass that list to the MoveGranules task. The MoveGranules task will then move the files to their targets. The UpdateGranulesCmrMetadataFileLinks task will update the CMR metadata file, if it exists, with the updated granule locations and update the CMR file etags.

    In the provided example, a payload being passed to the FilesToGranules task should be expected to look like:

      "payload": [
    "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
    "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.cmr.xml"
    ]

    This is the list of granule files that FilesToGranules will act upon to add/merge with the input_granules object.

    The pathing is generated from sync-granules, but in principle the files can be staged wherever you like so long as the processing/MoveGranules task's roles have access and the filename matches the collection configuration.

    input_granules

    The FilesToGranules task utilizes the incoming payload to choose which files to move, but pulls all other metadata from meta.input_granules. As such, meta.input_granules in the example would look like:

    "input_granules": [
    {
    "granuleId": "MOD09GQ.A2016358.h13v04.006.2016360104606",
    "dataType": "MOD09GQ",
    "version": "006",
    "files": [
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "bucket": "cumulus-test-sandbox-internal",
    "key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "size": 1908635
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "bucket": "cumulus-test-sandbox-internal",
    "key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "size": 21708
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
    "bucket": "cumulus-test-sandbox-internal",
    "key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.jpg"
    }
    ]
    }
    ],
    - + \ No newline at end of file diff --git a/docs/v10.1.0/data-cookbooks/choice-states/index.html b/docs/v10.1.0/data-cookbooks/choice-states/index.html index fb665fe5de8..709fae60ef8 100644 --- a/docs/v10.1.0/data-cookbooks/choice-states/index.html +++ b/docs/v10.1.0/data-cookbooks/choice-states/index.html @@ -5,13 +5,13 @@ Choice States | Cumulus Documentation - +
    Version: v10.1.0

    Choice States

    Cumulus supports AWS Step Function Choice states. A Choice state enables branching logic in Cumulus workflows.

    Choice state definitions include a list of Choice Rules. Each Choice Rule defines a logical operation which compares an input value against a value using a comparison operator. For available comparison operators, review the AWS docs.

    If the comparison evaluates to true, the Next state is followed.

    Example

    In examples/cumulus-tf/parse_pdr_workflow.tf the ParsePdr workflow uses a Choice state, CheckAgainChoice, to terminate the workflow once meta.isPdrFinished: true is returned by the CheckStatus state.

    The CheckAgainChoice state definition requires an input object of the following structure:

    {
      "meta": {
        "isPdrFinished": false
      }
    }

    Given the above input to the CheckAgainChoice state, the workflow would transition to the PdrStatusReport state.

    "CheckAgainChoice": {
    "Type": "Choice",
    "Choices": [
    {
    "Variable": "$.meta.isPdrFinished",
    "BooleanEquals": false,
    "Next": "PdrStatusReport"
    },
    {
    "Variable": "$.meta.isPdrFinished",
    "BooleanEquals": true,
    "Next": "WorkflowSucceeded"
    }
    ],
    "Default": "WorkflowSucceeded"
    }

    Advanced: Loops in Cumulus Workflows

    Understanding the complete ParsePdr workflow is not necessary to understanding how Choice states work, but ParsePdr provides an example of how Choice states can be used to create a loop in a Cumulus workflow.

    In the complete ParsePdr workflow definition, the state QueueGranules is followed by CheckStatus. From CheckStatus a loop starts: while CheckStatus returns meta.isPdrFinished: false, CheckStatus is followed by CheckAgainChoice, then PdrStatusReport, then WaitForSomeTime, which returns to CheckStatus. Once CheckStatus returns meta.isPdrFinished: true, CheckAgainChoice proceeds to WorkflowSucceeded.

    Execution graph of SIPS ParsePdr workflow in AWS Step Functions console

    Further documentation

    For complete details on Choice state configuration options, see the Choice state documentation.

    Version: v10.1.0

    CNM Workflow

This entry documents how to set up a workflow that utilizes the built-in CNM/Kinesis functionality in Cumulus.

    Prior to working through this entry you should be familiar with the Cloud Notification Mechanism.

    Sections


    Prerequisites

    Cumulus

This entry assumes you have a deployed instance of Cumulus (version >= 1.16.0), deployed via the cumulus terraform module sourced from the release page.

    AWS CLI

    This entry assumes you have the AWS CLI installed and configured. If you do not, please take a moment to review the documentation - particularly the examples relevant to Kinesis - and install it now.

    Kinesis

This entry assumes you already have two Kinesis data streams created for use as the CNM notification and response data streams.

If you do not have two streams set up, please take a moment to review the Kinesis documentation and set up two basic single-shard streams for this example:

    Using the "Create Data Stream" button on the Kinesis Dashboard, work through the dialogue.

    You should be able to quickly use the "Create Data Stream" button on the Kinesis Dashboard, and setup streams that are similar to the following example:

    Screenshot of AWS console page for creating a Kinesis stream
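Alternatively, if you prefer the AWS CLI to the console, two basic single-shard streams can be created with commands along these lines (the stream names are placeholders for whatever names you choose):

# Create single-shard notification and response streams for this example
aws kinesis create-stream --stream-name <your-notification-stream-name> --shard-count 1
aws kinesis create-stream --stream-name <your-response-stream-name> --shard-count 1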

    Please bear in mind that your {{prefix}}-lambda-processing IAM role will need permissions to write to the response stream for this workflow to succeed if you create the Kinesis stream with a dashboard user. If you are using the cumulus top-level module for your deployment this should be set properly.

If not, the most straightforward approach is to attach the AmazonKinesisFullAccess policy for the stream resource to whatever role your Lambdas are using; however, your environment/security policies may require an approach specific to your deployment environment.

In operational environments, science data providers would typically be responsible for providing a Kinesis stream with the appropriate permissions.

    For more information on how this process works and how to develop a process that will add records to a stream, read the Kinesis documentation and the developer guide.

    Source Data

    This entry will run the SyncGranule task against a single target data file. To that end it will require a single data file to be present in an S3 bucket matching the Provider configured in the next section.

    Collection and Provider

Cumulus will need to be configured with a Collection and Provider entry of your choosing. The provider should match the location of the source data from the Source Data section.

This can be done via the Cumulus Dashboard, if installed, or via the API. It is strongly recommended to use the dashboard if possible.


    Configure the Workflow

    Provided the prerequisites have been fulfilled, you can begin adding the needed values to your Cumulus configuration to configure the example workflow.

    The following are steps that are required to set up your Cumulus instance to run the example workflow:

    Example CNM Workflow

    In this example, we're going to trigger a workflow by creating a Kinesis rule and sending a record to a Kinesis stream.

    The following workflow definition should be added to a new .tf workflow resource (e.g. cnm_workflow.tf) in your deployment directory. For the complete CNM workflow example, see examples/cumulus-tf/kinesis_trigger_test_workflow.tf.

    Add the following to the new terraform file in your deployment directory, updating the following:

    • Set the response-endpoint key in the CnmResponse task in the workflow JSON to match the name of the Kinesis response stream you configured in the prerequisites section
    • Update the source key to the workflow module to match the Cumulus release associated with your deployment.
    module "cnm_workflow" {
    source = "https://github.com/nasa/cumulus/releases/download/{version}/terraform-aws-cumulus-workflow.zip"

    prefix = var.prefix
    name = "CNMExampleWorkflow"
    workflow_config = module.cumulus.workflow_config
    system_bucket = var.system_bucket

state_machine_definition = <<JSON
    "CNMExampleWorkflow": {
    "Comment": "CNMExampleWorkflow",
    "StartAt": "TranslateMessage",
    "States": {
    "TranslateMessage": {
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_to_cma_task.arn}",
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "collection": "{$.meta.collection}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.cnm}",
    "destination": "{$.meta.cnm}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "CnmResponse"
    }
    ],
    "Next": "SyncGranule"
    },
    "SyncGranule": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "provider": "{$.meta.provider}",
    "buckets": "{$.meta.buckets}",
    "collection": "{$.meta.collection}",
    "downloadBucket": "{$.meta.buckets.private.name}",
    "stack": "{$.meta.stack}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.granules}",
    "destination": "{$.meta.input_granules}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${module.cumulus.sync_granule_task.task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "IntervalSeconds": 10,
    "MaxAttempts": 3
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "CnmResponse"
    }
    ],
    "Next": "CnmResponse"
    },
    "CnmResponse": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "OriginalCNM": "{$.meta.cnm}",
    "distribution_endpoint": "{$.meta.distribution_endpoint}",
    "response-endpoint": "ADD YOUR RESPONSE STREAM NAME HERE",
    "region": "us-east-1",
    "type": "kinesis",
    "WorkflowException": "{$.exception}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.cnm}",
    "destination": "{$.meta.cnmResponse}"
    },
    {
    "source": "{$.input.input}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_response_task.arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "IntervalSeconds": 5,
    "MaxAttempts": 3
    }
    ],
    "End": true
    }
    }
    }
JSON
}

Again, please make sure to modify the response-endpoint value to match the stream name (not the ARN) of your Kinesis response stream.

    Lambda Configuration

    To execute this workflow, you're required to include several Lambda resources in your deployment. To do this, add the following task (Lambda) definitions to your deployment along with the workflow you created above:

    Please note: To utilize these tasks you need to ensure you have a compatible CMA layer. See the deployment instructions for more details on how to deploy a CMA layer.

    Below is a description of each of these tasks:

    CNMToCMA

    CNMToCMA is meant for the beginning of a workflow: it maps CNM granule information to a payload for downstream tasks. For other CNM workflows, you would need to ensure that downstream tasks in your workflow either understand the CNM message or include a translation task like this one.

    You can also manipulate the data sent to downstream tasks using task_config for various states in your workflow resource configuration. Read more about how to configure data on the Workflow Input & Output page.

    CnmResponse

    The CnmResponse Lambda generates a CNM response message and puts it on the response-endpoint Kinesis stream.

    You can read more about the expected schema of a CnmResponse record in the Cloud Notification Mechanism schema repository.

    Additional Tasks

    Lastly, this entry also makes use of the SyncGranule task from the cumulus module.

    Redeploy

    Once the above configuration changes have been made, redeploy your stack.

    Please refer to Update Cumulus resources in the deployment documentation if you are unfamiliar with redeployment.

    Rule Configuration

    Cumulus includes a messageConsumer Lambda function (message-consumer). Cumulus kinesis-type rules create the event source mappings between Kinesis streams and the messageConsumer Lambda. The messageConsumer Lambda consumes records from one or more Kinesis streams, as defined by enabled kinesis-type rules. When new records are pushed to one of these streams, the messageConsumer triggers workflows associated with the enabled kinesis-type rules.

    To add a rule via the dashboard (if you'd like to use the API, see the docs here), navigate to the Rules page and click Add a rule, then configure the new rule using the following template (substituting correct values for parameters denoted by ${}):

    {
    "collection": {
    "name": "L2_HR_PIXC",
    "version": "000"
    },
    "name": "L2_HR_PIXC_kinesisRule",
    "provider": "PODAAC_SWOT",
    "rule": {
    "type": "kinesis",
    "value": "arn:aws:kinesis:{{awsRegion}}:{{awsAccountId}}:stream/{{streamName}}"
    },
    "state": "ENABLED",
    "workflow": "CNMExampleWorkflow"
    }

    Please Note:

• The rule's value attribute must match the Amazon Resource Name (ARN) of the Kinesis data stream you've preconfigured. You should be able to obtain this ARN from the Kinesis Dashboard entry for the selected stream.
    • The collection and provider should match the collection and provider you setup in the Prerequisites section.

Once you've clicked 'Submit', a new rule should appear in the dashboard's Rule Overview.


    Execute the Workflow

    Once Cumulus has been redeployed and a rule has been added, we're ready to trigger the workflow and watch it execute.

    How to Trigger the Workflow

    To trigger matching workflows, you will need to put a record on the Kinesis stream that the message-consumer Lambda will recognize as a matching event. Most importantly, it should include a collection name that matches a valid collection.

    For the purpose of this example, the easiest way to accomplish this is using the AWS CLI.

    Create Record JSON

    Construct a JSON file containing an object that matches the values that have been previously setup. This JSON object should be a valid Cloud Notification Mechanism message.

    Please note: this example is somewhat contrived, as the downstream tasks don't care about most of these fields. A 'real' data ingest workflow would.

    The following values (denoted by ${} in the sample below) should be replaced to match values we've previously configured:

    • TEST_DATA_FILE_NAME: The filename of the test data that is available in the S3 (or other) provider we created earlier.
    • TEST_DATA_URI: The full S3 path to the test data (e.g. s3://bucket-name/path/granule)
    • COLLECTION: The collection name defined in the prerequisites for this product
    {
    "product": {
    "files": [
    {
    "checksumType": "md5",
    "name": "${TEST_DATA_FILE_NAME}",
    "checksum": "bogus_checksum_value",
    "uri": "${TEST_DATA_URI}",
    "type": "data",
    "size": 12345678
    }
    ],
    "name": "${TEST_DATA_FILE_NAME}",
    "dataVersion": "006"
    },
    "identifier ": "testIdentifier123456",
    "collection": "${COLLECTION}",
    "provider": "TestProvider",
    "version": "001",
    "submissionTime": "2017-09-30T03:42:29.791198"
    }

    Add Record to Kinesis Data Stream

    Using the JSON file you created, push it to the Kinesis notification stream:

    aws kinesis put-record --stream-name YOUR_KINESIS_NOTIFICATION_STREAM_NAME_HERE --partition-key 1 --data file:///path/to/file.json

    Please note: The above command uses the stream name, not the ARN.

    The command should return output similar to:

    {
    "ShardId": "shardId-000000000000",
    "SequenceNumber": "42356659532578640215890215117033555573986830588739321858"
    }

    This command will put a record containing the JSON from the --data flag onto the Kinesis data stream. The messageConsumer Lambda will consume the record and construct a valid CMA payload to trigger workflows. For this example, the record will trigger the CNMExampleWorkflow workflow as defined by the rule previously configured.

    You can view the current running executions on the Executions dashboard page which presents a list of all executions, their status (running, failed, or completed), to which workflow the execution belongs, along with other information.

    Verify Workflow Execution

As detailed above, once the record is added to the Kinesis data stream, the messageConsumer Lambda will trigger the CNMExampleWorkflow.

    TranslateMessage

    TranslateMessage (which corresponds to the CNMToCMA Lambda) will take the CNM object payload and add a granules object to the CMA payload that's consistent with other Cumulus ingest tasks, and add a meta.cnm key (as well as the payload) to store the original message.

    For more on the Message Adapter, please see the Message Flow documentation.

    An example of what is happening in the CNMToCMA Lambda is as follows:

    Example Input Payload:

    "payload": {
    "identifier ": "testIdentifier123456",
    "product": {
    "files": [
    {
    "checksumType": "md5",
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "checksum": "bogus_checksum_value",
    "uri": "s3://some_bucket/cumulus-test-data/pdrs/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "type": "data",
    "size": 12345678
    }
    ],
    "name": "TestGranuleUR",
    "dataVersion": "006"
    },
    "version": "123456",
    "collection": "MOD09GQ",
    "provider": "TestProvider",
    "submissionTime": "2017-09-30T03:42:29.791198"
    }

    Example Output Payload:

      "payload": {
    "cnm": {
    "identifier ": "testIdentifier123456",
    "product": {
    "files": [
    {
    "checksumType": "md5",
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "checksum": "bogus_checksum_value",
    "uri": "s3://some-bucket/cumulus-test-data/data/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "type": "data",
    "size": 12345678
    }
    ],
    "name": "TestGranuleUR",
    "dataVersion": "006"
    },
    "version": "123456",
    "collection": "MOD09GQ",
    "provider": "TestProvider",
    "submissionTime": "2017-09-30T03:42:29.791198",
    "receivedTime": "2017-09-30T03:42:31.634552"
    },
    "output": {
    "granules": [
    {
    "granuleId": "TestGranuleUR",
    "files": [
    {
    "path": "some-bucket/data",
    "url_path": "s3://some-bucket/cumulus-test-data/data/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "bucket": "some-bucket",
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "size": 12345678
    }
    ]
    }
    ]
    }
    }

SyncGranule

    This Lambda will take the files listed in the payload and move them to s3://{deployment-private-bucket}/file-staging/{deployment-name}/{COLLECTION}/{file_name}.
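As a quick spot check (the bucket, deployment name, and collection path below are illustrative values based on the earlier examples, not required names), you can list the staging area with the AWS CLI:

# List the staged granule files for the example collection
aws s3 ls s3://<deployment-private-bucket>/file-staging/<deployment-name>/MOD09GQ___006/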

    CnmResponse

Assuming a successful execution of the workflow, this task will recover the meta.cnm key from the CMA output and add a "SUCCESS" record to the response Kinesis stream (the configured response-endpoint).

    If a prior step in the workflow has failed, this will add a "FAILURE" record to the stream instead.

    The data written to the response-endpoint should adhere to the Response Message Fields schema.

    Example CNM Success Response:

    {
    "provider": "PODAAC_SWOT",
    "collection": "SWOT_Prod_l2:1",
    "processCompleteTime": "2017-09-30T03:45:29.791198",
    "submissionTime": "2017-09-30T03:42:29.791198",
    "receivedTime": "2017-09-30T03:42:31.634552",
    "identifier": "1234-abcd-efg0-9876",
    "response": {
    "status": "SUCCESS"
    }
    }

    Example CNM Error Response:

    {
    "provider": "PODAAC_SWOT",
    "collection": "SWOT_Prod_l2:1",
    "processCompleteTime": "2017-09-30T03:45:29.791198",
    "submissionTime": "2017-09-30T03:42:29.791198",
    "receivedTime": "2017-09-30T03:42:31.634552",
    "identifier": "1234-abcd-efg0-9876",
    "response": {
    "status": "FAILURE",
    "errorCode": "PROCESSING_ERROR",
    "errorMessage": "File [cumulus-dev-a4d38f59-5e57-590c-a2be-58640db02d91/prod_20170926T11:30:36/production_file.nc] did not match gve checksum value."
    }
    }

    Note the CnmResponse state defined in the .tf workflow definition above configures $.exception to be passed to the CnmResponse Lambda keyed under config.WorkflowException. This is required for the CnmResponse code to deliver a failure response.

    To test the failure scenario, send a record missing the product.name key.
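For example, reusing the sample record from the Create Record JSON section with the product.name key removed (all other values unchanged) should produce a FAILURE response on the response stream:

{
  "product": {
    "files": [
      {
        "checksumType": "md5",
        "name": "${TEST_DATA_FILE_NAME}",
        "checksum": "bogus_checksum_value",
        "uri": "${TEST_DATA_URI}",
        "type": "data",
        "size": 12345678
      }
    ],
    "dataVersion": "006"
  },
  "identifier ": "testIdentifier123456",
  "collection": "${COLLECTION}",
  "provider": "TestProvider",
  "version": "001",
  "submissionTime": "2017-09-30T03:42:29.791198"
}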


    Verify results

    Check for successful execution on the dashboard

    Following the successful execution of this workflow, you should expect to see the workflow complete successfully on the dashboard:

    Screenshot of a successful CNM workflow appearing on the executions page of the Cumulus dashboard

    Check the test granule has been delivered to S3 staging

    The test granule identified in the Kinesis record should be moved to the deployment's private staging area.

    Check for Kinesis records

    A SUCCESS notification should be present on the response-endpoint Kinesis stream.

    You should be able to validate the notification and response streams have the expected records with the following steps (the AWS CLI Kinesis Basic Stream Operations is useful to review before proceeding):

    Get a shard iterator (substituting your stream name as appropriate):

    aws kinesis get-shard-iterator \
    --shard-id shardId-000000000000 \
    --shard-iterator-type LATEST \
    --stream-name NOTIFICATION_OR_RESPONSE_STREAM_NAME

which should return output similar to:

    {
    "ShardIterator": "VeryLongString=="
    }
• Re-trigger the workflow by using the put-record command from the Add Record to Kinesis Data Stream section above.
    • As the workflow completes, use the output from the get-shard-iterator command to request data from the stream:
    aws kinesis get-records --shard-iterator SHARD_ITERATOR_VALUE

    This should result in output similar to:

    {
    "Records": [
    {
    "SequenceNumber": "49586720336541656798369548102057798835250389930873978882",
    "ApproximateArrivalTimestamp": 1532664689.128,
    "Data": "eyJpZGVudGlmaWVyICI6InRlc3RJZGVudGlmaWVyMTIzNDU2IiwidmVyc2lvbiI6IjAwNiIsImNvbGxlY3Rpb24iOiJNT0QwOUdRIiwicHJvdmlkZXIiOiJUZXN0UHJvdmlkZXIiLCJwcm9kdWN0U2l6ZSI6MTkwODYzNS4wLCJyZXNwb25zZSI6eyJzdGF0dXMiOiJTVUNDRVNTIn0sInByb2Nlc3NDb21wbGV0ZVRpbWUiOiIyMDE4LTA3LTI3VDA0OjExOjI4LjkxOSJ9",
    "PartitionKey": "1"
    },
    {
    "SequenceNumber": "49586720336541656798369548102059007761070005796999266306",
    "ApproximateArrivalTimestamp": 1532664707.149,
    "Data": "eyJpZGVudGlmaWVyICI6InRlc3RJZGVudGlmaWVyMTIzNDU2IiwidmVyc2lvbiI6IjAwNiIsImNvbGxlY3Rpb24iOiJNT0QwOUdRIiwicHJvdmlkZXIiOiJUZXN0UHJvdmlkZXIiLCJwcm9kdWN0U2l6ZSI6MTkwODYzNS4wLCJyZXNwb25zZSI6eyJzdGF0dXMiOiJTVUNDRVNTIn0sInByb2Nlc3NDb21wbGV0ZVRpbWUiOiIyMDE4LTA3LTI3VDA0OjExOjQ2Ljk1OCJ9",
    "PartitionKey": "1"
    }
    ],
    "NextShardIterator": "AAAAAAAAAAFo9SkF8RzVYIEmIsTN+1PYuyRRdlj4Gmy3dBzsLEBxLo4OU+2Xj1AFYr8DVBodtAiXbs3KD7tGkOFsilD9R5tA+5w9SkGJZ+DRRXWWCywh+yDPVE0KtzeI0andAXDh9yTvs7fLfHH6R4MN9Gutb82k3lD8ugFUCeBVo0xwJULVqFZEFh3KXWruo6KOG79cz2EF7vFApx+skanQPveIMz/80V72KQvb6XNmg6WBhdjqAA==",
    "MillisBehindLatest": 0
    }

Note the data encoding is not human readable and would need to be parsed/converted to be interpretable. There are many options for building a Kinesis consumer, such as the KCL.
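For a quick look at the contents without writing a consumer, one option (assuming jq 1.6 or later is installed) is to decode the base64-encoded Data field directly from the get-records output:

# Decode each record's Data field to a readable JSON string
aws kinesis get-records --shard-iterator SHARD_ITERATOR_VALUE | jq -r '.Records[].Data | @base64d'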

    For purposes of validating the workflow, it may be simpler to locate the workflow in the Step Function Management Console and assert the expected output is similar to the below examples.

    Successful CNM Response Object Example:

    {
    "cnmResponse": {
    "provider": "TestProvider",
    "collection": "MOD09GQ",
    "version": "123456",
    "processCompleteTime": "2017-09-30T03:45:29.791198",
    "submissionTime": "2017-09-30T03:42:29.791198",
    "receivedTime": "2017-09-30T03:42:31.634552",
    "identifier ": "testIdentifier123456",
    "response": {
    "status": "SUCCESS"
    }
    }
    }

    Kinesis Record Error Handling

    messageConsumer

    The default Kinesis stream processing in the Cumulus system is configured for record error tolerance.

    When the messageConsumer fails to process a record, the failure is captured and the record is published to the kinesisFallback SNS Topic. The kinesisFallback SNS topic broadcasts the record and a subscribed copy of the messageConsumer Lambda named kinesisFallback consumes these failures.

At this point, the normal Lambda asynchronous invocation retry behavior will attempt to process the record 3 more times. After this, if the record cannot successfully be processed, it is written to a dead letter queue. Cumulus' dead letter queue is an SQS Queue named kinesisFailure. Operators can use this queue to inspect failed records.

This system ensures that when the messageConsumer fails to process a record and trigger a workflow, the record is retried 3 times. This retry behavior improves system reliability in case of external service failures outside of Cumulus' control.

    The Kinesis error handling system - the kinesisFallback SNS topic, messageConsumer Lambda, and kinesisFailure SQS queue - come with the API package and do not need to be configured by the operator.

To examine records that could not be processed at any step, look at the dead letter queue {{prefix}}-kinesisFailure in the Simple Queue Service (SQS) console. Select your queue, and under the Queue Actions tab, choose View/Delete Messages. Start polling for messages and you will see the records that failed to process through the messageConsumer.
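If you prefer the CLI to the console, a sketch of the equivalent check (assuming the default {{prefix}}-kinesisFailure queue name for your deployment) is:

# Look up the dead letter queue URL, then pull a few failed records
QUEUE_URL=$(aws sqs get-queue-url --queue-name {{prefix}}-kinesisFailure --query QueueUrl --output text)
aws sqs receive-message --queue-url "$QUEUE_URL" --max-number-of-messages 5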

    Note, these are only records that occurred when processing records from Kinesis streams. Workflow failures are handled differently.

    Kinesis Stream logging

    Notification Stream messages

    Cumulus includes two Lambdas (KinesisInboundEventLogger and KinesisOutboundEventLogger) that utilize the same code to take a Kinesis record event as input, deserialize the data field and output the modified event to the logs.

    When a kinesis rule is created, in addition to the messageConsumer event mapping, an event mapping is created to trigger KinesisInboundEventLogger to record a log of the inbound record, to allow for analysis in case of unexpected failure.

    Response Stream messages

    Cumulus also supports this feature for all outbound messages. To take advantage of this feature, you will need to set an event mapping on the KinesisOutboundEventLogger Lambda that targets your response-endpoint. You can do this in the Lambda management page for KinesisOutboundEventLogger. Add a Kinesis trigger, and configure it to target the cnmResponseStream for your workflow:

    Screenshot of the AWS console showing configuration for Kinesis stream trigger on KinesisOutboundEventLogger Lambda
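If you would rather create the mapping from the CLI than the console, a sketch follows; the function name assumes the deployed Lambda is named {{prefix}}-KinesisOutboundEventLogger, and the stream ARN placeholder should be replaced with your response stream's ARN:

# Create a Kinesis event source mapping for the outbound event logger
aws lambda create-event-source-mapping \
  --function-name {{prefix}}-KinesisOutboundEventLogger \
  --event-source-arn arn:aws:kinesis:<region>:<account>:stream/<your-response-stream> \
  --starting-position LATEST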

    Once this is done, all records sent to the response-endpoint will also be logged in CloudWatch. For more on configuring Lambdas to trigger on Kinesis events, please see creating an event source mapping.
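Once records are flowing, one way to watch them arrive (assuming the Lambda logs to its default log group and you are using AWS CLI v2) is:

# Tail the outbound event logger's CloudWatch log group
aws logs tail /aws/lambda/{{prefix}}-KinesisOutboundEventLogger --follow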

Version: v10.1.0

Error Handling in Workflows

See this documentation on configuring your workflow to handle transient Lambda errors (e.g. a Lambda Service Exception).

    Example state machine definition:

    {
    "Comment": "Tests Workflow from Kinesis Stream",
    "StartAt": "TranslateMessage",
    "States": {
    "TranslateMessage": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.cnm}",
    "destination": "{$.meta.cnm}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_to_cma_task.arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": ["States.ALL"],
    "ResultPath": "$.exception",
    "Next": "CnmResponseFail"
    }
    ],
    "Next": "SyncGranule"
    },
    "SyncGranule": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "Path": "$.payload",
    "TargetPath": "$.payload"
    },
    "task_config": {
    "provider": "{$.meta.provider}",
    "buckets": "{$.meta.buckets}",
    "collection": "{$.meta.collection}",
    "downloadBucket": "{$.meta.buckets.private.name}",
    "stack": "{$.meta.stack}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.granules}",
    "destination": "{$.meta.input_granules}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${module.cumulus.sync_granule_task.task_arn}",
    "Retry": [
    {
    "ErrorEquals": ["States.ALL"],
    "IntervalSeconds": 10,
    "MaxAttempts": 3
    }
    ],
    "Catch": [
    {
    "ErrorEquals": ["States.ALL"],
    "ResultPath": "$.exception",
    "Next": "CnmResponseFail"
    }
    ],
    "Next": "CnmResponse"
    },
    "CnmResponse": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "OriginalCNM": "{$.meta.cnm}",
    "CNMResponseStream": "{$.meta.cnmResponseStream}",
    "region": "us-east-1",
    "WorkflowException": "{$.exception}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$}",
    "destination": "{$.meta.cnmResponse}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_response_task.arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": ["States.ALL"],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "Next": "WorkflowSucceeded"
    },
    "CnmResponseFail": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "OriginalCNM": "{$.meta.cnm}",
    "CNMResponseStream": "{$.meta.cnmResponseStream}",
    "region": "us-east-1",
    "WorkflowException": "{$.exception}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$}",
    "destination": "{$.meta.cnmResponse}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_response_task.arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": ["States.ALL"],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "Next": "WorkflowFailed"
    },
    "WorkflowSucceeded": {
    "Type": "Succeed"
    },
    "WorkflowFailed": {
    "Type": "Fail",
    "Cause": "Workflow failed"
    }
    }
    }

    The above results in a workflow which is visualized in the diagram below:

    Screenshot of a visualization of an AWS Step Function workflow definition with branching logic for failures

    Summary

    Error handling should (mostly) be the domain of workflow configuration.

    Version: v10.1.0

    HelloWorld Workflow

    Example task meant to be a sanity check/introduction to the Cumulus workflows.

    Pre-Deployment Configuration

    Workflow Configuration

    A workflow definition can be found in the template repository hello_world_workflow module.

    {
    "Comment": "Returns Hello World",
    "StartAt": "HelloWorld",
    "States": {
    "HelloWorld": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "buckets": "{$.meta.buckets}",
    "provider": "{$.meta.provider}",
    "collection": "{$.meta.collection}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${module.cumulus.hello_world_task.task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "End": true
    }
    }
    }

    Workflow error-handling can be configured as discussed in the Error-Handling cookbook.

    Task Configuration

The HelloWorld task is provided for you as part of the cumulus terraform module; no configuration is needed.

    If you want to manually deploy your own version of this Lambda for testing, you can copy the Lambda resource definition located in the Cumulus source code at cumulus/tf-modules/ingest/hello-world-task.tf. The Lambda source code is located in the Cumulus source code at 'cumulus/tasks/hello-world'.

    Execution

    We will focus on using the Cumulus dashboard to schedule the execution of a HelloWorld workflow.

    Our goal here is to create a rule through the Cumulus dashboard that will define the scheduling and execution of our HelloWorld workflow. Let's navigate to the Rules page and click Add a rule.

    {
    "collection": { # collection values can be configured and found on the Collections page
    "name": "${collection_name}",
    "version": "${collection_version}"
    },
    "name": "helloworld_rule",
    "provider": "${provider}", # found on the Providers page
    "rule": {
    "type": "onetime"
    },
    "state": "ENABLED",
    "workflow": "HelloWorldWorkflow" # This can be found on the Workflows page
    }

    Screenshot of AWS Step Function execution graph for the HelloWorld workflow Executed workflow as seen in AWS Console

    Output/Results

    The Executions page presents a list of all executions, their status (running, failed, or completed), to which workflow the execution belongs, along with other information. The rule defined in the previous section should start an execution of its own accord, and the status of that execution can be tracked here.

    To get some deeper information on the execution, click on the value in the Name column of your execution of interest. This should bring up a visual representation of the workflow similar to that shown above, execution details, and a list of events.

    Summary

    Setting up the HelloWorld workflow on the Cumulus dashboard is the tip of the iceberg, so to speak. The task and step-function need to be configured before Cumulus deployment. A compatible collection and provider must be configured and applied to the rule. Finally, workflow execution status can be viewed via the workflows tab on the dashboard.

    Version: v10.1.0

    Ingest Notification in Workflows

    On deployment, an SQS queue and three SNS topics are created and used for handling notification messages related to the workflow.

    The sfEventSqsToDbRecords Lambda function reads from the sfEventSqsToDbRecordsInputQueue queue and updates DynamoDB. The DynamoDB events for the ExecutionsTable, GranulesTable and PdrsTable are streamed on DynamoDBStreams, which are read by the publishExecutions, publishGranules and publishPdrs Lambda functions, respectively.

    These Lambda functions publish to the three SNS topics both when the workflow starts and when it reaches a terminal state (completion or failure). The following describes how many message(s) each topic receives both on workflow start and workflow completion/failure:

    • reportExecutions - Receives 1 message per workflow execution
    • reportGranules - Receives 1 message per granule in a workflow execution
    • reportPdrs - Receives 1 message per PDR

    Diagram of architecture for reporting workflow ingest notifications from AWS Step Functions

    The ingest notification reporting SQS queue is populated via a Cloudwatch rule for any Step Function execution state transitions. The sfEventSqsToDbRecords Lambda consumes this queue. The queue and Lambda are included in the cumulus module and the Cloudwatch rule in the workflow module and are included by default in a Cumulus deployment.

    Sending SQS messages to report status

    Publishing granule/PDR reports directly to the SQS queue

If you have a non-Cumulus workflow or process ingesting data and would like to update the status of your granules or PDRs, you can publish directly to the reporting SQS queue. Publishing messages to this queue will result in those messages being stored as granule/PDR records in the Cumulus database and will make the status of those granules/PDRs visible on the Cumulus dashboard. Note that the queue expects a Cumulus Message nested within a Cloudwatch Step Function Event object.

Posting directly to the queue requires knowing the queue URL. Assuming that you are using the cumulus module for your deployment, you can get the queue URL (and the reporting topic ARNs) by adding the following outputs to outputs.tf for your Terraform deployment, as in our example deployment:

    output "stepfunction_event_reporter_queue_url" {
    value = module.cumulus.stepfunction_event_reporter_queue_url
    }

    output "report_executions_sns_topic_arn" {
    value = module.cumulus.report_executions_sns_topic_arn
    }
    output "report_granules_sns_topic_arn" {
value = module.cumulus.report_granules_sns_topic_arn
    }
    output "report_pdrs_sns_topic_arn" {
    value = module.cumulus.report_pdrs_sns_topic_arn
    }

Then, when you run terraform apply, you should see the queue URL and topic ARNs printed to your console:

    Outputs:
    ...
    stepfunction_event_reporter_queue_url = https://sqs.us-east-1.amazonaws.com/xxxxxxxxx/<prefix>-sfEventSqsToDbRecordsInputQueue
    report_executions_sns_topic_arn = arn:aws:sns:us-east-1:xxxxxxxxx:<prefix>-report-executions-topic
report_granules_sns_topic_arn = arn:aws:sns:us-east-1:xxxxxxxxx:<prefix>-report-granules-topic
    report_pdrs_sns_topic_arn = arn:aws:sns:us-east-1:xxxxxxxxx:<prefix>-report-pdrs-topic

Once you have the queue URL, you can use the AWS SDK for your language of choice to send messages to the queue. The expected format of these messages is that of a Cloudwatch Step Function event containing a Cumulus message. For SUCCEEDED events, the Cumulus message is expected to be in detail.output. For all other event statuses, a Cumulus Message is expected in detail.input. The Cumulus Message populating these fields MUST be a JSON string, not an object. Messages that do not conform to the schemas will fail to be created as records.
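As a minimal sketch, assuming you have written such a Cloudwatch Step Function event (with the Cumulus message JSON-stringified in detail.input or detail.output) to a local event.json file, you could post it to the reporting queue with the AWS CLI:

# Queue URL comes from the stepfunction_event_reporter_queue_url Terraform output
aws sqs send-message \
  --queue-url "https://sqs.us-east-1.amazonaws.com/xxxxxxxxx/<prefix>-sfEventSqsToDbRecordsInputQueue" \
  --message-body file://event.json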

    If you are not seeing records persist to the database or show up in the Cumulus dashboard, you can investigate the Cloudwatch logs of the SQS consumer Lambda:

    • /aws/lambda/<prefix>-sfEventSqsToDbRecords

    In a workflow

    As described above, ingest notifications will automatically be published to the SNS topics on workflow start and completion/failure, so you should not include a workflow step to publish the initial or final status of your workflows.

    However, if you want to report your ingest status at any point during a workflow execution, you can add a workflow step using the SfSqsReport Lambda. In the following example from cumulus-tf/parse_pdr_workflow.tf, the ParsePdr workflow is configured to use the SfSqsReport Lambda, primarily to update the PDR ingestion status.

    Note: ${sf_sqs_report_task_arn} is an interpolated value referring to a Terraform resource. See the example deployment code for the ParsePdr workflow.

      "PdrStatusReport": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "cumulus_message": {
    "input": "{$}"
    }
    }
    }
    },
    "ResultPath": null,
    "Type": "Task",
    "Resource": "${sf_sqs_report_task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "Next": "WaitForSomeTime"
    },

    Subscribing additional listeners to SNS topics

    Additional listeners to SNS topics can be configured in a .tf file for your Cumulus deployment. Shown below is configuration that subscribes an additional Lambda function (test_lambda) to receive messages from the report_executions SNS topic. To subscribe to the report_granules or report_pdrs SNS topics instead, simply replace report_executions in the code block below with either of those values.

    resource "aws_lambda_function" "test_lambda" {
    function_name = "${var.prefix}-testLambda"
    filename = "./testLambda.zip"
    source_code_hash = filebase64sha256("./testLambda.zip")
    handler = "index.handler"
    role = module.cumulus.lambda_processing_role_arn
    runtime = "nodejs10.x"
    }

    resource "aws_sns_topic_subscription" "test_lambda" {
    topic_arn = module.cumulus.report_executions_sns_topic_arn
    protocol = "lambda"
    endpoint = aws_lambda_function.test_lambda.arn
    }

    resource "aws_lambda_permission" "test_lambda" {
    action = "lambda:InvokeFunction"
    function_name = aws_lambda_function.test_lambda.arn
    principal = "sns.amazonaws.com"
    source_arn = module.cumulus.report_executions_sns_topic_arn
    }

    SNS message format

    Subscribers to the SNS topics can expect to find the published message in the SNS event at Records[0].Sns.Message. The message will be a JSON stringified version of the ingest notification record for an execution or a PDR. For granules, the message will be a JSON stringified object with ingest notification record in the record property and the event type as the event property.

    The ingest notification record of the execution, granule, or PDR should conform to the data model schema for the given record type.
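As a minimal sketch of a subscribed listener (a hypothetical Node.js Lambda, in the spirit of the test_lambda example above), the published record can be recovered like so:

// index.js - parse the ingest notification record from the SNS event
exports.handler = async (event) => {
  const message = JSON.parse(event.Records[0].Sns.Message);
  console.log('Received ingest notification:', message);
  return message;
};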

    Summary

    Workflows can be configured to send SQS messages at any point using the sf-sqs-report task.

    Additional listeners can be easily configured to trigger when messages are sent to the SNS topics.

    Version: v10.1.0

    Queue PostToCmr

In this document, we walk through handling CMR errors in workflows by queueing PostToCmr. We assume that the user already has an ingest workflow set up.

    Overview

    The general concept is that the last task of the ingest workflow will be QueueWorkflow, which queues the publish workflow. The publish workflow contains the PostToCmr task and if a CMR error occurs during PostToCmr, the publish workflow will add itself back onto the queue so that it can be executed when CMR is back online. This is achieved by leveraging the QueueWorkflow task again in the publish workflow. The following diagram demonstrates this queueing process.

    Diagram of workflow queueing

    Ingest Workflow

    The last step should be the QueuePublishWorkflow step. It should be configured with a queueUrl and workflow. In this case, the queueUrl is a throttled queue. Any queueUrl can be specified here which is useful if you would like to use a lower priority queue. The workflow is the unprefixed workflow name that you would like to queue (e.g. PublishWorkflow).

      "QueuePublishWorkflowStep": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "workflow": "{$.meta.workflow}",
    "queueUrl": "${start_sf_queue_url}",
    "provider": "{$.meta.provider}",
    "collection": "{$.meta.collection}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${queue_workflow_task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "End": true
    },

    Publish Workflow

    Configure the Catch section of your PostToCmr task to proceed to QueueWorkflow if a CMRInternalError is caught. Any other error will cause the workflow to fail.

      "Catch": [
    {
    "ErrorEquals": [
    "CMRInternalError"
    ],
    "Next": "RequeueWorkflow"
    },
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "Next": "WorkflowFailed",
    "ResultPath": "$.exception"
    }
    ],

    Then, configure the QueueWorkflow task similarly to its configuration in the ingest workflow. This time, pass the current publish workflow to the task config. This allows for the publish workflow to be requeued when there is a CMR error.

    {
    "RequeueWorkflow": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "buckets": "{$.meta.buckets}",
    "distribution_endpoint": "{$.meta.distribution_endpoint}",
    "workflow": "PublishGranuleQueue",
    "queueUrl": "${start_sf_queue_url}",
    "provider": "{$.meta.provider}",
    "collection": "{$.meta.collection}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${queue_workflow_task_arn}",
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "Next": "WorkflowFailed",
    "ResultPath": "$.exception"
    }
    ],
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "End": true
    }
    }
    Version: v10.1.0

    Run Step Function Tasks in AWS Lambda or Docker

    Overview

    AWS Step Function Tasks can run tasks on AWS Lambda or on AWS Elastic Container Service (ECS) as a Docker container.

Lambda provides a serverless architecture and is the best option for minimizing cost and server management. ECS provides the fullest extent of AWS EC2 resources, via the flexibility to execute arbitrary code on any AWS EC2 instance type.

    When to use Lambda

    You should use AWS Lambda whenever all of the following are true:

• The task runs on one of the supported Lambda Runtimes. At the time of this writing, supported runtimes include versions of Python, Java, Ruby, Node.js, Go, and .NET.
    • The lambda package is less than 50 MB in size, zipped.
    • The task consumes less than each of the following resources:
      • 3008 MB memory allocation
      • 512 MB disk storage (must be written to /tmp)
      • 15 minutes of execution time

    See this page for a complete and up-to-date list of AWS Lambda limits.

    If your task requires more than any of these resources or an unsupported runtime, creating a Docker image which can be run on ECS is the way to go. Cumulus supports running any lambda package (and its configured layers) as a Docker container with cumulus-ecs-task.

    Step Function Activities and cumulus-ecs-task

    Step Function Activities enable a state machine task to "publish" an activity task which can be picked up by any activity worker. Activity workers can run pretty much anywhere, but Cumulus workflows support the cumulus-ecs-task activity worker. The cumulus-ecs-task worker runs as a Docker container on the Cumulus ECS cluster.

    The cumulus-ecs-task container takes an AWS Lambda Amazon Resource Name (ARN) as an argument (see --lambdaArn in the example below). This ARN argument is defined at deployment time. The cumulus-ecs-task worker polls for new Step Function Activity Tasks. When a Step Function executes, the worker (container) picks up the activity task and runs the code contained in the lambda package defined on deployment.

    Example: Replacing AWS Lambda with a Docker container run on ECS

    This example will use an already-defined workflow from the cumulus module that includes the QueueGranules task in its configuration.

    The following example is an excerpt from the Discover Granules workflow containing the step definition for the QueueGranules step:

    Note: ${ingest_granule_workflow_name} and ${queue_granules_task_arn} are interpolated values that refer to Terraform resources. See the example deployment code for the Discover Granules workflow.

      "QueueGranules": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "provider": "{$.meta.provider}",
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "granuleIngestWorkflow": "${ingest_granule_workflow_name}",
    "queueUrl": "{$.meta.queues.startSF}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${queue_granules_task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "End": true
    },

If it is discovered that this task can no longer run in AWS Lambda, you can instead run it on the Cumulus ECS cluster by adding the following resources to your Terraform deployment (either by adding a new .tf file or updating an existing one):

• An aws_sfn_activity resource:
    resource "aws_sfn_activity" "queue_granules" {
    name = "${var.prefix}-QueueGranules"
    }
• An instance of the cumulus_ecs_service module (found on the Cumulus releases page), configured to provide the QueueGranules task:

    module "queue_granules_service" {
    source = "https://github.com/nasa/cumulus/releases/download/{version}/terraform-aws-cumulus-ecs-service.zip"

    prefix = var.prefix
    name = "QueueGranules"

    cluster_arn = module.cumulus.ecs_cluster_arn
    desired_count = 1
    image = "cumuluss/cumulus-ecs-task:1.7.0"

    cpu = 400
    memory_reservation = 700

    environment = {
    AWS_DEFAULT_REGION = data.aws_region.current.name
    }
    command = [
    "cumulus-ecs-task",
    "--activityArn",
    aws_sfn_activity.queue_granules.id,
    "--lambdaArn",
    module.cumulus.queue_granules_task.task_arn
    ]
    alarms = {
    MemoryUtilizationHigh = {
    comparison_operator = "GreaterThanThreshold"
    evaluation_periods = 1
    metric_name = "MemoryUtilization"
    statistic = "SampleCount"
    threshold = 75
    }
    }
    }

Please note: If you have updated the code for the Lambda specified by --lambdaArn, you will have to manually restart the tasks in your ECS service before invocation of the Step Function activity will use the updated Lambda code. (An example restart command is shown after this list.)

• An updated Discover Granules workflow to utilize the new resource (the Resource key in the QueueGranules step has been updated to the value shown below):

"Resource": "${aws_sfn_activity.queue_granules.id}"

If you then run this workflow in place of the DiscoverGranules workflow, the QueueGranules step would run as an ECS task instead of a Lambda.
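As noted above, if you later update the Lambda code referenced by --lambdaArn you will need to restart the ECS tasks. One way to force that (a sketch; the cluster and service names are placeholders for whatever your deployment created) is:

# Force the ECS service to launch fresh tasks that pick up the updated Lambda code
aws ecs update-service \
  --cluster <your-ecs-cluster-name> \
  --service <your-queue-granules-service-name> \
  --force-new-deployment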

    Final note

    Step Function Activities and AWS Lambda are not the only ways to run tasks in an AWS Step Function. Learn more about other service integrations, including direct ECS integration via the AWS Service Integrations page.

Version: v10.1.0

Science Investigator-led Processing Systems (SIPS)

For this example, we're just going to create a onetime throw-away rule that will be easy to test with. This rule will kick off the DiscoverAndQueuePdrs workflow, which is the beginning of a Cumulus SIPS workflow:

    Screenshot of a Cumulus rule configuration

    Note: A list of configured workflows exists under the "Workflows" in the navigation bar on the Cumulus dashboard. Additionally, one can find a list of executions and their respective status in the "Executions" tab in the navigation bar.

    DiscoverAndQueuePdrs Workflow

    This workflow will discover PDRs and queue them to be processed. Duplicate PDRs will be dealt with according to the configured duplicate handling setting in the collection. The lambdas below are included in the cumulus terraform module for use in your workflows:

    1. DiscoverPdrs - source
    2. QueuePdrs - source

    Screenshot of execution graph for discover and queue PDRs workflow in the AWS Step Functions console

    An example workflow module configuration can be viewed in the Cumulus source for the discover_and_queue_pdrs_workflow.

    Please note: To use this example workflow module as a template for a new workflow in your deployment the source key for the workflow module would need to point to a release of the cumulus-workflow (terraform-aws-cumulus-workflow.zip) module on our release page, as all of the provided Cumulus workflows are internally self-referential.

    ParsePdr Workflow

    The ParsePdr workflow will parse a PDR, queue the specified granules (duplicates are handled according to the duplicate handling setting) and periodically check the status of those queued granules. This workflow will not succeed until all the granules included in the PDR are successfully ingested. If one of those fails, the ParsePdr workflow will fail. NOTE that ParsePdr may spin up multiple IngestGranule workflows in parallel, depending on the granules included in the PDR.

    The lambdas below are included in the cumulus terraform module for use in your workflows:

    1. ParsePdr - source
    2. QueueGranules - source
    3. CheckStatus - source

    Screenshot of execution graph for SIPS Parse PDR workflow in AWS Step Functions console

    An example workflow module configuration can be viewed in the Cumulus source for the parse_pdr_workflow.

    Please note: To use this example workflow module as a template for a new workflow in your deployment the source key for the workflow module would need to point to a release of the cumulus-workflow (terraform-aws-cumulus-workflow.zip) module on our release page, as all of the provided Cumulus workflows are internally self-referential.

    IngestGranule Workflow

    The IngestGranule workflow processes and ingests a granule and posts the granule metadata to CMR.

    The lambdas below are included in the cumulus terraform module for use in your workflows:

    1. SyncGranule - source.
    2. CmrStep - source

    Additionally this workflow requires a processing step you must provide. The ProcessingStep step in the workflow picture below is an example of a custom processing step.

    Note: Using the CmrStep is not required and can be left out of the processing trajectory if desired (for example, in testing situations).

    Screenshot of execution graph for SIPS IngestGranule workflow in AWS Step Functions console

    An example workflow module configuration can be viewed in the Cumulus source for the ingest_and_publish_granule_workflow.

    Please note: To use this example workflow module as a template for a new workflow in your deployment the source key for the workflow module would need to point to a release of the cumulus-workflow (terraform-aws-cumulus-workflow.zip) module on our release page, as all of the provided Cumulus workflows are internally self-referential.

    Summary

    In this cookbook we went over setting up a collection, rule, and provider for a SIPS workflow. Once we had the setup completed, we looked over the Cumulus workflows that participate in parsing PDRs, ingesting and processing granules, and updating CMR.

    Version: v10.1.0

    Throttling queued executions

In this entry, we will walk through how to create an SQS queue for scheduling executions, which will be used to limit those executions to a maximum concurrency, and how to configure our Cumulus workflows/rules to use this queue.

    We will also review the architecture of this feature and highlight some implementation notes.

    Limiting the number of executions that can be running from a given queue is useful for controlling the cloud resource usage of workflows that may be lower priority, such as granule reingestion or reprocessing campaigns. It could also be useful for preventing workflows from exceeding known resource limits, such as a maximum number of open connections to a data provider.

    Implementing the queue

    Create and deploy the queue

    Add a new queue

    In a .tf file for your Cumulus deployment, add a new SQS queue:

    resource "aws_sqs_queue" "background_job_queue" {
    name = "${var.prefix}-backgroundJobQueue"
    receive_wait_time_seconds = 20
    visibility_timeout_seconds = 60
    }

    Set maximum executions for the queue

    Define the throttled_queues variable for the cumulus module in your Cumulus deployment to specify the maximum concurrent executions for the queue.

    module "cumulus" {
    # ... other variables

    throttled_queues = [{
    url = aws_sqs_queue.background_job_queue.id,
    execution_limit = 5
    }]
    }

    Setup consumer for the queue

    Add the sqs2sfThrottle Lambda as the consumer for the queue and add a Cloudwatch event rule/target to read from the queue on a scheduled basis.

    Please note: You must use the sqs2sfThrottle Lambda as the consumer for any queue with a queue execution limit or else the execution throttling will not work correctly. Additionally, please allow at least 60 seconds after creation before using the queue while associated infrastructure and triggers are set up and made ready.

    aws_sqs_queue.background_job_queue.id refers to the queue resource defined above.

    resource "aws_cloudwatch_event_rule" "background_job_queue_watcher" {
    schedule_expression = "rate(1 minute)"
    }

    resource "aws_cloudwatch_event_target" "background_job_queue_watcher" {
    rule = aws_cloudwatch_event_rule.background_job_queue_watcher.name
    arn = module.cumulus.sqs2sfThrottle_lambda_function_arn
    input = jsonencode({
    messageLimit = 500
    queueUrl = aws_sqs_queue.background_job_queue.id
    timeLimit = 60
    })
    }

    resource "aws_lambda_permission" "background_job_queue_watcher" {
    action = "lambda:InvokeFunction"
    function_name = module.cumulus.sqs2sfThrottle_lambda_function_arn
    principal = "events.amazonaws.com"
    source_arn = aws_cloudwatch_event_rule.background_job_queue_watcher.arn
    }

    Re-deploy your Cumulus application

Follow the instructions to re-deploy your Cumulus application. After you have re-deployed, your workflow template will be updated to include information about the queue (the output below is partial output from an expected workflow template):

    {
    "cumulus_meta": {
    "queueExecutionLimits": {
    "<backgroundJobQueue_SQS_URL>": 5
    }
    }
    }

    Integrate your queue with workflows and/or rules

    Integrate queue with queuing steps in workflows

    For any workflows using QueueGranules or QueuePdrs that you want to use your new queue, update the Cumulus configuration of those steps in your workflows.

    As seen in this partial configuration for a QueueGranules step, update the queueUrl to reference the new throttled queue:

    Note: ${ingest_granule_workflow_name} is an interpolated value referring to a Terraform resource. See the example deployment code for the DiscoverGranules workflow.

    {
    "QueueGranules": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "queueUrl": "${aws_sqs_queue.background_job_queue.id}",
    "provider": "{$.meta.provider}",
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "granuleIngestWorkflow": "${ingest_granule_workflow_name}"
    }
    }
    }
    }
    }

    Similarly, for a QueuePdrs step:

    Note: ${parse_pdr_workflow_name} is an interpolated value referring to a Terraform resource. See the example deployment code for the DiscoverPdrs workflow.

    {
    "QueuePdrs": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "queueUrl": "${aws_sqs_queue.background_job_queue.id}",
    "provider": "{$.meta.provider}",
    "collection": "{$.meta.collection}",
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "parsePdrWorkflow": "${parse_pdr_workflow_name}"
    }
    }
    }
    }
    }

    After making these changes, re-deploy your Cumulus application for the execution throttling to take effect on workflow executions queued by these workflows.

    Create/update a rule to use your new queue

    Create or update a rule definition to include a queueUrl property that refers to your new queue:

    {
    "name": "s3_provider_rule",
    "workflow": "DiscoverAndQueuePdrs",
    "provider": "s3_provider",
    "collection": {
    "name": "MOD09GQ",
    "version": "006"
    },
    "rule": {
    "type": "onetime"
    },
    "state": "ENABLED",
    "queueUrl": "<backgroundJobQueue_SQS_URL>" // configure rule to use your queue URL
    }

    After creating/updating the rule, any subsequent invocations of the rule should respect the maximum number of executions when starting workflows from the queue.

    Architecture

    Architecture diagram showing how executions started from a queue are throttled to a maximum concurrent limit

    Execution throttling based on the queue works by manually keeping a count (semaphore) of how many executions are running for the queue at a time. The key operation that prevents the number of executions from exceeding the maximum for the queue is that before starting new executions, the sqs2sfThrottle Lambda attempts to increment the semaphore and responds as follows:

    • If the increment operation is successful, then the count was not at the maximum and an execution is started
    • If the increment operation fails, then the count was already at the maximum so no execution is started

    Final notes

    Limiting the number of concurrent executions for work scheduled via a queue has several consequences worth noting:

    • The number of executions that are running for a given queue will be limited to the maximum for that queue regardless of which workflow(s) are started.
    • If you use the same queue to schedule executions across multiple workflows/rules, then the limit on the total number of executions running concurrently will be applied to all of the executions scheduled across all of those workflows/rules.
    • If you are scheduling the same workflow both via a queue with a maxExecutions value and a queue without a maxExecutions value, only the executions scheduled via the queue with the maxExecutions value will be limited to the maximum.
    - + \ No newline at end of file diff --git a/docs/v10.1.0/data-cookbooks/tracking-files/index.html b/docs/v10.1.0/data-cookbooks/tracking-files/index.html index 08fa0ba3635..c412ddd5f51 100644 --- a/docs/v10.1.0/data-cookbooks/tracking-files/index.html +++ b/docs/v10.1.0/data-cookbooks/tracking-files/index.html @@ -5,7 +5,7 @@ Tracking Ancillary Files | Cumulus Documentation - + @@ -19,7 +19,7 @@ The UMM-G column reflects the RelatedURL's Type derived from the CNM type, whereas the ECHO10 column shows how the CNM type affects the destination element.

CNM Type  | UMM-G RelatedUrl.Type                                           | ECHO10 Location
ancillary | 'VIEW RELATED INFORMATION'                                      | OnlineResource
data      | 'GET DATA' (HTTPS URL) or 'GET DATA VIA DIRECT ACCESS' (S3 URI) | OnlineAccessURL
browse    | 'GET RELATED VISUALIZATION'                                     | AssociatedBrowseImage
linkage   | 'EXTENDED METADATA'                                             | OnlineResource
metadata  | 'EXTENDED METADATA'                                             | OnlineResource
qa        | 'EXTENDED METADATA'                                             | OnlineResource

    Common Use Cases

    This section briefly documents some common use cases and the recommended configuration for the file. The examples shown here are for the DiscoverGranules use case, which allows configuration at the Cumulus dashboard level. The other two cases covered in the ancillary metadata documentation require configuration at the provider notification level (either CNM message or PDR) and are not covered here.

    Configuring browse imagery:

    {
    "bucket": "public",
    "regex": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\_[\\d]{1}.jpg$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_1.jpg",
    "type": "browse"
    }

    Configuring a documentation entry:

    {
    "bucket": "protected",
    "regex": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\_README.pdf$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_README.pdf",
    "type": "metadata"
    }

    Configuring other associated files (use types metadata or qa as appropriate):

    {
    "bucket": "protected",
    "regex": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\_QA.txt$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_QA.txt",
    "type": "qa"
    }
    - + \ No newline at end of file diff --git a/docs/v10.1.0/deployment/api-gateway-logging/index.html b/docs/v10.1.0/deployment/api-gateway-logging/index.html index 4a36e3e81f4..1dd820112eb 100644 --- a/docs/v10.1.0/deployment/api-gateway-logging/index.html +++ b/docs/v10.1.0/deployment/api-gateway-logging/index.html @@ -5,13 +5,13 @@ API Gateway Logging | Cumulus Documentation - +
    Version: v10.1.0

    API Gateway Logging

    Enabling API Gateway logging

    In order to enable distribution API Access and execution logging, configure the TEA deployment by setting log_api_gateway_to_cloudwatch on the thin_egress_app module:

    log_api_gateway_to_cloudwatch = true

    This enables the distribution API to send its logs to the default CloudWatch location: API-Gateway-Execution-Logs_<RESTAPI_ID>/<STAGE>
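
For reference, a minimal sketch of how this setting might sit on the thin_egress_app module in your cumulus-tf configuration (the other variables are elided placeholders from your existing deployment):

module "thin_egress_app" {
  # ... existing TEA module configuration (source, buckets, etc.)

  # Send distribution API access and execution logs to CloudWatch
  log_api_gateway_to_cloudwatch = true
}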

    Configure Permissions for API Gateway Logging to CloudWatch

    Instructions for enabling account level logging from API Gateway to CloudWatch

    This is a one time operation that must be performed on each AWS account to allow API Gateway to push logs to CloudWatch.

    Create a policy document

    The AmazonAPIGatewayPushToCloudWatchLogs managed policy, with an ARN of arn:aws:iam::aws:policy/service-role/AmazonAPIGatewayPushToCloudWatchLogs, has all the required permissions to enable API Gateway logging to CloudWatch. To grant these permissions to your account, first create an IAM role with apigateway.amazonaws.com as its trusted entity.

    Save this snippet as apigateway-policy.json.

    {
    "Version": "2012-10-17",
    "Statement": [
    {
    "Sid": "",
    "Effect": "Allow",
    "Principal": {
    "Service": "apigateway.amazonaws.com"
    },
    "Action": "sts:AssumeRole"
    }
    ]
    }

    Create an account role to act as ApiGateway and write to CloudWatchLogs

    NASA users in NGAP: be sure to use your account's permission boundary.

    aws iam create-role \
    --role-name ApiGatewayToCloudWatchLogs \
    [--permissions-boundary <permissionBoundaryArn>] \
    --assume-role-policy-document file://apigateway-policy.json

    Note the ARN of the returned role for the last step.

    Attach correct permissions to role

    Next attach the AmazonAPIGatewayPushToCloudWatchLogs policy to the IAM role.

    aws iam attach-role-policy \
    --role-name ApiGatewayToCloudWatchLogs \
    --policy-arn "arn:aws:iam::aws:policy/service-role/AmazonAPIGatewayPushToCloudWatchLogs"

    Update Account API Gateway settings with correct permissions

    Finally, set the IAM role ARN on the cloudWatchRoleArn property on your API Gateway Account settings.

    aws apigateway update-account \
    --patch-operations op='replace',path='/cloudwatchRoleArn',value='<ApiGatewayToCloudWatchLogs ARN>'

    Configure API Gateway CloudWatch Logs Delivery

    See Configure Cloudwatch Logs Delivery

    - + \ No newline at end of file diff --git a/docs/v10.1.0/deployment/cloudwatch-logs-delivery/index.html b/docs/v10.1.0/deployment/cloudwatch-logs-delivery/index.html index 4ca1309495a..87bd8b645e5 100644 --- a/docs/v10.1.0/deployment/cloudwatch-logs-delivery/index.html +++ b/docs/v10.1.0/deployment/cloudwatch-logs-delivery/index.html @@ -5,13 +5,13 @@ Configure Cloudwatch Logs Delivery | Cumulus Documentation - +
    Version: v10.1.0

    Configure Cloudwatch Logs Delivery

    As an optional configuration step, it is possible to deliver CloudWatch logs to a cross-account shared AWS::Logs::Destination. An operator does this by configuring the cumulus module for your deployment as shown below. The value of the log_destination_arn variable is the ARN of a writeable log destination.

    The value can be either an AWS::Logs::Destination or a Kinesis Stream ARN to which your account can write.

log_destination_arn = "arn:aws:[kinesis|logs]:us-east-1:123456789012:[streamName|destination:logDestinationName]"
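
A minimal sketch of this variable in context on the cumulus module (the account ID and destination name are placeholders, as above):

module "cumulus" {
  # ... other variables

  # Deliver the logs listed below to a cross-account destination
  log_destination_arn = "arn:aws:logs:us-east-1:123456789012:destination:logDestinationName"
}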

    Logs Sent

By default, the following logs will be sent to the destination when one is given.

    • Ingest logs
    • Async Operation logs
    • Thin Egress App API Gateway logs (if configured)

    Additional Logs

If additional logs are needed, you can configure additional_log_groups_to_elk with the CloudWatch log groups you want to send to the destination. additional_log_groups_to_elk is a map whose keys are descriptors and whose values are the CloudWatch log group names.

    additional_log_groups_to_elk = {
    "HelloWorldTask" = "/aws/lambda/cumulus-example-HelloWorld"
    "MyCustomTask" = "my-custom-task-log-group"
    }
    - + \ No newline at end of file diff --git a/docs/v10.1.0/deployment/components/index.html b/docs/v10.1.0/deployment/components/index.html index 27f8c9a6d5d..0fcc2d4b25f 100644 --- a/docs/v10.1.0/deployment/components/index.html +++ b/docs/v10.1.0/deployment/components/index.html @@ -5,7 +5,7 @@ Component-based Cumulus Deployment | Cumulus Documentation - + @@ -39,7 +39,7 @@ Terraform at the same time.

    With remote state, Terraform writes the state data to a remote data store, which can then be shared between all members of a team.

    The recommended approach for handling remote state with Cumulus is to use the S3 backend. This backend stores state in S3 and uses a DynamoDB table for locking.

    See the deployment documentation for a walk-through of creating resources for your remote state using an S3 backend.

    - + \ No newline at end of file diff --git a/docs/v10.1.0/deployment/create_bucket/index.html b/docs/v10.1.0/deployment/create_bucket/index.html index 00382eefdb6..6d0fca46cbd 100644 --- a/docs/v10.1.0/deployment/create_bucket/index.html +++ b/docs/v10.1.0/deployment/create_bucket/index.html @@ -5,13 +5,13 @@ Creating an S3 Bucket | Cumulus Documentation - +
    Version: v10.1.0

    Creating an S3 Bucket

    Buckets can be created on the command line with AWS CLI or via the web interface on the AWS console.

    When creating a protected bucket (a bucket containing data which will be served through the distribution API), make sure to enable S3 server access logging. See S3 Server Access Logging for more details.

    Command line

Using the AWS CLI s3api create-bucket subcommand:

    $ aws s3api create-bucket \
    --bucket foobar-internal \
    --region us-west-2 \
    --create-bucket-configuration LocationConstraint=us-west-2
    {
    "Location": "/foobar-internal"
    }

    Note: The region and create-bucket-configuration arguments are only necessary if you are creating a bucket outside of the us-east-1 region.

Please note that security settings and other bucket options can be set via the options listed in the s3api documentation.

    Repeat the above step for each bucket to be created.
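
If you are creating several buckets, a small shell loop such as the following sketch can save some typing; the bucket names and region here are examples only and should be replaced with your own:

for bucket in foobar-internal foobar-private foobar-protected foobar-public; do
  aws s3api create-bucket \
    --bucket "$bucket" \
    --region us-west-2 \
    --create-bucket-configuration LocationConstraint=us-west-2
done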

    Web interface

    See: AWS "Creating a Bucket" documentation

    - + \ No newline at end of file diff --git a/docs/v10.1.0/deployment/cumulus_distribution/index.html b/docs/v10.1.0/deployment/cumulus_distribution/index.html index 531c9d3eaea..db83681c8d8 100644 --- a/docs/v10.1.0/deployment/cumulus_distribution/index.html +++ b/docs/v10.1.0/deployment/cumulus_distribution/index.html @@ -5,14 +5,14 @@ Using the Cumulus Distribution API | Cumulus Documentation - +
    Version: v10.1.0

    Using the Cumulus Distribution API

    The Cumulus Distribution API is a set of endpoints that can be used to enable AWS Cognito authentication when downloading data from S3.

    Configuring a Cumulus Distribution deployment

    The Cumulus Distribution API is included in the main Cumulus repo. It is available as part of the terraform-aws-cumulus.zip archive in the latest release.

    These steps assume you're using the Cumulus Deployment Template but can also be used for custom deployments.

    To configure a deployment to use Cumulus Distribution:

    1. Remove or comment the "Thin Egress App Settings" in the Cumulus Template Deploy and enable the Cumulus Distribution settings.
    2. Delete or comment the contents of thin_egress_app.tf and the corresponding Thin Egress App outputs in outputs.tf. These are not necessary for a Cumulus Distribution deployment.
    3. Uncomment the Cumulus Distribution outputs in outputs.tf.
    4. Rename cumulus-template-deploy/cumulus-tf/cumulus_distribution.tf.example to cumulus-template-deploy/cumulus-tf/cumulus_distribution.tf.

    Cognito Application and User Credentials

    The major prerequisite for using the Cumulus Distribution API is to set up Cognito. If operating within NGAP, this should already be done for you. If operating outside of NGAP, you must set up Cognito yourself, which is beyond the scope of this documentation.

    Given that Cognito is set up, in order to be able to download granule files via the Cumulus Distribution API, you must obtain Cognito user credentials, because any attempt to download such files (that will be, or have been, published to the CMR via your Cumulus deployment) will result in a prompt for you to supply Cognito user credentials. To obtain your own user credentials, talk to your product owner or scrum master for additional information. They should either know how to create the credentials, know who can create them for the team, or be the liaison to the Cognito team.

    Further, whoever helps to obtain your Cognito user credentials should also be able to supply you with the values for the following new variables that you must add to your cumulus-tf/terraform.tfvars file:

    • csdap_host_url: The URL of the Cognito service to which your Cumulus deployment will make Cognito API calls during a distribution (download) event
    • csdap_client_id: The client ID for the Cumulus application registered within the Cognito service
    • csdap_client_password: The client password for the Cumulus application registered within the Cognito service

    Although you might have to wait a bit for your Cognito user credentials, the remaining instructions do not depend upon having them, so you may continue with these instructions while waiting for your credentials.

    Cumulus Distribution URL

    Your Cumulus Distribution URL is used by Cumulus to generate download URLs as part of the granule metadata generated and published to the CMR. For example, a granule download URL will be of the form <distribution url>/<protected bucket>/<key> (or <distribution url>/path/to/file, if using a custom bucket map, as explained further below).

    By default, the value of your distribution URL is the URL of your private Cumulus Distribution API Gateway (the API Gateway named <prefix>-distribution, once you deploy the Cumulus Distribution module). Therefore, by default, the generated download URLs are private, and thus inaccessible directly, but there are 2 ways to address this issue (both of which are detailed below): (a) use tunneling (typically in development) or (b) put a CloudFront URL in front of your API Gateway (typically in production, and perhaps UAT and/or SIT).

    In either case, you must first know the default URL (i.e., the URL for the private Cumulus Distribution API Gateway). In order to obtain this default URL, you must first deploy your cumulus-tf module with the new Cumulus Distribution module, and once your initial deployment is complete, one of the Terraform outputs will be cumulus_distribution_api_uri, which is the URL for the private API Gateway.

    You may override this default URL by adding a cumulus_distribution_url variable to your cumulus-tf/terraform.tfvars file, and setting it to one of the following values (both of which are explained below):

    1. The default URL, but with a port added to it, in order to allow you to configure tunneling (typically only in development)
    2. A CloudFront URL placed in front of your Cumulus Distribution API Gateway (typically only for Production, but perhaps also for a UAT or SIT environment)

    The following subsections explain these approaches, in turn.

    Using your Cumulus Distribution API Gateway URL as your distribution URL

    Since your Cumulus Distribution API Gateway URL is private, the only way you can use it to confirm that your integration with Cognito is working is by using tunneling (again, generally for development), as described here. Here is an outline of the required steps, with details provided further below:

    1. Create/import a key pair into your AWS EC2 service (if you haven't already done so)
    2. Add a reference to the name of the key pair to your Terraform variables (we'll set the key_name Terraform variable)
    3. Choose an open local port on your machine (we'll use 9000 in the following details)
    4. Add a reference to the value of your cumulus_distribution_api_uri (mentioned earlier), including your chosen port (we'll set the cumulus_distribution_url Terraform variable)
    5. Redeploy Cumulus
    6. Add an entry to your /etc/hosts file
    7. Add a redirect URI to Cognito, via the Cognito API
    8. Install the Session Manager Plugin for the AWS CLI (if you haven't already done so; assuming you have already installed the AWS CLI)
    9. Add a sample file to S3 to test downloading via Cognito

To create a new key pair or import an existing one, you can use the AWS CLI (see aws ec2 import-key-pair) or the AWS Console (see Amazon EC2 key pairs and Linux instances).

    Once your key pair is added to AWS, add the following to your cumulus-tf/terraform.tfvars file:

    key_name = "<name>"
    cumulus_distribution_url = "https://<id>.execute-api.<region>.amazonaws.com:<port>/dev/"

    where:

    • <name> is the name of the key pair you just added to AWS
    • <id> and <region> are the corresponding parts from your cumulus_distribution_api_uri output variable
    • <port> is your open local port of choice (9000 is typically a good choice)

    Once you save your variable changes, redeploy your cumulus-tf module.

    While your deployment runs, add the following entry to your /etc/hosts file, replacing <hostname> with the host name of the cumulus_distribution_url Terraform variable you just added above:

    localhost <hostname>

    Next, you'll need to use the Cognito API to add the value of your cumulus_distribution_url Terraform variable as a Cognito redirect URI. To do so, use your favorite tool (e.g., curl, wget, Postman, etc.) to make a BasicAuth request to the Cognito API, using the following details:

    • method: POST
    • base URL: the value of your csdap_host_url Terraform variable
    • path: /authclient/updateRedirectUri
    • username: the value of your csdap_client_id Terraform variable
    • password: the value of your csdap_client_password Terraform variable
    • headers: Content-Type='application/x-www-form-urlencoded'
    • body: redirect_uri=<cumulus_distribution_url>/login

    where <cumulus_distribution_url> is the value of your cumulus_distribution_url Terraform variable. Note the /login path at the end of the redirect_uri value.
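
For example, using curl, the request might look like the following sketch, where the environment variables are placeholders for the Terraform variable values described above:

curl -X POST \
  --user "$CSDAP_CLIENT_ID:$CSDAP_CLIENT_PASSWORD" \
  --header "Content-Type: application/x-www-form-urlencoded" \
  --data-urlencode "redirect_uri=$CUMULUS_DISTRIBUTION_URL/login" \
  "$CSDAP_HOST_URL/authclient/updateRedirectUri"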

    For reference, see the Cognito Authentication Service API.

    Next, install the Session Manager Plugin for the AWS CLI. If running on macOS, and you use Homebrew, you can install it simply as follows:

    brew install --cask session-manager-plugin --no-quarantine

    As your final setup step, add a sample file to one of the protected buckets listed in your buckets Terraform variable in your cumulus-tf/terraform.tfvars file. The key for the S3 object doesn't matter, nor does it matter what file you use. All that matters is that the file is an S3 object in one of your protected buckets, because Cognito is triggered when attempting to download from one of those buckets.

    At this point, you should be ready to open a tunnel and attempt to download your sample file via your browser, summarized as follows:

    1. Determine your ec2 instance ID
    2. Connect to the NASA VPN
    3. Start an AWS SSM session
    4. Open an ssh tunnel
    5. Use a browser to navigate to your file

To determine your ec2 instance ID for your Cumulus deployment, run the following command, where <profile> is the name of the appropriate AWS profile to use, and <prefix> is the value of your prefix Terraform variable:

    aws --profile <profile> ec2 describe-instances --filters Name=tag:Deployment,Values=<prefix> Name=instance-state-name,Values=running --query "Reservations[0].Instances[].InstanceId" --output text

    IMPORTANT: Before proceeding with the remaining steps, make sure you're connected to the NASA VPN.

    Use the value output from the command above in place of <id> in the following command, which will start an SSM session:

    aws ssm start-session --target <id> --document-name AWS-StartPortForwardingSession --parameters portNumber=22,localPortNumber=6000

    If successful, you should see output similar to the following:

    Starting session with SessionId: NGAPShApplicationDeveloper-***
    Port 6000 opened for sessionId NGAPShApplicationDeveloper-***.
    Waiting for connections...

    Open another terminal window, and open a tunnel with port forwarding, using your chosen port from above (e.g., 9000):

    ssh -4 -p 6000 -N -L <port>:<api-gateway-host>:443 ec2-user@127.0.0.1

    where:

    • <port> is the open local port you chose earlier (e.g., 9000)
    • <api-gateway-host> is the hostname of your private API Gateway (i.e., the host portion of the URL you used as the value of your cumulus_distribution_url Terraform variable above)

    Finally, use your chosen browser to navigate to <cumulus_distribution_url>/<bucket>/<key>, where <bucket> and <key> reference the sample file you added to S3 above.

    If all goes well, you should be prompted for your Cognito username and password. If you have obtained your Cognito user credentials, enter them, followed by entering a code generated by the authenticator application you registered at the time you completed your Cognito registration process. Once your credentials and auth code are correctly supplied, after a few moments, the download process will begin.

    Once you're finished testing, clean up as follows:

    1. Kill your ssh tunnel (Ctrl-C)
    2. Kill your AWS SSM session (Ctrl-C)
3. If you like, disconnect from the NASA VPN

    While this is a relatively lengthy process, things are much easier when using CloudFront, such as in Production (OPS), SIT, or UAT, as explained next.

    Using a CloudFront URL as your distribution URL

    In Production (OPS), and perhaps in other environments, such as UAT and SIT, you'll need to provide a publicly accessible URL for users to use for downloading (distributing) granule files.

    This is generally done by placing a CloudFront URL in front of your private Cumulus Distribution API Gateway. In order to create such a CloudFront URL, contact the person who helped you obtain your Cognito credentials, and request a CloudFront URL with the following details:

    • The private, backing URL, which is the value of your cumulus_distribution_api_uri Terraform output value
    • A request to add the AWS account's VPC to the whitelist

    Once this request is completed, and you obtain the new CloudFront URL, override your default distribution URL with the CloudFront URL by adding the following to your cumulus-tf/terraform.tfvars file:

cumulus_distribution_url = "<cloudfront_url>"

    In addition, add a Cognito redirect URI, as detailed in the previous section. Note that in this case, the value you'll use for redirect_uri is <cloudfront_url>/login since the value of your cumulus_distribution_url is now your CloudFront URL.

    At this point, it is assumed that you have added the appropriate values for this environment for the variables described at the top (csdap_host_url, csdap_client_id, and csdap_client_password).

    Redeploy Cumulus with your new/updated Terraform variables.

    As your final setup step, add a sample file to one of the protected buckets listed in your buckets Terraform variable in your cumulus-tf/terraform.tfvars file. The key for the S3 object doesn't matter, nor does it matter what file you use. All that matters is that the file is an S3 object in one of your protected buckets, because Cognito is triggered when attempting to download from one of those buckets.

    Finally, use your chosen browser to navigate to <cumulus_distribution_url>/<bucket>/<key>, where <bucket> and <key> reference the sample file you added to S3.

    If all goes well, you should be prompted for your Cognito username and password. If you have obtained your Cognito user credentials, enter them, followed by entering a code generated by the authenticator application you registered at the time you completed your Cognito registration process. Once your credentials and auth code are correctly supplied, after a few moments, the download process will begin.

    S3 Bucket Mapping

    An S3 Bucket map allows users to abstract bucket names. If the bucket names change at any point, only the bucket map would need to be updated instead of every S3 link.

    The Cumulus Distribution API uses a bucket_map.yaml or bucket_map.yaml.tmpl file to determine which buckets to serve. See the examples.

    The default Cumulus module generates a file at s3://${system_bucket}/distribution_bucket_map.json.

    The configuration file is a simple json mapping of the form:

    {
    "daac-public-data-bucket": "/path/to/this/kind/of/data"
    }

    Note: Cumulus only supports a one-to-one mapping of bucket -> Cumulus Distribution path for 'distribution' buckets. Also, the bucket map must include mappings for all of the protected and public buckets specified in the buckets variable in cumulus-tf/terraform.tfvars, otherwise Cumulus may not be able to determine the correct distribution URL for ingested files and you may encounter errors.

    Switching from the Thin Egress App to Cumulus Distribution

    If you have previously deployed the Thin Egress App (TEA) as your distribution app, you can switch to Cumulus Distribution by following the steps above.

    Note, however, that the cumulus_distribution module will generate a bucket map cache and overwrite any existing bucket map caches created by TEA.

    There will also be downtime while your API gateway is updated.

    - + \ No newline at end of file diff --git a/docs/v10.1.0/deployment/index.html b/docs/v10.1.0/deployment/index.html index c351e9bcda1..cb822f156fa 100644 --- a/docs/v10.1.0/deployment/index.html +++ b/docs/v10.1.0/deployment/index.html @@ -5,7 +5,7 @@ How to Deploy Cumulus | Cumulus Documentation - + @@ -21,7 +21,7 @@ for deployment's EC2 instances and allows you to connect to them via SSH/SSM.

    Consider the sizing of your Cumulus instance when configuring your variables.

    Choose a distribution API

    Cumulus can be configured to use either the Thin Egress App (TEA) or the Cumulus Distribution API. The default selection is the Thin Egress App if you're using the Deployment Template.

    IMPORTANT! If you already have a deployment using the TEA distribution and want to switch to Cumulus Distribution, there will be an API Gateway change. This means that there will be downtime while you update your CloudFront endpoint to use the new API gateway.

    Configure the Thin Egress App

    The Thin Egress App can be used for Cumulus distribution and is the default selection. It allows authentication using Earthdata Login. Follow the steps in the documentation to configure distribution in your cumulus-tf deployment.

    Configure the Cumulus Distribution API (optional)

    If you would prefer to use the Cumulus Distribution API, which supports AWS Cognito authentication, follow these steps to configure distribution in your cumulus-tf deployment.

    Initialize Terraform

Follow the above instructions to initialize Terraform using terraform init [1].

    Deploy

    Run terraform apply to deploy the resources. Type yes when prompted to confirm that you want to create the resources. Assuming the operation is successful, you should see output like this:

    Apply complete! Resources: 292 added, 0 changed, 0 destroyed.

    Outputs:

    archive_api_redirect_uri = https://abc123.execute-api.us-east-1.amazonaws.com/dev/token
    archive_api_uri = https://abc123.execute-api.us-east-1.amazonaws.com/dev/
    distribution_redirect_uri = https://abc123.execute-api.us-east-1.amazonaws.com/DEV/login
    distribution_url = https://abc123.execute-api.us-east-1.amazonaws.com/DEV/

    Note: Be sure to copy the redirect URLs, as you will use them to update your Earthdata application.

    Update Earthdata Application

You will need to add two redirect URLs to your Earthdata Login application.

    1. Login to URS.
    2. Under My Applications -> Application Administration -> use the edit icon of your application.
    3. Under Manage -> redirect URIs, add the Archive API url returned from the stack deployment
      • e.g. archive_api_redirect_uri = https://<czbbkscuy6>.execute-api.us-east-1.amazonaws.com/dev/token.
    4. Also add the Distribution url
  • e.g. distribution_redirect_uri = https://<kido2r7kji>.execute-api.us-east-1.amazonaws.com/dev/login [2].
    5. You may delete the placeholder url you used to create the application.

If you've lost track of the needed redirect URIs, they can be located in API Gateway. Once there, select <prefix>-archive and/or <prefix>-thin-egress-app-EgressGateway, then Dashboard, and use the base URL at the top of the page that is accompanied by the text Invoke this API at:. Make sure to append /token for the archive URL and /login for the thin egress app URL.


    Deploy Cumulus dashboard

    Dashboard Requirements

    Please note that the requirements are similar to the Cumulus stack deployment requirements. The installation instructions below include a step that will install/use the required node version referenced in the .nvmrc file in the dashboard repository.

    Prepare AWS

    Create S3 bucket for dashboard:

    • Create it, e.g. <prefix>-dashboard. Use the command line or console as you did when preparing AWS configuration.
    • Configure the bucket to host a website:
      • AWS S3 console: Select <prefix>-dashboard bucket then, "Properties" -> "Static Website Hosting", point to index.html
      • CLI: aws s3 website s3://<prefix>-dashboard --index-document index.html
    • The bucket's url will be http://<prefix>-dashboard.s3-website-<region>.amazonaws.com or you can find it on the AWS console via "Properties" -> "Static website hosting" -> "Endpoint"
    • Ensure the bucket's access permissions allow your deployment user access to write to the bucket

    Install dashboard

    To install the dashboard, clone the Cumulus dashboard repository into the root deploy directory and install dependencies with npm install:

      git clone https://github.com/nasa/cumulus-dashboard
    cd cumulus-dashboard
    nvm use
    npm install

    If you do not have the correct version of node installed, replace nvm use with nvm install $(cat .nvmrc) in the above example.

    Dashboard versioning

    By default, the master branch will be used for dashboard deployments. The master branch of the dashboard repo contains the most recent stable release of the dashboard.

    If you want to test unreleased changes to the dashboard, use the develop branch.

    Each release/version of the dashboard will have a tag in the dashboard repo. Release/version numbers will use semantic versioning (major/minor/patch).

    To checkout and install a specific version of the dashboard:

      git fetch --tags
    git checkout <version-number> # e.g. v1.2.0
    nvm use
    npm install

    If you do not have the correct version of node installed, replace nvm use with nvm install $(cat .nvmrc) in the above example.

    Building the dashboard

    Note: These environment variables are available during the build: APIROOT, DAAC_NAME, STAGE, HIDE_PDR. Any of these can be set on the command line to override the values contained in config.js when running the build below.

To configure your dashboard for deployment, set the APIROOT environment variable to your app's API root [3].

    Build the dashboard from the dashboard repository root directory, cumulus-dashboard:

      APIROOT=<your_api_root> npm run build
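
If you need to override other build-time values, the additional environment variables noted above can be supplied the same way; the values in this sketch are illustrative only:

  APIROOT=<your_api_root> DAAC_NAME=MY-DAAC STAGE=production HIDE_PDR=false npm run build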

    Dashboard deployment

    Deploy dashboard to s3 bucket from the cumulus-dashboard directory:

    Using AWS CLI:

      aws s3 sync dist s3://<prefix>-dashboard --acl public-read

    From the S3 Console:

    • Open the <prefix>-dashboard bucket, click 'upload'. Add the contents of the 'dist' subdirectory to the upload. Then select 'Next'. On the permissions window allow the public to view. Select 'Upload'.

    You should be able to visit the dashboard website at http://<prefix>-dashboard.s3-website-<region>.amazonaws.com or find the url <prefix>-dashboard -> "Properties" -> "Static website hosting" -> "Endpoint" and login with a user that you configured for access in the Configure and Deploy the Cumulus Stack step.


    Cumulus Instance Sizing

The Cumulus deployment's default sizing for Elasticsearch instances, EC2 instances, and Autoscaling Groups is small and designed for testing and cost savings. The default settings are likely not suitable for production workloads. Sizing is highly individual and dependent on expected load and archive size.

    Please be cognizant of costs as any change in size will affect your AWS bill. AWS provides a pricing calculator for estimating costs.

    Elasticsearch

    The mappings file contains all of the data types that will be indexed into Elasticsearch. Elasticsearch sizing is tied to your archive size, including your collections, granules, and workflow executions that will be stored.

    AWS provides documentation on calculating and configuring for sizing.

In addition to size, you'll want to consider the number of nodes, which determines how the system reacts in the event of a failure.

    Configuration can be done in the data persistence module in elasticsearch_config and the cumulus module in es_index_shards.
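
As a rough sketch only (the module block labels, attribute names, and values below are illustrative and should be confirmed against the data-persistence and cumulus module variable definitions), such configuration might look like:

module "data_persistence" {
  # ... other variables

  elasticsearch_config = {
    domain_name    = "es"
    instance_count = 2
    instance_type  = "t2.small.elasticsearch"
    version        = "5.3"
    volume_size    = 10
  }
}

module "cumulus" {
  # ... other variables

  es_index_shards = 2
}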

    If you make changes to your Elasticsearch configuration you will need to reindex for those changes to take effect.

    EC2 instances and autoscaling groups

EC2 instances are used for long-running operations (e.g. generating a reconciliation report) and long-running workflow tasks. Configuration for your ECS cluster is achieved via Cumulus deployment variables.

When configuring your ECS cluster, consider the following (an illustrative configuration sketch follows this list):

    • The EC2 instance type and EBS volume size needed to accommodate your workloads. Configured as ecs_cluster_instance_type and ecs_cluster_instance_docker_volume_size.
    • The minimum and desired number of instances on hand to accommodate your workloads. Configured as ecs_cluster_min_size and ecs_cluster_desired_size.
    • The maximum number of instances you will need and are willing to pay for to accommodate your heaviest workloads. Configured as ecs_cluster_max_size.
    • Your autoscaling parameters: ecs_cluster_scale_in_adjustment_percent, ecs_cluster_scale_out_adjustment_percent, ecs_cluster_scale_in_threshold_percent, and ecs_cluster_scale_out_threshold_percent.
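
As referenced above, an illustrative sketch of these variables on the cumulus module might look like the following (all values are examples, not recommendations):

module "cumulus" {
  # ... other variables

  # Instance type and Docker volume size for ECS cluster instances
  ecs_cluster_instance_type               = "t3.medium"
  ecs_cluster_instance_docker_volume_size = 50

  # Cluster size limits
  ecs_cluster_min_size     = 1
  ecs_cluster_desired_size = 1
  ecs_cluster_max_size     = 2

  # Autoscaling behavior
  ecs_cluster_scale_in_adjustment_percent  = -5
  ecs_cluster_scale_out_adjustment_percent = 10
  ecs_cluster_scale_in_threshold_percent   = 25
  ecs_cluster_scale_out_threshold_percent  = 75
}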

    Footnotes


    1. Run terraform init if:

      • This is the first time deploying the module
      • You have added any additional child modules, including Cumulus components
      • You have updated the source for any of the child modules

2. To add another redirect URI to your application: on the Earthdata home page, select "My Applications", scroll down to "Application Administration", and use the edit icon for your application. Then Manage -> Redirect URIs.

3. The API root can be found in a number of ways. The easiest is to note it in the output of the app deployment step. But you can also find it from the AWS console -> Amazon API Gateway -> APIs -> <prefix>-archive -> Dashboard, by reading the URL at the top after "Invoke this API at".

    - + \ No newline at end of file diff --git a/docs/v10.1.0/deployment/postgres_database_deployment/index.html b/docs/v10.1.0/deployment/postgres_database_deployment/index.html index 8e7e589eaf3..8f0cef77950 100644 --- a/docs/v10.1.0/deployment/postgres_database_deployment/index.html +++ b/docs/v10.1.0/deployment/postgres_database_deployment/index.html @@ -5,7 +5,7 @@ PostgreSQL Database Deployment | Cumulus Documentation - + @@ -16,7 +16,7 @@ cumulus-rds-tf that will deploy an AWS RDS Aurora Serverless PostgreSQL 10.2 compatible database cluster, and optionally provision a single deployment database with credentialed secrets for use with Cumulus.

    We have provided an example terraform deployment using this module in the Cumulus template-deploy repository on github.

    Use of this example involves:

    • Creating/configuring a Terraform module directory
    • Using Terraform to deploy resources to AWS

    Requirements

    Configuration/installation of this module requires the following:

    • Terraform
    • git
    • A VPC configured for use with Cumulus Core. This should match the subnets you provide when Deploying Cumulus to allow Core's lambdas to properly access the database.
    • At least two subnets across multiple AZs. These should match the subnets you provide as configuration when Deploying Cumulus, and should be within the same VPC.

    Needed Git Repositories

    Assumptions

    OS/Environment

    The instructions in this module require Linux/MacOS. While deployment via Windows is possible, it is unsupported.

    Terraform

    This document assumes knowledge of Terraform. If you are not comfortable working with Terraform, the following links should bring you up to speed:

    For Cumulus specific instructions on installation of Terraform, refer to the main Cumulus Installation Documentation

    Aurora/RDS

    This document also assumes some basic familiarity with PostgreSQL databases, and Amazon Aurora/RDS. If you're unfamiliar consider perusing the AWS docs, and the Aurora Serverless V1 docs.

    Prepare deployment repository

If you are already working with an existing repository that has a configured rds-cluster-tf deployment for the version of Cumulus you intend to deploy or update, or if you just need to configure this module for your repository, skip to Prepare AWS configuration.

    Clone the cumulus-template-deploy repo and name appropriately for your organization:

      git clone https://github.com/nasa/cumulus-template-deploy <repository-name>

    We will return to configuring this repo and using it for deployment below.

    Optional: Create a new repository

    Create a new repository on Github so that you can add your workflows and other modules to source control:

      git remote set-url origin https://github.com/<org>/<repository-name>
    git push origin master

    You can then add/commit changes as needed.

    Note: If you are pushing your deployment code to a git repo, make sure to add terraform.tf and terraform.tfvars to .gitignore, as these files will contain sensitive data related to your AWS account.


    Prepare AWS configuration

To deploy this module, make sure that you have completed the following steps from the Cumulus deployment instructions in a similar fashion for this module:

    --

    Configure and deploy the module

When configuring this module, please keep in mind that unlike the Cumulus deployment, this module should be deployed once to create the database cluster, and redeployed thereafter only to make changes to that configuration, perform upgrades, etc. This module does not need to be re-deployed for each Core update.

    These steps should be executed in the rds-cluster-tf directory of the template deploy repo that you previously cloned. Run the following to copy the example files:

    cd rds-cluster-tf/
    cp terraform.tf.example terraform.tf
    cp terraform.tfvars.example terraform.tfvars

    In terraform.tf, configure the remote state settings by substituting the appropriate values for:

    • bucket
    • dynamodb_table
    • PREFIX (whatever prefix you've chosen for your deployment)

    Fill in the appropriate values in terraform.tfvars. See the rds-cluster-tf module variable definitions for more detail on all of the configuration options. A few notable configuration options are documented in the next section.

    Configuration Options

    • deletion_protection -- defaults to true. Set it to false if you want to be able to delete your cluster with a terraform destroy without manually updating the cluster.
    • db_admin_username -- cluster database administration username. Defaults to postgres.
• db_admin_password -- required variable that specifies the admin user password for the cluster. To randomize this on each deployment, consider using a random_string resource as input, as shown in the sketch after this list.
    • region -- defaults to us-east-1.
    • subnets -- requires at least 2 across different AZs. For use with Cumulus, these AZs should match the values you configure for your lambda_subnet_ids.
    • max_capacity -- the max ACUs the cluster is allowed to use. Carefully consider cost/performance concerns when setting this value.
    • min_capacity -- the minimum ACUs the cluster will scale to
    • provision_user_database -- Optional flag to allow module to provision a user database in addition to creating the cluster. Described in the next section.
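
As mentioned for db_admin_password above, a minimal sketch of using a random_string resource as the password input might look like this (the module block label and elided variables are placeholders for your rds-cluster-tf configuration):

resource "random_string" "db_admin_password" {
  length  = 50
  upper   = true
  special = false
}

module "rds_cluster" {
  # ... other rds-cluster-tf variables

  db_admin_username = "postgres"
  db_admin_password = random_string.db_admin_password.result
}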

    Provision user and user database

If you wish for the module to provision a PostgreSQL database on your new cluster and provide an access secret in the module output, in addition to managing the cluster itself, the following configuration keys are required (a minimal example follows this list):

    • provision_user_database -- must be set to true, this configures the module to deploy a lambda that will create the user database, and update the provided configuration on deploy.
• permissions_boundary_arn -- the permissions boundary to use when creating the roles the provisioning lambda will need for access. This should in most use cases be the same one used for the Cumulus Core deployment.
    • rds_user_password -- the value to set the user password to
• prefix -- this value will be used to set a unique identifier for the ProvisionDatabase lambda, as well as name the provisioned user/database.
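
As referenced above, a minimal terraform.tfvars sketch using these keys might look like the following (all values are placeholders):

provision_user_database  = true
permissions_boundary_arn = "arn:aws:iam::123456789012:policy/YourPermissionsBoundary"
rds_user_password        = "replace-with-a-secure-password"
prefix                   = "my-cumulus-prefix"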

    Once configured, the module will deploy the lambda, and run it on each provision, creating the configured database if it does not exist, updating the user password if that value has been changed, and updating the output user database secret.

    Setting provision_user_database to false after provisioning will not result in removal of the configured database, as the lambda is non-destructive as configured in this module.

    Please Note: This functionality is limited in that it will only provision a single database/user and configure a basic database, and should not be used in scenarios where more complex configuration is required.

    Initialize Terraform

    Run terraform init

    You should see output like:

    * provider.aws: version = "~> 2.32"

    Terraform has been successfully initialized!

    Deploy

    Run terraform apply to deploy the resources.

If re-applying this module, variables (e.g. engine_version, snapshot_identifier) that force a recreation of the database cluster may result in data loss if deletion protection is disabled. Examine the changeset carefully for resources that will be re-created/destroyed before applying.

    Review the changeset, and assuming it looks correct, type yes when prompted to confirm that you want to create all of the resources.

    Assuming the operation is successful, you should see output similar to the following (this example omits the creation of a user database/lambdas/security groups):

    terraform apply

    An execution plan has been generated and is shown below.
    Resource actions are indicated with the following symbols:
    + create

    Terraform will perform the following actions:

    # module.rds_cluster.aws_db_subnet_group.default will be created
    + resource "aws_db_subnet_group" "default" {
    + arn = (known after apply)
    + description = "Managed by Terraform"
    + id = (known after apply)
    + name = (known after apply)
    + name_prefix = "xxxxxxxxx"
    + subnet_ids = [
    + "subnet-xxxxxxxxx",
    + "subnet-xxxxxxxxx",
    ]
    + tags = {
    + "Deployment" = "xxxxxxxxx"
    }
    }

    # module.rds_cluster.aws_rds_cluster.cumulus will be created
    + resource "aws_rds_cluster" "cumulus" {
    + apply_immediately = true
    + arn = (known after apply)
    + availability_zones = (known after apply)
    + backup_retention_period = 1
    + cluster_identifier = "xxxxxxxxx"
    + cluster_identifier_prefix = (known after apply)
    + cluster_members = (known after apply)
    + cluster_resource_id = (known after apply)
    + copy_tags_to_snapshot = false
    + database_name = "xxxxxxxxx"
    + db_cluster_parameter_group_name = (known after apply)
    + db_subnet_group_name = (known after apply)
    + deletion_protection = true
    + enable_http_endpoint = true
    + endpoint = (known after apply)
    + engine = "aurora-postgresql"
    + engine_mode = "serverless"
    + engine_version = "10.12"
    + final_snapshot_identifier = "xxxxxxxxx"
    + hosted_zone_id = (known after apply)
    + id = (known after apply)
    + kms_key_id = (known after apply)
    + master_password = (sensitive value)
    + master_username = "xxxxxxxxx"
    + port = (known after apply)
    + preferred_backup_window = "07:00-09:00"
    + preferred_maintenance_window = (known after apply)
    + reader_endpoint = (known after apply)
    + skip_final_snapshot = false
    + storage_encrypted = (known after apply)
    + tags = {
    + "Deployment" = "xxxxxxxxx"
    }
    + vpc_security_group_ids = (known after apply)

    + scaling_configuration {
    + auto_pause = true
    + max_capacity = 4
    + min_capacity = 2
    + seconds_until_auto_pause = 300
    + timeout_action = "RollbackCapacityChange"
    }
    }

    # module.rds_cluster.aws_secretsmanager_secret.rds_login will be created
    + resource "aws_secretsmanager_secret" "rds_login" {
    + arn = (known after apply)
    + id = (known after apply)
    + name = (known after apply)
    + name_prefix = "xxxxxxxxx"
    + policy = (known after apply)
    + recovery_window_in_days = 30
    + rotation_enabled = (known after apply)
    + rotation_lambda_arn = (known after apply)
    + tags = {
    + "Deployment" = "xxxxxxxxx"
    }

    + rotation_rules {
    + automatically_after_days = (known after apply)
    }
    }

    # module.rds_cluster.aws_secretsmanager_secret_version.rds_login will be created
    + resource "aws_secretsmanager_secret_version" "rds_login" {
    + arn = (known after apply)
    + id = (known after apply)
    + secret_id = (known after apply)
    + secret_string = (sensitive value)
    + version_id = (known after apply)
    + version_stages = (known after apply)
    }

    # module.rds_cluster.aws_security_group.rds_cluster_access will be created
    + resource "aws_security_group" "rds_cluster_access" {
    + arn = (known after apply)
    + description = "Managed by Terraform"
    + egress = (known after apply)
    + id = (known after apply)
    + ingress = (known after apply)
    + name = (known after apply)
    + name_prefix = "cumulus_rds_cluster_access_ingress"
    + owner_id = (known after apply)
    + revoke_rules_on_delete = false
    + tags = {
    + "Deployment" = "xxxxxxxxx"
    }
    + vpc_id = "vpc-xxxxxxxxx"
    }

    # module.rds_cluster.aws_security_group_rule.rds_security_group_allow_PostgreSQL will be created
    + resource "aws_security_group_rule" "rds_security_group_allow_postgres" {
    + from_port = 5432
    + id = (known after apply)
    + protocol = "tcp"
    + security_group_id = (known after apply)
    + self = true
    + source_security_group_id = (known after apply)
    + to_port = 5432
    + type = "ingress"
    }

    Plan: 6 to add, 0 to change, 0 to destroy.

    Do you want to perform these actions?
    Terraform will perform the actions described above.
    Only 'yes' will be accepted to approve.

    Enter a value: yes

    module.rds_cluster.aws_db_subnet_group.default: Creating...
    module.rds_cluster.aws_security_group.rds_cluster_access: Creating...
    module.rds_cluster.aws_secretsmanager_secret.rds_login: Creating...

    Then, after the resources are created:

    Apply complete! Resources: X added, 0 changed, 0 destroyed.
    Releasing state lock. This may take a few moments...

    Outputs:

    admin_db_login_secret_arn = arn:aws:secretsmanager:us-east-1:xxxxxxxxx:secret:xxxxxxxxxx20210407182709367700000002-dpmdR
    admin_db_login_secret_version = xxxxxxxxx
    rds_endpoint = xxxxxxxxx.us-east-1.rds.amazonaws.com
    security_group_id = xxxxxxxxx
    user_credentials_secret_arn = arn:aws:secretsmanager:us-east-1:xxxxx:secret:xxxxxxxxxx20210407182709367700000002-dpmpXA

    Note the output values for admin_db_login_secret_arn (and optionally user_credentials_secret_arn) as these provide the AWS Secrets Manager secret required to access the database as the administrative user and, optionally, the user database credentials Cumulus requires as well.

The content of each of these secrets is of the form:

    {
    "database": "postgres",
    "dbClusterIdentifier": "clusterName",
    "engine": "postgres",
    "host": "xxx",
    "password": "defaultPassword",
    "port": 5432,
    "username": "xxx"
    }
    • database -- the PostgreSQL database used by the configured user
    • dbClusterIdentifier -- the value set by the cluster_identifier variable in the terraform module
    • engine -- the Aurora/RDS database engine
    • host -- the RDS service host for the database in the form (dbClusterIdentifier)-(AWS ID string).(region).rds.amazonaws.com
    • password -- the database password
    • username -- the account username
• port -- the database connection port; this should always be 5432

    Next Steps

    The database cluster has been created/updated! From here you can continue to add additional user accounts, databases and other database configuration.

    - + \ No newline at end of file diff --git a/docs/v10.1.0/deployment/share-s3-access-logs/index.html b/docs/v10.1.0/deployment/share-s3-access-logs/index.html index 056b838b97d..f1c42faf3be 100644 --- a/docs/v10.1.0/deployment/share-s3-access-logs/index.html +++ b/docs/v10.1.0/deployment/share-s3-access-logs/index.html @@ -5,14 +5,14 @@ Share S3 Access Logs | Cumulus Documentation - +
    Version: v10.1.0

    Share S3 Access Logs

It is possible through Cumulus to share S3 access logs across accounts and buckets using the S3 replicator package.

    S3 Replicator

    The S3 Replicator is a node package that contains a simple lambda function, associated permissions, and the Terraform instructions to replicate create-object events from one S3 bucket to another.

    First ensure that you have enabled S3 Server Access Logging.

    Next configure your config.tfvars as described in the s3-replicator/README.md to correspond to your deployment. The source_bucket and source_prefix are determined by how you enabled the S3 Server Access Logging.
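
Based on the variable references in the module block below, a config.tfvars entry might look something like this sketch (the variable structure and values here are assumptions; follow the s3-replicator/README.md for the authoritative format):

s3_replicator_config = {
  source_bucket = "<s3-server-access-logs-bucket>"
  source_prefix = "<s3-server-access-logs-prefix>"
  target_bucket = "<target-bucket>"
  target_prefix = "<target-prefix>"
}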

In order to deploy the s3-replicator with Cumulus, you will need to add the module to your Terraform main.tf definition, e.g.:

    module "s3-replicator" {
    source = "<path to s3-replicator.zip>"
    prefix = var.prefix
    vpc_id = var.vpc_id
    subnet_ids = var.subnet_ids
    permissions_boundary = var.permissions_boundary_arn
    source_bucket = var.s3_replicator_config.source_bucket
    source_prefix = var.s3_replicator_config.source_prefix
    target_bucket = var.s3_replicator_config.target_bucket
    target_prefix = var.s3_replicator_config.target_prefix
    }

The Terraform source package can be found on the Cumulus GitHub release page, under the Assets tab, as terraform-aws-cumulus-s3-replicator.zip.

    ESDIS Metrics

    In the NGAP environment, the ESDIS Metrics team has set up an ELK stack to process logs from Cumulus instances. To use this system, you must deliver any S3 Server Access logs that Cumulus creates.

    Configure the S3 replicator as described above using the target_bucket and target_prefix provided by the metrics team.

    The metrics team has taken care of setting up Logstash to ingest the files that get delivered to their bucket into their Elasticsearch instance.

    - + \ No newline at end of file diff --git a/docs/v10.1.0/deployment/terraform-best-practices/index.html b/docs/v10.1.0/deployment/terraform-best-practices/index.html index bce10622d12..62dbd3ed83a 100644 --- a/docs/v10.1.0/deployment/terraform-best-practices/index.html +++ b/docs/v10.1.0/deployment/terraform-best-practices/index.html @@ -5,7 +5,7 @@ Terraform Best Practices | Cumulus Documentation - + @@ -88,7 +88,7 @@ AWS CLI command, replacing PREFIX with your deployment prefix name:

    aws resourcegroupstaggingapi get-resources \
    --query "ResourceTagMappingList[].ResourceARN" \
    --tag-filters Key=Deployment,Values=PREFIX

    Ideally, the output should be an empty list, but if it is not, then you may need to manually delete the listed resources.

    Configuring the Cumulus deployment: link Restoring a previous version: link

    Version: v10.1.0

    Using the Thin Egress App for Cumulus distribution

    The Thin Egress App (TEA) is an app running in Lambda that allows retrieving data from S3 using temporary links and provides URS integration.

    Configuring a TEA deployment

    TEA is deployed using Terraform modules. Refer to these instructions for guidance on how to integrate new components with your deployment.

    The cumulus-template-deploy repository cumulus-tf/main.tf contains a thin_egress_app for distribution.

    The TEA module provides these instructions for adding it to your deployment; the following sections describe how to configure the thin_egress_app module in your Cumulus deployment.

    Create a secret for signing Thin Egress App JWTs

    The Thin Egress App uses JWTs internally to authenticate requests and requires a secret stored in AWS Secrets Manager containing SSH keys that are used to sign the JWTs.

    See the Thin Egress App documentation on how to create this secret with the correct values. It will be used later to set the thin_egress_jwt_secret_name variable when deploying the Cumulus module.

    bucket_map.yaml

    The Thin Egress App uses a bucket_map.yaml file to determine which buckets to serve. Documentation of the file format is available here.

    The default Cumulus module generates a file at s3://${system_bucket}/distribution_bucket_map.json.
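
    You can inspect the generated file directly, for example (a sketch; substitute your deployment's system bucket name):

    # stream the generated bucket map to stdout
    aws s3 cp s3://<system_bucket>/distribution_bucket_map.json -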

    The configuration file is a simple json mapping of the form:

    {
    "daac-public-data-bucket": "/path/to/this/kind/of/data"
    }

    Please note: Cumulus only supports a one-to-one mapping of bucket->TEA path for 'distribution' buckets.

    Optionally configure a custom bucket map

    A simple config would look something like this:

    bucket_map.yaml
    MAP:
    my-protected: my-protected
    my-public: my-public

    PUBLIC_BUCKETS:
    - my-public

    Please note: your custom bucket map must include mappings for all of the protected and public buckets specified in the buckets variable in cumulus-tf/terraform.tfvars, otherwise Cumulus may not be able to determine the correct distribution URL for ingested files and you may encounter errors.

    Optionally configure shared variables

    The cumulus module deploys certain components that interact with TEA. As a result, the cumulus module requires that if you are specifying a value for the stage_name variable to the TEA module, you must use the same value for the tea_api_gateway_stage variable to the cumulus module.

    One way to keep these variable values in sync across the modules is to use Terraform local values to define values to use for the variables for both modules. This approach is shown in the Cumulus core example deployment code.

    Version: v10.1.0

    Upgrading Cumulus

    After the upgrade, verify that your deployment functions correctly. Please refer to some recommended smoke tests given above, and consider additional tests appropriate for your particular deployment and environment.

    Update Cumulus Dashboard

    If there are breaking (or otherwise significant) changes to the Cumulus API, you should also upgrade your Cumulus Dashboard deployment to use the version of the Cumulus API matching the version of Cumulus to which you are migrating.

    Version: v10.1.0

    Issuing PR From Forked Repos

    Fork the Repo

    • Fork the Cumulus repo
    • Create a new branch from the branch you'd like to contribute to
    • If an issue doesn't already exist, submit one (see above)

    Create a Pull Request

    Reviewing PRs from Forked Repos

    Upon submission of a pull request, the Cumulus development team will review the code.

    Once the code passes an initial review, the team will run the CI tests against the proposed update.

    The request will then either be merged, declined, or an adjustment to the code will be requested via the issue opened with the original PR request.

    PRs from forked repos cannot be merged directly to master. Cumulus reviewers must follow these steps before completing the review process (a minimal command sketch follows these steps):

    1. Create a new branch:

        git checkout -b from-<name-of-the-branch> master
    2. Push the new branch to GitHub

    3. Change the destination of the forked PR to the new branch that was just pushed

      Screenshot of Github interface showing how to change the base branch of a pull request

    4. After code review and approval, merge the forked PR to the new branch.

    5. Create a PR for the new branch to master.

    6. If the CI tests pass, merge the new branch to master and close the issue. If the CI tests do not pass, request an amended PR from the original author, or resolve the failures as appropriate.
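
    A minimal command sketch for steps 1, 2, and 5, assuming the contributor's branch is named feature-xyz (a placeholder), might look like:

    # Step 1: create a review branch based on master
    git checkout -b from-feature-xyz master

    # Step 2: push the new branch to GitHub
    git push origin from-feature-xyz

    # Step 5 (after the forked PR has been merged into from-feature-xyz):
    # open a PR from from-feature-xyz to master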

    Version: v10.1.0

    Integration Tests

    If you create a new stack and want to be able to run integration tests against it in CI, you will need to add it to bamboo/select-stack.js.

    Version: v10.1.0

    Code Coverage and Quality

    To run linting on the markdown files, run npm run lint-md.

    Audit

    This project uses audit-ci to run a security audit on the package dependency tree. This must pass prior to merge. The configured rules for audit-ci can be found here.

    To execute an audit, run npm run audit.

    Version: v10.1.0

    Versioning and Releases

    It's useful to use the search feature of your code editor or grep to see if there are any references to the old package versions. In a bash shell you can run:

    find . -name package.json -exec grep -nH "@cumulus/.*MAJOR\.MINOR\.PATCH.*" {} \;

    Verify that each of those is updated to the new MAJOR.MINOR.PATCH version you are trying to release.

    A similar search for alpha and beta versions should be run on the release version and any problems should be fixed.

    find . -name package.json -exec grep -nHE "MAJOR\.MINOR\.PATCH.*(alpha|beta)" {} \;

    3. Check Cumulus Dashboard PRs for Version Bump

    There may be unreleased changes in the Cumulus Dashboard project that rely on this unreleased Cumulus Core version.

    If a PR exists in the cumulus-dashboard repo with a name containing "Version Bump for Next Cumulus API Release":

    • There will be a placeholder change-me value that should be replaced with the Cumulus Core to-be-released-version.
    • Mark that PR as ready to be reviewed.

    4. Update CHANGELOG.md

    Update the CHANGELOG.md. Put a header under the Unreleased section with the new version number and the date.

    Add a link reference for the github "compare" view at the bottom of the CHANGELOG.md, following the existing pattern. This link reference should create a link in the CHANGELOG's release header to changes in the corresponding release.

    5. Update DATA_MODEL_CHANGELOG.md

    Similar to #4, make sure the DATA_MODEL_CHANGELOG is updated if there are data model changes in the release, and the link reference at the end of the document is updated as appropriate.

    6. Update CONTRIBUTORS.md

    ./bin/update-contributors.sh
    git add CONTRIBUTORS.md

    Commit and push these changes, if any.

    7. Update Cumulus package API documentation

    Update auto-generated API documentation for any Cumulus packages that have it:

    npm run docs-build-packages

    Commit and push these changes, if any.

    8. Cut new version of Cumulus Documentation

    If this is a backport, do not create a new version of the documentation. For various reasons, we do not merge backports back to master, other than changelog notes. Documentation changes for backports will not be published to our documentation website.

    cd website
    npm run version ${release_version}
    git add .

    Where ${release_version} corresponds to the version tag v1.2.3, for example.

    Commit and push these changes.

    9. Create a pull request against the minor version branch

    1. Push the release branch (e.g. release-1.2.3) to GitHub.

    2. Create a PR against the minor version base branch (e.g. release-1.2.x).

    3. Configure Bamboo to run automated tests against this PR by finding the branch plan for the release branch (release-1.2.3) and setting only these variables:

      • GIT_PR: true
      • SKIP_AUDIT: true

      IMPORTANT: Do NOT set the PUBLISH_FLAG variable to true for this branch plan. The actual publishing of the release will be handled by a separate, manually triggered branch plan.

      Screenshot of Bamboo CI interface showing the configuration of the GIT_PR branch variable to have a value of "true"

    4. Verify that the Bamboo build for the PR succeeds and then merge to the minor version base branch (release-1.2.x).

      • It is safe to do a squash merge in this instance, but not required
    5. You may delete your release branch (release-1.2.3) after merging to the base branch.

    10. Create a git tag for the release

    Check out the minor version base branch (release-1.2.x) now that your changes are merged in and do a git pull.

    Ensure you are on the latest commit.

    Create and push a new git tag:

        git tag -a vMAJOR.MINOR.PATCH -m "Release MAJOR.MINOR.PATCH"
    git push origin vMAJOR.MINOR.PATCH

    e.g.:
    git tag -a v9.1.0 -m "Release 9.1.0"
    git push origin v9.1.0

    11. Publishing the release

    Publishing of new releases is handled by a custom Bamboo branch plan and is manually triggered.

    The reasons for using a separate branch plan to handle releases instead of the branch plan for the minor version (e.g. release-1.2.x) are:

    • The Bamboo build for the minor version release branch is triggered automatically on any commits to that branch, whereas we want to manually control when the release is published.
    • We want to verify that integration tests have passed on the Bamboo build for the minor version release branch before we manually trigger the release, so that we can be sure that our code is safe to release.

    If this is a new minor version branch, then you will need to create a new Bamboo branch plan for publishing the release following the instructions below:

    Creating a Bamboo branch plan for the release

    • In the Cumulus Core project (https://ci.earthdata.nasa.gov/browse/CUM-CBA), click Actions -> Configure Plan in the top right.

    • Next to Plan branch click the rightmost button that displays Create Plan Branch upon hover.

    • Click Create plan branch manually.

    • Add the values in that list. Choose a display name that makes it very clear this is a deployment branch plan. Release (minor version branch name) seems to work well (e.g. Release (1.2.x)).

      • Make sure you enter the correct branch name (e.g. release-1.2.x).
    • Important Deselect Enable Branch - if you do not do this, it will immediately fire off a build.

    • Do this immediately: On the Branch Details page, enable Change trigger. Set the Trigger type to manual; this will prevent commits to the branch from triggering the build plan. You should have been redirected to the Branch Details tab after creating the plan. If not, navigate to the branch from the list where you clicked Create Plan Branch in the previous step.

    • Go to the Variables tab. Ensure that you are on your branch plan and not the master plan: you should not see a large list of configured variables, but instead a dropdown allowing you to select variables to override, and the tab title will be Branch Variables. Then set the branch variables as follows:

      • DEPLOYMENT: cumulus-from-npm-tf (except in special cases such as incompatible backport branches)
        • If this variable is not set, it will default to the deployment name for the last committer on the branch
      • USE_CACHED_BOOTSTRAP: false
      • USE_TERRAFORM_ZIPS: true (IMPORTANT: MUST be set in order to run integration tests against the .zip files published during the build so that we are actually testing our released files)
      • GIT_PR: true
      • SKIP_AUDIT: true
      • PUBLISH_FLAG: true
    • Enable the branch from the Branch Details page.

    • Run the branch using the Run button in the top right.

    Bamboo will build and run lint and unit tests against that tagged release, publish the new packages to NPM, and then run the integration tests using those newly released packages.

    12. Create a new Cumulus release on github

    The CI release scripts will automatically create a GitHub release based on the release version tag, as well as upload artifacts to the Github release for the Terraform modules provided by Cumulus. The Terraform release artifacts include:

    • A multi-module Terraform .zip artifact containing filtered copies of the tf-modules, packages, and tasks directories for use as Terraform module sources.
    • An S3 replicator module
    • A workflow module
    • A distribution API module
    • An ECS service module

    Just make sure to verify the appropriate .zip files are present on Github after the release process is complete.

    13. Merge base branch back to master

    Finally, you need to reproduce the version update changes back to master.

    If this is the latest version, you can simply create a PR to merge the minor version base branch back to master.

    Do not merge master back into the release branch since we want the release branch to just have the code from the release. Instead, create a new branch off of the release branch and merge that to master. You can freely merge master into this branch and delete it when it is merged to master.

    If this is a backport, you will need to create a PR that ports the changelog updates back to master. It is important in this changelog note to call it out as a backport. For example, fixes in backport version 1.14.5 may not be available in 1.15.0 because the fix was introduced in 1.15.3.

    Troubleshooting

    Delete and regenerate the tag

    To delete a published tag to re-tag, follow these steps:

      git tag -d vMAJOR.MINOR.PATCH
    git push -d origin vMAJOR.MINOR.PATCH

    e.g.:
    git tag -d v9.1.0
    git push -d origin v9.1.0
    Version: v10.1.0

    Cumulus Documentation: How To's

    Cumulus Docs Installation

    Run a Local Server

    Environment variables DOCSEARCH_API_KEY and DOCSEARCH_INDEX_NAME must be set for search to work. At the moment, search is only truly functional on prod because that is the only website we have registered to be indexed with DocSearch (see below on search).
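
    For example, you might export these before starting the local server (a sketch; placeholder values shown):

    export DOCSEARCH_API_KEY=<your-docsearch-api-key>
    export DOCSEARCH_INDEX_NAME=<your-docsearch-index-name>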

    git clone git@github.com:nasa/cumulus
    cd cumulus
    npm run docs-install
    npm run docs-serve

    Note: docs-build will build the documents into website/build.

    Cumulus Documentation

    Our project documentation is hosted on GitHub Pages. The resources published to this website are housed in the docs/ directory at the top of the Cumulus repository. Those resources primarily consist of markdown files and images.

    We use the open-source static website generator Docusaurus to build html files from our markdown documentation, add some organization and navigation, and provide some other niceties in the final website (search, easy templating, etc.).

    Add a New Page and Sidebars

    Adding a new page should be as simple as writing some documentation in markdown, placing it under the correct directory in the docs/ folder and adding some configuration values wrapped by --- at the top of the file. There are many files that already have this header which can be used as reference.

    ---
    id: doc-unique-id # unique id for this document. This must be unique across ALL documentation under docs/
    title: Title Of Doc # Whatever title you feel like adding. This will show up as the index to this page on the sidebar.
    hide_title: false
    ---

    Note: To have the new page show up in a sidebar the designated id must be added to a sidebar in the website/sidebars.js file. Docusaurus has an in depth explanation of sidebars here.

    Versioning Docs

    We lean heavily on Docusaurus for versioning. Their suggestions and walk-through can be found here. It is worth noting that we would like the Documentation versions to match up directly with release versions. Cumulus versioning is explained in the Versioning Docs.

    Search

    Search on our documentation site is taken care of by DocSearch. We have been provided with an apiKey and an indexName by DocSearch that we include in our website/siteConfig.js file. The rest, indexing and actual searching, we leave to DocSearch. Our builds expect environment variables for both these values to exist - DOCSEARCH_API_KEY and DOCSEARCH_INDEX_NAME.

    Add a new task

    The tasks list in docs/tasks.md is generated from the list of task packages in the tasks folder. Do not edit the docs/tasks.md file directly.

    Read more about adding a new task.

    Editing the tasks.md header or template

    Look at the bin/build-tasks-doc.js and bin/tasks-header.md files to edit the output of the tasks build script.

    Editing diagrams

    For some diagrams included in the documentation, the raw source is included in the docs/assets/raw directory to allow for easy updating in the future:

    • assets/interfaces.svg -> assets/raw/interfaces.drawio (generated using draw.io)

    Deployment

    The master branch is automatically built and deployed to gh-pages branch. The gh-pages branch is served by Github Pages. Do not make edits to the gh-pages branch.

    Version: v10.1.0

    External Contributions

    Contributions to Cumulus may be made in the form of PRs to the repositories directly or through externally developed tasks and components. Cumulus is designed as an ecosystem that leverages Terraform deployments and AWS Step Functions to easily integrate external components.

    This list may not be exhaustive and represents components that are open source, owned externally, and that have been tested with the Cumulus system. For more information and contributing guidelines, visit the respective GitHub repositories.

    Distribution

    The ASF Thin Egress App is used by Cumulus for distribution. TEA can be deployed with Cumulus or as part of other applications to distribute data.

    Operational Cloud Recovery Archive (ORCA)

    ORCA can be deployed with Cumulus to provide a customizable baseline for creating and managing operational backups.

    Workflow Tasks

    CNM

    PO.DAAC provides two workflow tasks to be used with the Cloud Notification Mechanism (CNM) Schema: CNM to Granule and CNM Response.

    See the CNM workflow data cookbook for an example of how these can be used in a Cumulus ingest workflow.

    DMR++ Generation

    GHRC has provided a DMR++ Generation workflow task. This task is meant to be used in conjunction with Cumulus' Hyrax Metadata Updates workflow task.

    Version: v10.1.0

    Frequently Asked Questions

    Below are some commonly asked questions that you may encounter that can assist you along the way when working with Cumulus.

    General

    How do I deploy a new instance in Cumulus?

    Answer: For steps on the Cumulus deployment process go to How to Deploy Cumulus.

    What prerequisites are needed to setup Cumulus?

    Answer: You will need access to the AWS console and an Earthdata login before you can deploy Cumulus.

    What is the preferred web browser for the Cumulus environment?

    Answer: Our preferred web browser is the latest version of Google Chrome.

    How do I quickly troubleshoot an issue in Cumulus?

    Answer: To troubleshoot and fix issues in Cumulus reference our recommended solutions in Troubleshooting Cumulus.

    Where can I get support help?

    Answer: The following options are available for assistance:

    • Cumulus: Outside NASA users should file a GitHub issue and inside NASA users should file a JIRA issue.
    • AWS: You can create a case in the AWS Support Center, accessible via your AWS Console.

    Integrators & Developers

    What is a Cumulus integrator?

    Answer: Those who are working within Cumulus and AWS for deployments and to manage workflows. They may perform the following functions:

    • Configure and deploy Cumulus to the AWS environment
    • Configure Cumulus workflows
    • Write custom workflow tasks

    What are the steps if I run into an issue during deployment?

    Answer: If you encounter an issue with your deployment go to the Troubleshooting Deployment guide.

    Is Cumulus customizable and flexible?

    Answer: Yes. Cumulus has a modular architecture that allows you to decide which components you want/need to deploy. These components are maintained as Terraform modules.

    What are Terraform modules?

    Answer: They are modules that are composed to create a Cumulus deployment, which gives integrators the flexibility to choose the components of Cumulus that they want/need. To view Cumulus maintained modules or steps on how to create a module go to Terraform modules.

    Where do I find Terraform module variables?

    Answer: Go here for a list of Cumulus maintained variables.

    What is a Cumulus workflow?

    Answer: A workflow is a provider-configured set of steps that describe the process to ingest data. Workflows are defined using AWS Step Functions. For more details, we suggest visiting here.

    How do I set up a Cumulus workflow?

    Answer: You will need to create a provider, have an associated collection (add a new one), and generate a new rule first. Then you can set up a Cumulus workflow by following these steps here.

    What are the common use cases that a Cumulus integrator encounters?

    Answer: The following are some examples of possible use cases you may see:


    Operators

    What is a Cumulus operator?

    Answer: Those who ingest, archive, and troubleshoot datasets (called collections in Cumulus). Your daily activities might include, but are not limited to, the following:

    • Ingesting datasets
    • Maintaining historical data ingest
    • Starting and stopping data handlers
    • Managing collections
    • Managing provider definitions
    • Creating, enabling, and disabling rules
    • Investigating errors for granules and deleting or re-ingesting granules
    • Investigating errors in executions and isolating failed workflow step(s)

    What are the common use cases that a Cumulus operator encounters?

    Answer: The following are some examples of possible use cases you may see:

    Can you re-run a workflow execution in AWS?

    Answer: Yes. For steps on how to re-run a workflow execution go to Re-running workflow executions in the Cumulus Operator Docs.

    Version: v10.1.0

    Ancillary Metadata Export

    This feature utilizes the type key on a files object in a Cumulus granule. It uses the key to provide a mechanism where granule discovery, processing and other tasks can set and use this value to facilitate metadata export to CMR.

    Tasks setting type

    Discover Granules

    Uses the Collection type key to set the value for files on discovered granules in its output.

    Parse PDR

    Uses a task-specific mapping to map PDR 'FILE_TYPE' to a CNM type to set type on granules from the PDR.

    CNMToCMALambdaFunction

    Natively supports types that are included in incoming messages to a CNM Workflow.

    Tasks using type

    Move Granules

    Uses the granule file type key to update UMM/ECHO 10 CMR files passed in as candidates to the task. This task adds the external facing URLs to the CMR metadata file based on the type. See the file tracking data cookbook for a detailed mapping. If a non-CNM type is specified, the task assumes it is a 'data' file.

    Version: v10.1.0

    Cumulus Backup and Restore

    DynamoDB

    Backup and Restore with AWS

    You can enable point-in-time recovery (PITR) as well as create an on-demand backup for your Amazon DynamoDB tables.

    PITR provides continuous backups of your DynamoDB table data. PITR can be enabled through your Terraform deployment, the AWS console, or the AWS API. When enabled, DynamoDB maintains continuous backups of your table up to the last 35 days. You can recover a copy of that table to a previous state at any point in time from the moment you enable PITR, up to a maximum of the 35 preceding days. PITR provides continuous backups until you explicitly disable it.

    On-demand backups allow you to create backups of DynamoDB table data and its settings. You can initiate an on-demand backup at any time with a single click from the AWS Management Console or a single API call. You can restore the backups to a new DynamoDB table in the same AWS Region at any time.

    PITR gives your DynamoDB tables continuous protection from accidental writes and deletes. With PITR, you do not have to worry about creating, maintaining, or scheduling backups. You enable PITR on your table and your backup is available for restore at any point in time from the moment you enable it, up to a maximum of the 35 preceding days. For example, imagine a test script writing accidentally to a production DynamoDB table. You could recover your table to any point in time within the last 35 days.

    On-demand backups help with long-term archival requirements for regulatory compliance. On-demand backups give you full-control of managing the lifecycle of your backups, from creating as many backups as you need to retaining these for as long as you need.

    Enabling PITR during deployment

    By default, the Cumulus data-persistence module enables PITR on the default tables listed in the module's variable defaults for enable_point_in_time_tables. At the time of writing, that list includes:

    • AsyncOperationsTable
    • CollectionsTable
    • ExecutionsTable
    • FilesTable
    • GranulesTable
    • PdrsTable
    • ProvidersTable
    • RulesTable

    If you wish to change this list, simply update your deployment's data_persistence module (here in the template-deploy repository) to pass the correct list of tables.

    Restoring with PITR

    Restoring a full deployment

    If your deployment has been deleted, all of your tables with PITR enabled will have had backups created automatically. You can locate these backups in the AWS console on the DynamoDB Backups page or through the CLI by running:

    aws dynamodb list-backups --backup-type SYSTEM

    You can restore your tables to your AWS account using the following command:

    aws dynamodb restore-table-from-backup --target-table-name <prefix>-CollectionsTable --backup-arn <backup-arn>

    Where prefix matches the prefix from your data-persistence deployment. backup-arn can be found in the AWS console or by listing the backups using the command above.
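
    For convenience, the ARN for a specific table's backup can also be pulled out of the CLI output directly, e.g. (a sketch; replace <prefix>, and note this simply prints the first SYSTEM backup listed for the table):

    aws dynamodb list-backups \
      --backup-type SYSTEM \
      --table-name <prefix>-CollectionsTable \
      --query 'BackupSummaries[0].BackupArn' \
      --output text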

    This will restore your tables to AWS. They will need to be linked to your Terraform deployment. After terraform init and before terraform apply, run the following command for each table:

    terraform import module.data_persistence.aws_dynamodb_table.collections_table <prefix>-CollectionsTable

    replacing collections_table with the table identifier in the DynamoDB Terraform table definitions.

    Terraform will now manage these tables as part of the Terraform state. Run terraform apply to generate the rest of the data-persistence deployment and then follow the instructions to deploy the cumulus deployment as normal.

    At this point the data will be in DynamoDB, but not in Elasticsearch, so nothing will be returned on the Operator dashboard or through Operator API calls. To get the data into Elasticsearch, run an index-from-database operation via the Operator API. The status of this operation can be viewed on the dashboard. When Elasticsearch is switched to the recovery index the data will be visible on the dashboard and available via the Operator API.
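
    As an illustration only, assuming your Cumulus API exposes this operation at an /elasticsearch/index-from-database endpoint (check your deployment's API documentation) and you have a valid access token, kicking off the operation might look like:

    curl -X POST https://<api-gateway-host>/<stage>/elasticsearch/index-from-database \
      -H "Authorization: Bearer <access-token>"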

    Restoring an individual table

    A table can be restored to a previous state using PITR. This is easily achievable via the AWS Console by visiting the Backups tab for the table.

    A table can only be recovered to a new table name. Following the restoration of the table, the new table must be imported into Terraform.

    First, remove the old table from the Terraform state:

    terraform state rm module.data_persistence.aws_dynamodb_table.collections_table

    replacing collections_table with the table identifier in the DynamoDB Terraform table definitions.

    Then import the new table into the Terraform state:

    terraform import module.data_persistence.aws_dynamodb_table.collections_table <new-table-name>

    replacing collections_table with the table identifier in the DynamoDB Terraform table definitions.

    Your data-persistence and cumulus deployments should be redeployed so that your instance of Cumulus uses this new table. After the deployment, your Elasticsearch instance will be out of sync with your new table if there is any change in data. To resync your Elasticsearch with your database run an index-from-database operation via the Operator API. The status of this operation can be viewed on the dashboard. When Elasticsearch is switched to the new index the DynamoDB tables and Elasticsearch instance will be in sync and the correct data will be reflected on the dashboard.

    Backup and Restore with cumulus-api CLI

    cumulus-api CLI also includes a backup and restore command. The CLI backup command downloads the content of any of your DynamoDB tables to .json files. You can also use these .json files to restore the records to another DynamoDB table.

    Backup with the CLI

    To backup a table with the CLI, install the @cumulus/api package using npm, making sure to install the same version as your Cumulus deployment:

    npm install -g @cumulus/api@version

    Then run:

    cumulus-api backup --table <table-name>

    The backup will be stored at backups/<table-name>.json.

    Restore with the CLI

    To restore data from a json file run the following command:

    cumulus-api restore backups/<table-name>.json --table <table-name>

    The restore can go to the in-use table and will update Elasticsearch. If an existing record exists in the table it will not be duplicated but will be updated with the record from the restore file.

    Data Backup and Restore

    Cumulus provides no core functionality to backup data stored in S3. Data disaster recovery is being developed in a separate effort here.

    Version: v10.1.0

    Cumulus Metadata in DynamoDB

    @cumulus/api uses a number of methods to preserve the metadata generated in a Cumulus instance.

    All configurations and system-generated metadata are stored in DynamoDB tables, except the logs. System logs are stored in the AWS CloudWatch service.

    Amazon DynamoDB stores three geographically distributed replicas of each table to enable high availability and data durability. Amazon DynamoDB runs exclusively on solid-state drives (SSDs). SSDs help AWS achieve the design goals of predictable low-latency response times for storing and accessing data at any scale.

    DynamoDB Auto Scaling

    Cumulus deployed tables from the data-persistence module are set to on-demand mode.

    Version: v10.1.0

    Cumulus Dead Letter Archive

    This documentation explains the Cumulus dead letter archive and associated functionality.

    DB Records DLQ Archive

    The Cumulus system contains a number of dead letter queues. Perhaps the most important system lambda function supported by a DLQ is the sfEventSqsToDbRecords lambda function which parses Cumulus messages from workflow executions to generate and write database records to the Cumulus database.

    As of Cumulus v9+, the dead letter queue for this lambda (named sfEventSqsToDbRecordsDeadLetterQueue) has been updated with a consumer lambda that will automatically write any incoming records to the S3 system bucket, under the path <stackName>/dead-letter-archive/sqs/. This will allow integrators and operators engaged in debugging missing records to inspect any Cumulus messages which failed to process and did not result in the successful creation of database records.
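
    For example, the archived messages can be listed with the AWS CLI (a sketch; substitute your system bucket and stack name):

    aws s3 ls s3://<system-bucket>/<stackName>/dead-letter-archive/sqs/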

    Dead Letter Archive recovery

    In addition to the above, as of Cumulus v9+, the Cumulus API also contains a new endpoint at /deadLetterArchive/recoverCumulusMessages.

    Sending a POST request to this endpoint will trigger a Cumulus AsyncOperation that will attempt to reprocess (and if successful delete) all Cumulus messages in the dead letter archive, using the same underlying logic as the existing sfEventSqsToDbRecords.
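
    A minimal sketch of such a request, assuming a valid access token for your Cumulus API and placeholder values for the API host and stage, might look like:

    curl -X POST https://<api-gateway-host>/<stage>/deadLetterArchive/recoverCumulusMessages \
      -H "Authorization: Bearer <access-token>"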

    This endpoint may prove particularly useful when recovering from extended or unexpected database outage, where messages failed to process due to external outage and there is no essential malformation of each Cumulus message.

    Version: v10.1.0

    Dead Letter Queues

    startSF SQS queue

    The workflow-trigger for the startSF queue has a Redrive Policy set up that directs any failed attempts to pull from the workflow start queue to an SQS Dead Letter Queue.

    This queue can then be monitored for failures to initiate a workflow. Please note that workflow failures will not show up in this queue, only repeated failure to trigger a workflow.

    Named Lambda Dead Letter Queues

    Cumulus provides configured Dead Letter Queues (DLQ) for non-workflow Lambdas (such as ScheduleSF) to capture Lambda failures for further processing.

    These DLQs are set up with the following configuration:

    receive_wait_time_seconds  = 20
    message_retention_seconds  = 1209600
    visibility_timeout_seconds = 60

    Default Lambda Configuration

    The following built-in Cumulus Lambdas are set up with DLQs to allow handling of process failures:

    • dbIndexer (Updates Elasticsearch based on DynamoDB events)
    • JobsLambda (writes logs outputs to Elasticsearch)
    • ScheduleSF (the SF Scheduler Lambda that places messages on the queue that is used to start workflows, see Workflow Triggers)
    • publishReports (Lambda that publishes messages to the SNS topics for execution, granule and PDR reporting)
    • reportGranules, reportExecutions, reportPdrs (Lambdas responsible for updating records based on messages in the queues published by publishReports)

    Troubleshooting/Utilizing messages in a Dead Letter Queue

    Ideally, an automated process should be configured to poll and process messages off of a dead letter queue.

    For aid in manually troubleshooting, you can utilize the SQS Management console to view/process messages available in the queues set up for a particular stack. The dead letter queues will have a Message Body containing the Lambda payload, as well as Message Attributes that reference both the error returned and a RequestID which can be cross-referenced to the associated Lambda's CloudWatch logs for more information:
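
    You can also pull a message, including its attributes, off a dead letter queue with the AWS CLI for inspection (a sketch; substitute your queue URL):

    aws sqs receive-message \
      --queue-url https://sqs.<region>.amazonaws.com/<account-id>/<prefix>-<queue-name> \
      --attribute-names All \
      --message-attribute-names All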

    Screenshot of the AWS SQS console showing how to view SQS message attributes

    Version: v10.1.0

    Cumulus Distribution Metrics

    It is possible to configure Cumulus and the Cumulus Dashboard to display information about the successes and failures of requests for data. This requires the Cumulus instance to deliver Cloudwatch Logs and S3 Server Access logs to an ELK stack.

    ESDIS Metrics in NGAP

    Work with the ESDIS metrics team to set up permissions and access to forward Cloudwatch Logs to a shared AWS:Logs:Destination as well as transferring your S3 Server Access logs to a metrics team bucket.

    The metrics team has taken care of setting up logstash to ingest the files that get delivered to their bucket into their Elasticsearch instance.

    Once Cumulus has been configured to deliver Cloudwatch logs to the ESDIS Metrics team, you can use the Elasticsearch indexes to create the necessary target patterns on the dashboard. These are often <daac>-cloudwatch-cumulus-<env>-* and <daac>-distribution-<env>-*, but they will depend on your specific Elasticsearch setup.

    Cumulus / ESDIS Metrics distribution system

    Architecture diagram showing how logs are replicated from a Cumulus instance to the ESDIS Metrics account and accessed by the Cumulus dashboard

    Version: v10.1.0

    Execution Payload Retention

    In addition to CloudWatch logs and AWS StepFunction API records, Cumulus automatically stores the initial and 'final' (the last update to the execution record) payload values as part of the Execution record in DynamoDB and Elasticsearch.

    This allows access via the API (or optionally direct DB/Elasticsearch querying) for debugging/reporting purposes. The data is stored in the "originalPayload" and "finalPayload" fields.

    Payload record cleanup

    To reduce storage requirements, a CloudWatch rule ({stack-name}-dailyExecutionPayloadCleanupRule) triggering a daily run of the provided cleanExecutions lambda has been added. This lambda will remove all 'completed' and 'non-completed' payload records in the database that are older than the specified configuration.

    Configuration

    The following configuration flags have been made available in the cumulus module. They may be overridden in your deployment's instance of the cumulus module by adding the following configuration options:

    daily_execution_payload_cleanup_schedule_expression (string)

    This configuration option sets the execution times for this Lambda to run, using a Cloudwatch cron expression.

    Default value is "cron(0 4 * * ? *)".

    complete_execution_payload_timeout_disable (bool)

    This configuration option, when set to true, will disable all cleanup of completed execution payloads.

    Default value is false.

    complete_execution_payload_timeout (number)

    This flag defines the cleanup threshold for executions with a 'completed' status in days. Records with updatedAt values older than this with payload information will have that information removed.

    Default value is 10.

    non_complete_execution_payload_timeout_disable (bool)

    This configuration option, when set to true, will disable all cleanup of "non-complete" (any status other than completed) execution payloads.

    Default value is false.

    non_complete_execution_payload_timeout (number)

    This flag defines the cleanup threshold for executions with a status other than 'complete' in days. Records with updateTime values older than this with payload information will have that information removed.

    Default value is 30 days.

    • complete_execution_payload_disable/non_complete_execution_payload_disable

    These flags (true/false) determine if the cleanup script's logic for 'complete' and 'non-complete' executions will run. Default value is false for both.

    Version: v10.1.0

    Writing logs for ESDIS Metrics

    Note: This feature is only available for Cumulus deployments in NGAP environments.

    Prerequisite: You must configure your Cumulus deployment to deliver your logs to the correct shared logs destination for ESDIS metrics.

    Log messages delivered to the ESDIS metrics logs destination conforming to an expected format will be automatically ingested and parsed to enable helpful searching/filtering of your logs via the ESDIS metrics Kibana dashboard.

    Expected log format

    The ESDIS metrics pipeline expects a log message to be a JSON string representation of an object (dict in Python or map in Java). An example log message might look like:

    {
    "level": "info",
    "executions": "arn:aws:states:us-east-1:000000000000:execution:MySfn:abcd1234",
    "granules": "[\"granule-1\",\"granule-2\"]",
    "message": "hello world",
    "sender": "greetingFunction",
    "stackName": "myCumulus",
    "timestamp": "2018-10-19T19:12:47.501Z"
    }

    A log message can contain the following properties:

    • executions: The AWS Step Function execution name in which this task is executing, if any
    • granules: A JSON string of the array of granule IDs being processed by this code, if any
    • level: A string identifier for the type of message being logged. Possible values:
      • debug
      • error
      • fatal
      • info
      • warn
      • trace
    • message: String containing your actual log message
    • parentArn: The parent AWS Step Function execution ARN that triggered the current execution, if any
    • sender: The name of the resource generating the log message (e.g. a library name, a Lambda function name, an ECS activity name)
    • stackName: The unique prefix for your Cumulus deployment
    • timestamp: An ISO-8601 formatted timestamp
    • version: The version of the resource generating the log message, if any

    None of these properties are explicitly required for ESDIS metrics to parse your log correctly. However, a log without a message has no informational content. And having level, sender, and timestamp properties is very useful for filtering your logs. Including a stackName in your logs is helpful as it allows you to distinguish between logs generated by different deployments.

    Using Cumulus Message Adapter libraries

    If you are writing a custom task that is integrated with the Cumulus Message Adapter, then some of the language-specific client libraries can be used to write logs compatible with ESDIS metrics.

    The usage of each library differs slightly, but in general a logger is initialized with a Cumulus workflow message to determine the contextual information for the task (e.g. granules, executions). Then, after the logger is initialized, writing logs only requires specifying a message, but the logged output will include the contextual information as well.

    Writing logs using custom code

    Any code that produces logs matching the expected log format can be processed by ESDIS metrics.

    Node.js

    Cumulus core provides a @cumulus/logger library that writes logs in the expected format for ESDIS metrics.

    Version: v10.1.0

    How to replay SQS messages archived in S3

    Context

    Cumulus archives all incoming SQS messages to S3 and removes messages once they have been processed. Unprocessed messages are archived at the path: ${stackName}/archived-incoming-messages/${queueName}/${messageId}
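
    For example, the archived messages for a given queue can be listed with the AWS CLI (a sketch; this assumes the archive lives in your deployment's system bucket):

    aws s3 ls s3://<system-bucket>/<stackName>/archived-incoming-messages/<queueName>/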

    Replay SQS messages endpoint

    The Cumulus API has added a new endpoint, /replays/sqs. This endpoint will allow you to start a replay operation to requeue all archived SQS messages by queueName and returns an AsyncOperationId for operation status tracking.

    Start replaying archived SQS messages

    In order to start a replay, you must perform a POST request to the replays/sqs endpoint.

    The required and optional fields that should be part of the body of this request are documented below.

    • queueName (string) -- Any valid SQS queue name (not ARN)
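
    For illustration, assuming a valid access token for your Cumulus API and placeholder values for the API host and stage, starting a replay and then checking its status might look like:

    # start a replay of archived messages for a queue
    curl -X POST https://<api-gateway-host>/<stage>/replays/sqs \
      -H "Authorization: Bearer <access-token>" \
      -H "Content-Type: application/json" \
      -d '{"queueName": "<queue-name>"}'

    # check the status of the returned asyncOperationId
    curl https://<api-gateway-host>/<stage>/asyncOperations/<asyncOperationId> \
      -H "Authorization: Bearer <access-token>"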

    Status tracking

    A successful response from the /replays/sqs endpoint will contain an asyncOperationId field. Use this ID with the /asyncOperations endpoint to track the status.

    Version: v10.1.0

    How to replay Kinesis messages after an outage

    After a period of outage, it may be necessary for a Cumulus operator to reprocess or 'replay' messages that arrived on an AWS Kinesis Data Stream but did not trigger an ingest. This document serves as an outline on how to start a replay operation, and how to perform status tracking. Cumulus supports replay of all Kinesis messages on a stream (subject to the normal RetentionPeriod constraints), or all messages within a given time slice delimited by start and end timestamps.

    As Kinesis has no comparable field to e.g. the SQS ReceiveCount on its records, Cumulus cannot tell which messages within a given time slice have never been processed, and cannot guarantee only missed messages will be processed. Users will have to rely on duplicate handling or some other method of identifying messages that should not be processed within the time slice.

    NOTE: This operation flow effectively changes only the trigger mechanism for Kinesis ingest notifications. The existence of valid Kinesis-type rules and all other normal requirements for the triggering of ingest via Kinesis still apply.

    Replays endpoint

    Cumulus has added a new endpoint to its API, /replays. This endpoint will allow you to start replay operations and returns an AsyncOperationId for operation status tracking.

    Start a replay

    In order to start a replay, you must perform a POST request to the replays endpoint.

    The required and optional fields that should be part of the body of this request are documented below.

    NOTE: As the endTimestamp relies on a comparison with the Kinesis server-side ApproximateArrivalTimestamp, and given that there is no documented level of accuracy for the approximation, it is recommended that the endTimestamp include some amount of buffer to allow for slight discrepancies. If tolerable, the same is recommended for the startTimestamp although it is used differently and less vulnerable to discrepancies since a server-side arrival timestamp should never be earlier than the client-side request timestamp.

    • type (string, required) -- Currently only accepts kinesis.
    • kinesisStream (string, required for type kinesis) -- Any valid kinesis stream name (not ARN)
    • kinesisStreamCreationTimestamp (optional) -- Any input valid for a JS Date constructor. For reasons to use this field see AWS documentation on StreamCreationTimestamp.
    • endTimestamp (optional) -- Any input valid for a JS Date constructor. Messages newer than this timestamp will be skipped.
    • startTimestamp (optional) -- Any input valid for a JS Date constructor. Messages will be fetched from the Kinesis stream starting at this timestamp. Ignored if it is further in the past than the stream's retention period.
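
    For illustration, a request replaying all messages within a time slice might look like the following (a sketch; the API host, token, stream name, and timestamps are placeholders):

    curl -X POST https://<api-gateway-host>/<stage>/replays \
      -H "Authorization: Bearer <access-token>" \
      -H "Content-Type: application/json" \
      -d '{
        "type": "kinesis",
        "kinesisStream": "<stream-name>",
        "startTimestamp": "2018-10-19T00:00:00.000Z",
        "endTimestamp": "2018-10-20T00:00:00.000Z"
      }'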

    Status tracking

    A successful response from the /replays endpoint will contain an asyncOperationId field. Use this ID with the /asyncOperations endpoint to track the status.

    Version: v10.1.0

    Reconciliation Reports

    The data buckets will include any buckets in your Cumulus buckets configuration that have type public, protected or private.

    Version: v10.1.0

    Getting Started

    Overview | Quick Tutorials | Helpful Tips

    Overview

    This serves as a guide for new Cumulus users to deploy and learn how to use Cumulus. Here you will learn what you need in order to complete any prerequisites, what Cumulus is and how it works, and how to successfully navigate and deploy a Cumulus environment.

    What is Cumulus

    Cumulus is an open source set of components for creating cloud-based data ingest, archive, distribution and management designed for NASA's future Earth Science data streams.

    Who uses Cumulus

    Data integrators/developers and operators across projects not limited to NASA use Cumulus for their daily work functions.

    Cumulus Roles

    Integrator/Developer

    Cumulus integrators/developers are those who work within Cumulus and AWS for deployments and to manage workflows.

    Operator

    Cumulus operators are those who work within Cumulus to ingest/archive data and manage collections.

    Role Guides

    As a developer, integrator, or operator, you will need to set up your environments to work in Cumulus. The following docs can get you started in your role specific activities.

    What is a Cumulus Data Type

    In Cumulus, we have the following types of data that you can create and manage:

    • Collections
    • Granules
    • Providers
    • Rules
    • Workflows
    • Executions
    • Reports

    For details on how to create or manage data types go to Data Management Types.


    Quick Tutorials

    Deployment & Configuration

    Cumulus is deployed to an AWS account, so you must have access to deploy resources to an AWS account to get started.

    1. Deploy Cumulus and Cumulus Dashboard to AWS

    Follow the deployment instructions to deploy Cumulus to your AWS account.

    2. Configure and Run the HelloWorld Workflow

    If you have deployed using the cumulus-template-deploy repository, you have a HelloWorld workflow deployed to your Cumulus backend.

    You can see your deployed workflows on the Workflows page of your Cumulus dashboard.

    Configure a collection and provider using the setup guidance on the Cumulus dashboard.

    Then create a rule to trigger your HelloWorld workflow. You can select a rule type of one time.

    Navigate to the Executions page of the dashboard to check the status of your workflow execution.

    3. Configure a Custom Workflow

    See Developing a custom workflow documentation for adding a new workflow to your deployment.

    There are plenty of workflow examples using Cumulus tasks here. The Data Cookbooks provide a more in-depth look at some of these more advanced workflows and their configurations.

    There is a list of Cumulus tasks already included in your deployment here.

    After configuring your workflow and redeploying, you can configure and run your workflow using the same steps as in step 2.


    Helpful Tips

    Here are some useful tips to keep in mind when deploying or working in Cumulus.

    Integrator/Developer

    • Versioning and Releases: This documentation gives information on our global versioning approach. We suggest upgrading to the supported version for Cumulus, Cumulus dashboard, and Thin Egress App (TEA).
    • Cumulus Developer Documentation: We suggest that you read through and reference this resource for development best practices in Cumulus.
    • Cumulus Deployment: We will guide you on how to manually deploy a new instance of Cumulus. In this reference, you will learn how to install Terraform, create an AWS S3 bucket, configure a compatible database, and create a Lambda layer.
• Terraform Best Practices: This will help guide you through your Terraform configuration and Cumulus deployment. For an introduction to Terraform, go here.
    • Integrator Common Use Cases: Scenarios to help integrators along in the Cumulus environment.

    Operator

    Troubleshooting

    Troubleshooting: Some suggestions to help you troubleshoot and solve issues you may encounter.

    Resources

    - + \ No newline at end of file diff --git a/docs/v10.1.0/glossary/index.html b/docs/v10.1.0/glossary/index.html index 6912882a317..beedd54cb63 100644 --- a/docs/v10.1.0/glossary/index.html +++ b/docs/v10.1.0/glossary/index.html @@ -5,13 +5,13 @@ Glossary | Cumulus Documentation - +
    Version: v10.1.0

    Glossary

    AWS Glossary

    For terms/items from Amazon/AWS not mentioned in this glossary, please refer to the AWS Glossary.

    Cumulus Glossary of Terms

    API Gateway

    Refers to AWS's API Gateway. Used by the Cumulus API.

    ARN

    Refers to an AWS "Amazon Resource Name".

    For more info, see the AWS documentation.

    AWS

    See: aws.amazon.com

    AWS Lambda/Lambda Function

    AWS's 'serverless' option. Allows the running of code without provisioning a service or managing server/ECS instances/etc.

    For more information, see the AWS Lambda documentation.

    AWS Access Keys

Access credentials that give you access to AWS to act as an IAM user programmatically or from the command line.

    For more information, see the AWS IAM Documentation.

    Bucket

    An Amazon S3 cloud storage resource.

    For more information, see the AWS Bucket Documentation.

    CloudFormation

    An AWS service that allows you to define and manage cloud resources as a preconfigured block.

    For more information, see the AWS CloudFormation User Guide.

    Cloudformation Template

A template that defines an AWS CloudFormation stack.

    For more information, see the AWS intro page.

    Cloudwatch

AWS service that allows logging and metrics collection on various cloud resources you have in AWS.

    For more information, see the AWS User Guide.

    Cloud Notification Mechanism (CNM)

    An interface mechanism to support cloud-based ingest messaging. For more information, see PO.DAAC's CNM Schema.

    Common Metadata Repository (CMR)

    "A high-performance, high-quality, continuously evolving metadata system that catalogs Earth Science data and associated service metadata records". For more information, see NASA's CMR page.

    Collection (Cumulus)

    Cumulus Collections are logical sets of data objects of the same data type and version.

    For more information, see cookbook reference page.

    Cumulus Message Adapter (CMA)

    A library designed to help task developers integrate step function tasks into a Cumulus workflow by adapting task input/output into the Cumulus Message format.

    For more information, see CMA workflow reference page.

    Distributed Active Archive Center (DAAC)

    Refers to a specific organization that's part of NASA's distributed system of archive centers. For more information see EOSDIS's DAAC page

    Dead Letter Queue (DLQ)

    This refers to Amazon SQS Dead-Letter Queues - these SQS queues are specifically configured to capture failed messages from other services/SQS queues/etc to allow for processing of failed messages.

    For more on DLQs, see the Amazon Documentation and the Cumulus DLQ feature page.

    Developer

Those who set up deployment and workflow management for Cumulus. Sometimes referred to as an integrator. See integrator.

    ECS

    Amazon's Elastic Container Service. Used in Cumulus by workflow steps that require more flexibility than Lambda can provide.

    For more information, see AWS's developer guide.

    ECS Activity

    An ECS instance run via a Step Function.

    Execution (Cumulus)

    A Cumulus execution refers to a single execution of a (Cumulus) Workflow.

    GIBS

    Global Imagery Browse Services

    Granule

    A granule is the smallest aggregation of data that can be independently managed (described, inventoried, and retrieved). Granules are always associated with a collection, which is a grouping of granules. A granule is a grouping of data files.

    IAM

    AWS Identity and Access Management.

    For more information, see AWS IAMs.

    Integrator/Developer

    Those who work within Cumulus and AWS for deployments and to manage workflows.

    Kinesis

    Amazon's platform for streaming data on AWS.

    See AWS Kinesis for more information.

    Lambda

    AWS's cloud service that lets you run code without provisioning or managing servers.

    For more information, see AWS's lambda page.

    Module (Terraform)

    Refers to a terraform module.

    Node

    See node.js.

    Npm

    Node package manager.

    For more information, see npmjs.com.

    Operator

    Those who work within Cumulus to ingest/archive data and manage collections.

    PDR

    "Polling Delivery Mechanism" used in "DAAC Ingest" workflows.

    For more information, see nasa.gov.

    Packages (NPM)

    NPM hosted node.js packages. Cumulus packages can be found on NPM's site here

    Provider

    Data source that generates and/or distributes data for Cumulus workflows to act upon.

    For more information, see the Cumulus documentation.

    Rule

    Rules are configurable scheduled events that trigger workflows based on various criteria.

    For more information, see the Cumulus Rules documentation.

    S3

    Amazon's Simple Storage Service provides data object storage in the cloud. Used in Cumulus to store configuration, data and more.

    For more information, see AWS's s3 page.

    SIPS

    Science Investigator-led Processing Systems. In the context of DAAC ingest, this refers to data producers/providers.

    For more information, see nasa.gov.

    SNS

    Amazon's Simple Notification Service provides a messaging service that allows publication of and subscription to events. Used in Cumulus to trigger workflow events, track event failures, and others.

    For more information, see AWS's SNS page.

    SQS

    Amazon's Simple Queue Service.

    For more information, see AWS's SQS page.

    Stack

    A collection of AWS resources you can manage as a single unit.

    In the context of Cumulus, this refers to a deployment of the cumulus and data-persistence modules that is managed by Terraform

    Step Function

    AWS's web service that allows you to compose complex workflows as a state machine comprised of tasks (Lambdas, activities hosted on EC2/ECS, some AWS service APIs, etc). See AWS's Step Function Documentation for more information. In the context of Cumulus these are the underlying AWS service used to create Workflows.

    Terraform

    Terraform is the tool that you will use for deployment and configuration of your Cumulus environment.

    Workflows

    Workflows are comprised of one or more AWS Lambda Functions and ECS Activities to discover, ingest, process, manage and archive data.

    - + \ No newline at end of file diff --git a/docs/v10.1.0/index.html b/docs/v10.1.0/index.html index 3f1230dacd2..642461eaaa9 100644 --- a/docs/v10.1.0/index.html +++ b/docs/v10.1.0/index.html @@ -5,13 +5,13 @@ Introduction | Cumulus Documentation - +
    Version: v10.1.0

    Introduction

The Cumulus project seeks to address the need for a “native” cloud-based data ingest, archive, distribution, and management system that can be used for all future Earth Observing System Data and Information System (EOSDIS) data streams. The term “native” implies that the system will leverage all components of a cloud infrastructure provided by the vendor for efficiency (in terms of both processing time and cost). Additionally, Cumulus will operate on future data streams involving satellite missions, aircraft missions, and field campaigns.

This documentation includes guidelines, examples, and source code docs. It is accessible at https://nasa.github.io/cumulus.


    Get To Know Cumulus

    • Getting Started - here - If you are new to Cumulus we suggest that you begin with this section to help you understand and work in the environment.
    • General Cumulus Documentation - here <- you're here

    Cumulus Reference Docs

    • Cumulus API Documentation - here
    • Cumulus Developer Documentation - here - READMEs throughout the main repository.
    • Data Cookbooks - here

    Auxiliary Guides

    • Integrator Guide - here
    • Operator Docs - here

    Contributing

    Please refer to: https://github.com/nasa/cumulus/blob/master/CONTRIBUTING.md for information. We thank you in advance.

    - + \ No newline at end of file diff --git a/docs/v10.1.0/integrator-guide/about-int-guide/index.html b/docs/v10.1.0/integrator-guide/about-int-guide/index.html index eae1a5090df..52ea26ada29 100644 --- a/docs/v10.1.0/integrator-guide/about-int-guide/index.html +++ b/docs/v10.1.0/integrator-guide/about-int-guide/index.html @@ -5,13 +5,13 @@ About Integrator Guide | Cumulus Documentation - +
    Version: v10.1.0

    About Integrator Guide

    Purpose

The Integrator Guide supplements the Cumulus documentation and Data Cookbooks. This content is for Cumulus integrators who are either new to the project or need a step-by-step resource to help them along.

    What Is A Cumulus Integrator

    Cumulus integrators are those who work within Cumulus and AWS for deployments and to manage workflows. They may perform the following functions:

    • Configure and deploy Cumulus to the AWS environment
    • Configure Cumulus workflows
    • Write custom workflow tasks
    - + \ No newline at end of file diff --git a/docs/v10.1.0/integrator-guide/int-common-use-cases/index.html b/docs/v10.1.0/integrator-guide/int-common-use-cases/index.html index 0263b7e83cb..353243ff6b6 100644 --- a/docs/v10.1.0/integrator-guide/int-common-use-cases/index.html +++ b/docs/v10.1.0/integrator-guide/int-common-use-cases/index.html @@ -5,13 +5,13 @@ Integrator Common Use Cases | Cumulus Documentation - + - + \ No newline at end of file diff --git a/docs/v10.1.0/integrator-guide/workflow-add-new-lambda/index.html b/docs/v10.1.0/integrator-guide/workflow-add-new-lambda/index.html index 928f703bf89..d8e376bbaee 100644 --- a/docs/v10.1.0/integrator-guide/workflow-add-new-lambda/index.html +++ b/docs/v10.1.0/integrator-guide/workflow-add-new-lambda/index.html @@ -5,13 +5,13 @@ Workflow - Add New Lambda | Cumulus Documentation - +
    Version: v10.1.0

    Workflow - Add New Lambda

    You can develop a workflow task in AWS Lambda or Elastic Container Service (ECS). AWS ECS requires Docker. For a list of tasks to use go to our Cumulus Tasks page.

The following steps will help you as you write a new Lambda that integrates with a Cumulus workflow. They will also help you understand the Cumulus Message Adapter (CMA) process.

    Steps

1. Define New Lambda in Terraform (see the minimal Terraform sketch after these steps)

    2. Add Task in JSON Object

      For details on how to set up a workflow via CMA go to the CMA Tasks: Message Flow.

      You will need to assign input and output for the new task and follow the CMA contract here. This contract defines how libraries should call the cumulus-message-adapter to integrate a task into an existing Cumulus Workflow.

    3. Verify New Task

      Check the updated workflow in AWS and in Cumulus.
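
As a companion to step 1, here is a minimal Terraform sketch of a task Lambda definition. The IAM role variable, CMA layer variable, zip path, and runtime shown here are illustrative assumptions; adjust them to match your deployment and the CMA documentation:

resource "aws_lambda_function" "my_new_task" {
  function_name = "${var.prefix}-MyNewTask"
  # Point filename at your packaged task source (this path is a placeholder)
  filename      = "${path.module}/../../tasks/my-new-task/dist/lambda.zip"
  handler       = "index.handler"
  runtime       = "nodejs14.x"
  # The role and layer references below are hypothetical variables
  role          = var.lambda_processing_role_arn
  layers        = [var.cumulus_message_adapter_layer_arn]

  environment {
    variables = {
      # Tells the CMA client libraries where the layer is mounted
      CUMULUS_MESSAGE_ADAPTER_DIR = "/opt/"
    }
  }
}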

    - + \ No newline at end of file diff --git a/docs/v10.1.0/integrator-guide/workflow-ts-failed-step/index.html b/docs/v10.1.0/integrator-guide/workflow-ts-failed-step/index.html index 40dbae9d92a..574fccfe4a0 100644 --- a/docs/v10.1.0/integrator-guide/workflow-ts-failed-step/index.html +++ b/docs/v10.1.0/integrator-guide/workflow-ts-failed-step/index.html @@ -5,13 +5,13 @@ Workflow - Troubleshoot Failed Step(s) | Cumulus Documentation - +
    Version: v10.1.0

    Workflow - Troubleshoot Failed Step(s)

    Steps

    1. Locate Step
    • Go to Cumulus dashboard
    • Find the granule
    • Go to Executions to determine the failed step
2. Investigate in Cloudwatch
    • Go to Cloudwatch
    • Locate lambda
    • Search Cloudwatch logs
3. Recreate Error

      In your sandbox environment, try to recreate the error.

4. Resolution

    - + \ No newline at end of file diff --git a/docs/v10.1.0/interfaces/index.html b/docs/v10.1.0/interfaces/index.html index 3064c55f2b2..367bc74bb78 100644 --- a/docs/v10.1.0/interfaces/index.html +++ b/docs/v10.1.0/interfaces/index.html @@ -5,13 +5,13 @@ Interfaces | Cumulus Documentation - +
    Version: v10.1.0

    Interfaces

    Cumulus has multiple interfaces that allow interaction with discrete components of the system, such as starting workflows via SNS/Kinesis/SQS, manually queueing workflow start messages, submitting SNS notifications for completed workflows, and the many operations allowed by the Cumulus API.

    The diagram below illustrates the workflow process in detail and the various interfaces that allow starting of workflows, reporting of workflow information, and database create operations that occur when a workflow reporting message is processed. For interfaces with expected input or output schemas, details are provided below.

Note: This diagram is current as of v1.18.0.

    Architecture diagram showing the interfaces for triggering and reporting of Cumulus workflow executions

    Workflow triggers and queuing

    Kinesis stream

    As a Kinesis stream is consumed by the messageConsumer Lambda to queue workflow executions, the incoming event is validated against this consumer schema by the ajv package.

    SQS queue for executions

    The messages put into the SQS queue for executions should conform to the Cumulus message format.

    Workflow executions

    See the documentation on Cumulus workflows.

    Workflow reporting

    SNS reporting topics

    For granule and PDR reporting, the topics will only receive data if the Cumulus workflow execution message meets the following criteria:

    • Granules - workflow message contains granule data in payload.granules
    • PDRs - workflow message contains PDR data in payload.pdr

    The messages published to the SNS reporting topics for executions and PDRs and the record property in the messages published to the granules SNS topic should conform to the model schema for each data type.
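
As an illustration of the criteria above, a heavily trimmed sketch of a workflow execution message that would result in granule reporting is shown below. Real Cumulus messages contain many more fields, and all values here are placeholders:

{
  "cumulus_meta": {
    "execution_name": "example-execution-name",
    "state_machine": "arn:aws:states:us-east-1:111111111111:stateMachine:ExampleWorkflow"
  },
  "meta": {
    "collection": { "name": "example-collection", "version": "001" }
  },
  "payload": {
    "granules": [
      { "granuleId": "example-granule-id", "files": [] }
    ]
  }
}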

    Further detail on workflow reporting and how to interact with these interfaces can be found in the workflow notifications data cookbook.

    Cumulus API

    See the Cumulus API documentation.

    - + \ No newline at end of file diff --git a/docs/v10.1.0/operator-docs/about-operator-docs/index.html b/docs/v10.1.0/operator-docs/about-operator-docs/index.html index a4c7f6c7002..0779bf25d95 100644 --- a/docs/v10.1.0/operator-docs/about-operator-docs/index.html +++ b/docs/v10.1.0/operator-docs/about-operator-docs/index.html @@ -5,13 +5,13 @@ About Operator Docs | Cumulus Documentation - +
    Version: v10.1.0

    About Operator Docs

    Purpose

    Operator Docs are an augmentation to Cumulus documentation and Data Cookbooks. These documents will walk step-by-step through common Cumulus activities (that aren't necessarily as use-case directed as what you'd see in Data Cookbooks).

    What Is A Cumulus Operator

    Cumulus operators are those who work within Cumulus to ingest/archive data and manage collections. They may perform the following functions via the operator dashboard or API:

    • Configure providers and collections
    • Configure rules and monitor workflow executions
    • Monitor granule ingestion
    • Monitor system metrics
    - + \ No newline at end of file diff --git a/docs/v10.1.0/operator-docs/bulk-operations/index.html b/docs/v10.1.0/operator-docs/bulk-operations/index.html index e23c8d49f9c..754409cd193 100644 --- a/docs/v10.1.0/operator-docs/bulk-operations/index.html +++ b/docs/v10.1.0/operator-docs/bulk-operations/index.html @@ -5,14 +5,14 @@ Bulk Operations | Cumulus Documentation - +
    Version: v10.1.0

    Bulk Operations

    Cumulus implements bulk operations through the use of AsyncOperations, which are long-running processes executed on an AWS ECS cluster.

    Submitting a bulk API request

    Bulk operations are generally submitted via the endpoint for the relevant data type, e.g. granules. For a list of supported API requests, refer to the Cumulus API documentation. Bulk operations are denoted with the keyword 'bulk'.

    Starting bulk operations from the Cumulus dashboard

    Using a Kibana query

    Note: You must have configured your dashboard build with a KIBANAROOT environment variable in order for the Kibana link to render in the bulk granules modal

    1. From the Granules dashboard page, click on the "Run Bulk Granules" button, then select what type of action you would like to perform

      • Note: the rest of the process is the same regardless of what type of bulk action you perform
    2. From the bulk granules modal, click the "Open Kibana" link:

      Screenshot of Cumulus dashboard showing modal window for triggering bulk granule operations

    3. Once you have accessed Kibana, navigate to the "Discover" page. If this is your first time using Kibana, you may see a message like this at the top of the page:

      In order to visualize and explore data in Kibana, you'll need to create an index pattern to retrieve data from Elasticsearch.

      In that case, see the docs for creating an index pattern for Kibana

Screenshot of Kibana user interface showing the "Discover" page for running queries

    4. Enter a query that returns the granule records that you want to use for bulk operations:

      Screenshot of Kibana user interface showing an example Kibana query and results

    5. Once the Kibana query is returning the results you want, click the "Inspect" link near the top of the page. A slide out tab with request details will appear on the right side of the page:

      Screenshot of Kibana user interface showing details of an example request

    6. In the slide out tab that appears on the right side of the page, click the "Request" link near the top and scroll down until you see the query property:

      Screenshot of Kibana user interface showing the Elasticsearch data request made for a given Kibana query

    7. Highlight and copy the query contents from Kibana. Go back to the Cumulus dashboard and paste the query contents from Kibana inside of the query property in the bulk granules request payload. It is expected that you should have a property of query nested inside of the existing query property:

      Screenshot of Cumulus dashboard showing modal window for triggering bulk granule operations with query information populated

8. Add values for the index and workflowName to the bulk granules request payload. The value for index will vary based on your Elasticsearch setup, but it is good to target an index specifically for granule data if possible (a sketch of the complete payload follows these steps):

      Screenshot of Cumulus dashboard showing modal window for triggering bulk granule operations with query, index, and workflow information populated

    9. Click the "Run Bulk Operations" button. You should see a confirmation message, including an ID for the async operation that was started to handle your bulk action. You can track the status of this async operation on the Operations dashboard page, which can be visited by clicking the "Go To Operations" button:

      Screenshot of Cumulus dashboard showing confirmation message with async operation ID for bulk granules request
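
Putting steps 7 and 8 together, the completed bulk granules request payload might look like the following sketch. The index name, workflow name, and inner Elasticsearch query are illustrative placeholders; yours will come from your deployment and the query you copied from Kibana:

{
  "index": "cumulus-granule-index",
  "workflowName": "ExampleWorkflow",
  "query": {
    "query": {
      "match": {
        "collectionId": "EXAMPLE___001"
      }
    }
  }
}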

    Creating an index pattern for Kibana

    1. Define the index pattern for the indices that your Kibana queries should use. A wildcard character, *, will match across multiple indices. Once you are satisfied with your index pattern, click the "Next step" button:

      Screenshot of Kibana user interface for defining an index pattern

    2. Choose whether to use a Time Filter for your data, which is not required. Then click the "Create index pattern" button:

      Screenshot of Kibana user interface for configuring the settings of an index pattern

    Status Tracking

    All bulk operations return an AsyncOperationId which can be submitted to the /asyncOperations endpoint.

    The /asyncOperations endpoint allows listing of AsyncOperation records as well as record retrieval for individual records, which will contain the status. The Cumulus API documentation shows sample requests for these actions.
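
For example, a single operation's record (including its status) could be retrieved with a request like the following sketch, assuming a Cumulus API deployed at example.com, a valid token, and a placeholder operation ID:

$ curl --request GET https://example.com/asyncOperations/0000aaaa-bb11-cc22-dd33-eeee4444ffff \
--header 'Authorization: Bearer ReplaceWithTheToken'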

    The Cumulus Dashboard also includes an Operations monitoring page, where operations and their status are visible:

    Screenshot of Cumulus Dashboard Operations Page showing 5 operations and their status, ID, description, type and creation timestamp

    - + \ No newline at end of file diff --git a/docs/v10.1.0/operator-docs/cmr-operations/index.html b/docs/v10.1.0/operator-docs/cmr-operations/index.html index 20056001f93..446547f1ee3 100644 --- a/docs/v10.1.0/operator-docs/cmr-operations/index.html +++ b/docs/v10.1.0/operator-docs/cmr-operations/index.html @@ -5,7 +5,7 @@ CMR Operations | Cumulus Documentation - + @@ -16,7 +16,7 @@ UpdateCmrAccessConstraints will update CMR metadata file contents on S3, and PostToCmr will push the updates to CMR. The rest of this section will assume you have created this workflow under the name UpdateCmrAccessConstraints.

Once created and deployed, the workflow is available in the Cumulus dashboard's Execute workflow selector. However, note that this request requires additional configuration: you must supply an access constraint integer value and an optional description to the UpdateCmrAccessConstraints workflow by clicking the Add Custom Workflow Meta option in the Execute popup, as shown below:

Screenshot showing granule execute popup with 'updateCmrAccessConstraints' selected and configuration values shown in a collapsible JSON field

    An example invocation of the API to perform this action is:

    $ curl --request PUT https://example.com/granules/MOD11A1.A2017137.h19v16.006.2017138085750 \
    --header 'Authorization: Bearer ReplaceWithTheToken' \
    --header 'Content-Type: application/json' \
    --data '{
    "action": "applyWorkflow",
    "workflow": "updateCmrAccessConstraints",
    "meta": {
"accessConstraints": {
"value": 5,
"description": "sample access constraint"
    }
    }
    }'

    Supported CMR metadata formats for the above operation are Echo10XML and UMMG-JSON, which will populate the RestrictionFlag and RestrictionComment fields in Echo10XML, or the AccessConstraints values in UMMG-JSON.

    Additional Operations

    At this time Cumulus does not, out of the box, support additional operations on CMR metadata. However, given the examples shown above, we recommend working with your integrators to develop additional workflows that perform any required operations.

    Bulk CMR operations

    In order to perform the above operations in bulk, Cumulus supports the use of ApplyWorkflow in an AsyncOperation. These are accessed via the Bulk Operation button on the dashboard, or the /granules/bulk endpoint on the Cumulus API.

More information on bulk operations is available in the bulk operations operator doc.

    - + \ No newline at end of file diff --git a/docs/v10.1.0/operator-docs/create-rule-in-cumulus/index.html b/docs/v10.1.0/operator-docs/create-rule-in-cumulus/index.html index 55cab686a99..48b69daf29c 100644 --- a/docs/v10.1.0/operator-docs/create-rule-in-cumulus/index.html +++ b/docs/v10.1.0/operator-docs/create-rule-in-cumulus/index.html @@ -5,13 +5,13 @@ Create Rule In Cumulus | Cumulus Documentation - +
    Version: v10.1.0

    Create Rule In Cumulus

    Once the above files are in place and the entries created in CMR and Cumulus, we are ready to begin ingesting data. Depending on the type of ingestion (FTP/Kinesis, etc) the values below will change, but for the most part they are all similar. Rules tell Cumulus how to associate providers and collections, and when/how to start processing a workflow.

    Steps

    1. Go To Rules Page
    • Go to the Cumulus dashboard, click on Rules in the navigation.
    • Click Add Rule.

    Screenshot of Rules page

2. Complete Form
    • Fill out the template form.

    Screenshot of a Rules template for adding a new rule

    For more details regarding the field definitions and required information go to Data Cookbooks.

    Note: If the state field is left blank, it defaults to false.

    Examples

    • A rule form with completed required fields:

    Screenshot of a completed rule form

    • A successfully added Rule:

    Screenshot of created rule
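
For reference, the rule captured by the form corresponds to a rule record like the following sketch of a Kinesis-type rule. All names and the stream ARN are illustrative placeholders; the field definitions are covered in the Data Cookbooks linked above:

{
  "name": "example_kinesis_ingest_rule",
  "workflow": "ExampleIngestWorkflow",
  "provider": "example-provider",
  "collection": {
    "name": "example-collection",
    "version": "001"
  },
  "rule": {
    "type": "kinesis",
    "value": "arn:aws:kinesis:us-east-1:111111111111:stream/example-ingest-stream"
  },
  "state": "ENABLED"
}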

    - + \ No newline at end of file diff --git a/docs/v10.1.0/operator-docs/discovery-filtering/index.html b/docs/v10.1.0/operator-docs/discovery-filtering/index.html index 5875d4cf959..b07bb769869 100644 --- a/docs/v10.1.0/operator-docs/discovery-filtering/index.html +++ b/docs/v10.1.0/operator-docs/discovery-filtering/index.html @@ -5,7 +5,7 @@ Discovery Filtering | Cumulus Documentation - + @@ -24,7 +24,7 @@ directly list the provider_path. If the path contains regular expression components, this may fail.

    It is recommended that operators diagnose any failures by checking error logs and ensuring that permissions on the remote file system allow reading of the default directory and any subdirectories that match the filter.

    Supported protocols

    Currently support for this feature is limited to the following protocols:

    • ftp
    • sftp
    - + \ No newline at end of file diff --git a/docs/v10.1.0/operator-docs/granule-workflows/index.html b/docs/v10.1.0/operator-docs/granule-workflows/index.html index 7c8d640bfd7..d6d5027ce7c 100644 --- a/docs/v10.1.0/operator-docs/granule-workflows/index.html +++ b/docs/v10.1.0/operator-docs/granule-workflows/index.html @@ -5,13 +5,13 @@ Granule Workflows | Cumulus Documentation - +
    Version: v10.1.0

    Granule Workflows

    Failed Granule

    Delete and Ingest

    1. Delete Granule

    Note: Granules published to CMR will need to be removed from CMR via the dashboard prior to deletion

2. Ingest Granule via Ingest Rule
• Re-triggering a one-time, Kinesis, SQS, or SNS rule, or running a scheduled rule, will re-discover and reingest the deleted granule.

    Reingest

    1. Select Failed Granule
    • In the Cumulus dashboard, go to the Collections page.
    • Use search field to find the granule.
2. Re-ingest Granule
    • Go to the Collections page.
    • Click on Reingest and a modal will pop up for your confirmation.

    Screenshot of the Reingest modal workflow

    Delete and Ingest

    1. Bulk Delete Granules
    • Go to the Granules page.
    • Use the Bulk Delete button to bulk delete selected granules or select via a Kibana query

    Note: You can optionally force deletion from CMR

2. Ingest Granules via Ingest Rule
• Re-triggering one-time, Kinesis, SQS, or SNS rules, or running scheduled rules, will re-discover and reingest the deleted granules.

    Multiple Failed Granules

    1. Select Failed Granules
    • In the Cumulus dashboard, go to the Collections page.
    • Click on Failed Granules.
    • Select multiple granules.

    Screenshot of selected multiple granules

2. Bulk Re-ingest Granules
    • Click on Reingest and a modal will pop up for your confirmation.

    Screenshot of Bulk Reingest modal workflow

    - + \ No newline at end of file diff --git a/docs/v10.1.0/operator-docs/kinesis-stream-for-ingest/index.html b/docs/v10.1.0/operator-docs/kinesis-stream-for-ingest/index.html index 10cec638fda..c63637ea383 100644 --- a/docs/v10.1.0/operator-docs/kinesis-stream-for-ingest/index.html +++ b/docs/v10.1.0/operator-docs/kinesis-stream-for-ingest/index.html @@ -5,13 +5,13 @@ Setup Kinesis Stream & CNM Message | Cumulus Documentation - +
    Version: v10.1.0

    Setup Kinesis Stream & CNM Message

Note: Keep in mind that you should only have to set this up once per ingest stream. Kinesis pricing is based on the shard count and not on the amount of Kinesis usage.

    1. Create a Kinesis Stream

      • In your AWS console, go to the Kinesis service and click Create Data Stream.
      • Assign a name to the stream.
      • Apply a shard value of 1.
      • Click on Create Kinesis Stream.
• A status page with stream details will display. Once the status is active, the stream is ready to use. Be sure to record the streamName and StreamARN for later use.

      Screenshot of AWS console page for creating a Kinesis stream

    2. Create a Rule

    3. Send a message

• Send a message that conforms to your schema, using Python or the command line (a command line sketch follows these steps).
      • The streamName and Collection must match the kinesisArn+collection defined in the rule that you have created in Step 2.
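
For example, a message could be sent from the command line with the AWS CLI as sketched below. The stream name must match the stream referenced by your rule's kinesisArn, and the message body is only illustrative; it must validate against the schema you have configured (for example, the CNM schema):

aws kinesis put-record \
--stream-name example-ingest-stream \
--partition-key example-partition-key \
--cli-binary-format raw-in-base64-out \
--data '{
"collection": "example-collection",
"identifier": "example-identifier",
"provider": "example-provider",
"version": "001"
}'

Note: the --cli-binary-format option is only needed with AWS CLI v2, which otherwise expects the --data value to be base64 encoded; AWS CLI v1 accepts the raw JSON directly.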
    - + \ No newline at end of file diff --git a/docs/v10.1.0/operator-docs/locating-access-logs/index.html b/docs/v10.1.0/operator-docs/locating-access-logs/index.html index b03dc9cdede..6556477dfdc 100644 --- a/docs/v10.1.0/operator-docs/locating-access-logs/index.html +++ b/docs/v10.1.0/operator-docs/locating-access-logs/index.html @@ -5,13 +5,13 @@ Locating S3 Access Logs | Cumulus Documentation - +
    Version: v10.1.0

    Locating S3 Access Logs

    When enabling S3 Access Logs for EMS Reporting you configured a TargetBucket and TargetPrefix. Inside the TargetBucket at the TargetPrefix is where you will find the raw S3 access logs.

    In a standard deployment, this will be your stack's <internal bucket name> and a key prefix of <stack>/ems-distribution/s3-server-access-logs/
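
For example, you could list the raw access logs with the AWS CLI, substituting your internal bucket name and stack prefix:

aws s3 ls s3://<internal bucket name>/<stack>/ems-distribution/s3-server-access-logs/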

    - + \ No newline at end of file diff --git a/docs/v10.1.0/operator-docs/naming-executions/index.html b/docs/v10.1.0/operator-docs/naming-executions/index.html index 706162999d4..7a3e1456fb3 100644 --- a/docs/v10.1.0/operator-docs/naming-executions/index.html +++ b/docs/v10.1.0/operator-docs/naming-executions/index.html @@ -5,7 +5,7 @@ Naming Executions | Cumulus Documentation - + @@ -21,7 +21,7 @@ QueuePdrs step.

    In the following excerpt, the QueueGranules config.executionNamePrefix property is set using the value configured in the workflow's meta.executionNamePrefix.

    Please note: This meta.executionNamePrefix property should not be confused with the optional rule executionNamePrefix property from the previous section. Setting executionNamePrefix as a root property of the rule will set a prefix for the names of any workflows triggered by the rule. Setting meta.executionNamePrefix on the rule will set meta.executionNamePrefix in the workflow messages generated for this rule, allowing workflow steps like QueueGranules to read from the message meta.executionNamePrefix for their config. Then, workflows scheduled by QueueGranules would use the configured execution name prefix.

    Setting executionNamePrefix config for QueueGranules using rule.meta

    If you wanted to use a prefix of "my-prefix", you would create a rule with a meta property similar to the following Rule snippet:

    {
    ...other rule keys here...
    "meta":
    {
    "executionNamePrefix": "my-prefix"
    }
    }

    The value of meta.executionNamePrefix from the rule will be set as meta.executionNamePrefix in the workflow message.

    Then, the workflow could contain a "QueueGranules" step with the following state, which uses meta.executionNamePrefix from the message as the value for the executionNamePrefix config to the "QueueGranules" step:

    {
    "QueueGranules": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "queueUrl": "${start_sf_queue_url}",
    "provider": "{$.meta.provider}",
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "granuleIngestWorkflow": "${ingest_granule_workflow_name}",
    "executionNamePrefix": "{$.meta.executionNamePrefix}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${queue_granules_task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "End": true
    },
    }
    - + \ No newline at end of file diff --git a/docs/v10.1.0/operator-docs/ops-common-use-cases/index.html b/docs/v10.1.0/operator-docs/ops-common-use-cases/index.html index 34de4246664..baf0e4e5437 100644 --- a/docs/v10.1.0/operator-docs/ops-common-use-cases/index.html +++ b/docs/v10.1.0/operator-docs/ops-common-use-cases/index.html @@ -5,13 +5,13 @@ Operator Common Use Cases | Cumulus Documentation - + - + \ No newline at end of file diff --git a/docs/v10.1.0/operator-docs/trigger-workflow/index.html b/docs/v10.1.0/operator-docs/trigger-workflow/index.html index 952826af2d1..74853b6d24e 100644 --- a/docs/v10.1.0/operator-docs/trigger-workflow/index.html +++ b/docs/v10.1.0/operator-docs/trigger-workflow/index.html @@ -5,13 +5,13 @@ Trigger a Workflow Execution | Cumulus Documentation - +
    Version: v10.1.0

    Trigger a Workflow Execution

    To trigger a workflow, you need to create a rule. To trigger an ingest workflow, one that requires discovering and ingesting data, you will also need to configure the collection and provider and associate those to a rule.

    Trigger a HelloWorld Workflow

    To trigger a HelloWorld workflow that does not need to discover or archive data, you just need to create a rule.

    You can leave the provider and collection blank and do not need any additional metadata. If you create a onetime rule, the workflow execution will start momentarily and you can view its status on the Executions page.

    Trigger an Ingest Workflow

    To ingest data, you will need a provider and collection configured to tell your workflow where to discover data and where to archive the data respectively.

    Follow the instructions to create a provider and create a collection and configure their fields for your data ingest.

In the rule's additional metadata you can specify a provider_path, the location on the provider from which to get the data.

    Example: Ingest data from S3

    Setup

    Assume there are 2 files to be ingested in an S3 bucket called discovery-bucket, located in the test-data folder:

    • GRANULE.A2017025.jpg
    • GRANULE.A2017025.hdf

    Archive buckets should already be created and mapped to public / private / protected in the Cumulus deployment.

    For example:

    buckets = {
    private = {
    name = "discovery-bucket"
    type = "private"
    },
    protected = {
    name = "archive-protected"
    type = "protected"
    }
    public = {
    name = "archive-public"
    type = "public"
    }
    }

    Create a provider

    Create a new provider. Set protocol to S3 and Host to discovery-bucket.

    Screenshot of adding a sample S3 provider
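
Expressed as a provider record, this corresponds to something like the following sketch. The id is an illustrative choice; the protocol and host match the values described above:

{
  "id": "s3-provider",
  "protocol": "s3",
  "host": "discovery-bucket"
}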

    Create a collection

    Create a new collection. Configure the collection to extract the granule id from the filenames and configure where to store the granule files.

The configuration below will store hdf files in the protected bucket and jpg files in the public bucket. The bucket types correspond to the types defined in the buckets configuration above.

    {
    "name": "test-collection",
    "version": "001",
    "granuleId": "^GRANULE\\.A[\\d]{7}$",
    "granuleIdExtraction": "(GRANULE\\..*)(\\.hdf|\\.jpg)",
    "reportToEms": false,
    "sampleFileName": "GRANULE.A2017025.hdf",
    "files": [
    {
    "bucket": "protected",
    "regex": "^GRANULE\\.A[\\d]{7}\\.hdf$",
    "sampleFileName": "GRANULE.A2017025.hdf"
    },
    {
    "bucket": "public",
    "regex": "^GRANULE\\.A[\\d]{7}\\.jpg$",
    "sampleFileName": "GRANULE.A2017025.jpg"
    }
    ]
    }

    Create a rule

    Create a rule to trigger the workflow to discover your granule data and ingest your granule.

    Select the previously created provider and collection. See the Cumulus Discover Granules workflow for a workflow example of using Cumulus tasks to discover and queue data for ingest.

    In the rule meta, set the provider_path to test-data, so the test-data folder will be used to discover new granules.

    Screenshot of adding a Discover Granules rule
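
The resulting rule record might look like the following sketch. The rule and workflow names and the provider id are illustrative; the collection and meta.provider_path values match the setup described above:

{
  "name": "example_s3_discover_rule",
  "workflow": "DiscoverGranulesWorkflow",
  "provider": "s3-provider",
  "collection": {
    "name": "test-collection",
    "version": "001"
  },
  "meta": {
    "provider_path": "test-data"
  },
  "rule": {
    "type": "onetime"
  },
  "state": "ENABLED"
}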

    A onetime rule will run your workflow on-demand and you can view it on the dashboard Executions page. The Cumulus Discover Granules workflow will trigger an ingest workflow and your ingested granules will be visible on the dashboard Granules page.

    - + \ No newline at end of file diff --git a/docs/v10.1.0/tasks/index.html b/docs/v10.1.0/tasks/index.html index b4834781905..dfffdbc2e45 100644 --- a/docs/v10.1.0/tasks/index.html +++ b/docs/v10.1.0/tasks/index.html @@ -5,13 +5,13 @@ Cumulus Tasks | Cumulus Documentation - +
    Version: v10.1.0

    Cumulus Tasks

    A list of reusable Cumulus tasks. Add your own.

    Tasks

    @cumulus/add-missing-file-checksums

    Add checksums to files in S3 which don't have one


    @cumulus/discover-granules

    Discover Granules in FTP/HTTP/HTTPS/SFTP/S3 endpoints


    @cumulus/discover-pdrs

    Discover PDRs in FTP and HTTP endpoints


    @cumulus/files-to-granules

    Converts array-of-files input into a granules object by extracting granuleId from filename


    @cumulus/hello-world

    Example task


    @cumulus/hyrax-metadata-updates

    Update granule metadata with hooks to OPeNDAP URL


    @cumulus/lzards-backup

    Run LZARDS backup


    @cumulus/move-granules

    Move granule files from staging to final location


    @cumulus/parse-pdr

    Download and Parse a given PDR


    @cumulus/pdr-status-check

    Checks execution status of granules in a PDR


    @cumulus/post-to-cmr

    Post a given granule to CMR


    @cumulus/queue-granules

    Add discovered granules to the queue


    @cumulus/queue-pdrs

    Add discovered PDRs to a queue


    @cumulus/queue-workflow

    Add workflow to the queue


    @cumulus/sf-sqs-report

    Sends an incoming Cumulus message to SQS


    @cumulus/sync-granule

    Download a given granule


    @cumulus/test-processing

    Fake processing task used for integration tests


    @cumulus/update-cmr-access-constraints

    Updates CMR metadata to set access constraints


@cumulus/update-granules-cmr-metadata-file-links

Update CMR metadata files with correct online access urls and etags and transfer etag info to granules' CMR files

    - + \ No newline at end of file diff --git a/docs/v10.1.0/team/index.html b/docs/v10.1.0/team/index.html index 8a3d38c946e..006bfe11903 100644 --- a/docs/v10.1.0/team/index.html +++ b/docs/v10.1.0/team/index.html @@ -5,13 +5,13 @@ Cumulus Team | Cumulus Documentation - + - + \ No newline at end of file diff --git a/docs/v10.1.0/troubleshooting/index.html b/docs/v10.1.0/troubleshooting/index.html index 1967deb7e5e..9a27a9e4fa4 100644 --- a/docs/v10.1.0/troubleshooting/index.html +++ b/docs/v10.1.0/troubleshooting/index.html @@ -5,14 +5,14 @@ How to Troubleshoot and Fix Issues | Cumulus Documentation - +
    Version: v10.1.0

    How to Troubleshoot and Fix Issues

    While Cumulus is a complex system, there is a focus on maintaining the integrity and availability of the system and data. Should you encounter errors or issues while using this system, this section will help troubleshoot and solve those issues.

    Backup and Restore

    Cumulus has backup and restore functionality built-in to protect Cumulus data and allow recovery of a Cumulus stack. This is currently limited to Cumulus data and not full S3 archive data. Backup and restore is not enabled by default and must be enabled and configured to take advantage of this feature.

    For more information, read the Backup and Restore documentation.

    Elasticsearch reindexing

    If you run into issues with your Elasticsearch index, a reindex operation is available via the Cumulus API. See the Reindexing Guide.

    Information on how to reindex Elasticsearch is in the Cumulus API documentation.

    Troubleshooting Workflows

    Workflows are state machines comprised of tasks and services and each component logs to CloudWatch. The CloudWatch logs for all steps in the execution are displayed in the Cumulus dashboard or you can find them by going to CloudWatch and navigating to the logs for that particular task.

    Workflow Errors

    Visual representations of executed workflows can be found in the Cumulus dashboard or the AWS Step Functions console for that particular execution.

    If a workflow errors, the error will be handled according to the error handling configuration. The task that fails will have the exception field populated in the output, giving information about the error. Further information can be found in the CloudWatch logs for the task.

    Graph of AWS Step Function execution showing a failing workflow

    Workflow Did Not Start

    Generally, first check your rule configuration. If that is satisfactory, the answer will likely be in the CloudWatch logs for the schedule SF or SF starter lambda functions. See the workflow triggers page for more information on how workflows start.

    For Kinesis and SNS rules specifically, if an error occurs during the message consumer process, the fallback consumer lambda will be called and if the message continues to error, a message will be placed on the dead letter queue. Check the dead letter queue for a failure message. Errors can be traced back to the CloudWatch logs for the message consumer and the fallback consumer. Additionally, check that the name and version match those configured in your rule, as rules are filtered by the notification's collection name and version before scheduling executions.

    More information on kinesis error handling is here.

    Operator API Errors

    All operator API calls are funneled through the ApiEndpoints lambda. Each API call is logged to the ApiEndpoints CloudWatch log for your deployment.

    Lambda Errors

    KMS Exception: AccessDeniedException

    KMS Exception: AccessDeniedExceptionKMS Message: The ciphertext refers to a customer master key that does not exist, does not exist in this region, or you are not allowed to access.

The above error was thrown by a Cumulus Lambda function invocation. The KMS key is the encryption key used to encrypt Lambda environment variables. The root cause of this error is unknown, but it is speculated to be caused by deleting and recreating, with the same name, the IAM role the Lambda uses.

    This error can be resolved by switching the lambda's execution role to a different one and then back through the Lambda management console. Unfortunately, this approach doesn't scale well.

    The other resolution (that scales but takes some time) that was found is as follows:

    1. Comment out all lambda definitions (and dependent resources) in your Terraform configuration.
    2. terraform apply to delete the lambdas.
    3. Un-comment the definitions.
    4. terraform apply to recreate the lambdas.

If this problem occurs with Core lambdas and you are using the terraform-aws-cumulus.zip file source distributed in our release, we recommend using the non-scaling approach, as the number of lambdas we distribute is in the low teens and they are likely to be easier and faster to reconfigure one-by-one compared to editing our configs.

    Error: Unable to import module 'index': Error

    This error is shown in the CloudWatch logs for a Lambda function.

    One possible cause is that the Lambda definition in the .tf file defining the lambda is not pointing to the correct packaged lambda source file. In order to resolve this issue, update the lambda definition to point directly to the packaged (e.g. .zip) lambda source file.

    resource "aws_lambda_function" "discover_granules_task" {
    function_name = "${var.prefix}-DiscoverGranules"
    filename = "${path.module}/../../tasks/discover-granules/dist/lambda.zip"
    handler = "index.handler"
    }

    If you are seeing this error when using the Lambda as a step in a Cumulus workflow, then inspect the output for this Lambda step in the AWS Step Function console. If you see the error Cannot find module 'node_modules/@cumulus/cumulus-message-adapter-js', then you need to ensure the lambda's packaged dependencies include cumulus-message-adapter-js.

    - + \ No newline at end of file diff --git a/docs/v10.1.0/troubleshooting/reindex-elasticsearch/index.html b/docs/v10.1.0/troubleshooting/reindex-elasticsearch/index.html index 9b02dd01747..93174f9fcd9 100644 --- a/docs/v10.1.0/troubleshooting/reindex-elasticsearch/index.html +++ b/docs/v10.1.0/troubleshooting/reindex-elasticsearch/index.html @@ -5,7 +5,7 @@ Reindexing Elasticsearch Guide | Cumulus Documentation - + @@ -14,7 +14,7 @@ current index, or the mappings for an index have been updated (they do not update automatically). Any reindexing that will be required when upgrading Cumulus will be in the Migration Steps section of the changelog.

    Switch to a new index and Reindex

    There are two operations needed: reindex and change-index to switch over to the new index. A Change Index/Reindex can be done in either order, but both have their trade-offs.

If you decide to point Cumulus to a new (empty) index first (with a change index operation), and then reindex the data to the new index, data ingested while reindexing will automatically be sent to the new index. As reindexing operations can take a while, not all of the data will show up on the Cumulus Dashboard right away. The advantage is that you do not have to turn off any ingest operations. This way is recommended.

    If you decide to Reindex data to a new index first, and then point Cumulus to that new index, it is not guaranteed that data that is sent to the old index while reindexing will show up in the new index. If you prefer this way, it is recommended to turn off any ingest operations. This order will keep your dashboard data from seeing any interruption.

    Change Index

    This will point Cumulus to the index in Elasticsearch that will be used when retrieving data. Performing a change index operation to an index that does not exist yet will create the index for you. The change index operation can be found here.
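
For example, assuming a Cumulus API at example.com and the index names used later in this guide, a change index request might be sketched as follows. The endpoint path and parameter names here are assumptions; confirm them against the linked Cumulus API documentation:

$ curl --request POST https://example.com/elasticsearch/change-index \
--header 'Authorization: Bearer ReplaceWithTheToken' \
--header 'Content-Type: application/json' \
--data '{
"currentIndex": "cumulus-2020-11-3",
"newIndex": "cumulus-2021-3-4"
}'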

    Reindex from the old index to the new index

The reindex operation will take the data from one index and copy it into another index. The reindex operation can be found here.

    Reindex status

    Reindexing is a long-running operation. The reindex-status endpoint can be used to monitor the progress of the operation.

    Index from database

    If you want to just grab the data straight from the database you can perform an Index from Database Operation. After the data is indexed from the database, a Change Index operation will need to be performed to ensure Cumulus is pointing to the right index. It is strongly recommended to turn off workflow rules when performing this operation so any data ingested to the database is not lost.

    Validate reindex

    To validate the reindex, use the reindex-status endpoint. The doc count can be used to verify that the reindex was successful. In the below example the reindex from cumulus-2020-11-3 to cumulus-2021-3-4 was not fully successful as they show different doc counts.

    "indices": {
    "cumulus-2020-11-3": {
    "primaries": {
    "docs": {
    "count": 21096512,
    "deleted": 176895
    }
    },
    "total": {
    "docs": {
    "count": 21096512,
    "deleted": 176895
    }
    }
    },
    "cumulus-2021-3-4": {
    "primaries": {
    "docs": {
    "count": 715949,
    "deleted": 140191
    }
    },
    "total": {
    "docs": {
    "count": 715949,
    "deleted": 140191
    }
    }
    }
    }

    To further drill down into what is missing, log in to the Kibana instance (found in the Elasticsearch section of the AWS console) and run the following command replacing <index> with your index name.

    GET <index>/_search
    {
    "aggs": {
    "count_by_type": {
    "terms": {
    "field": "_type"
    }
    }
    },
    "size": 0
    }

    which will produce a result like

    "aggregations": {
    "count_by_type": {
    "doc_count_error_upper_bound": 0,
    "sum_other_doc_count": 0,
    "buckets": [
    {
    "key": "logs",
    "doc_count": 483955
    },
    {
    "key": "execution",
    "doc_count": 4966
    },
    {
    "key": "deletedgranule",
    "doc_count": 4715
    },
    {
    "key": "pdr",
    "doc_count": 1822
    },
    {
    "key": "granule",
    "doc_count": 740
    },
    {
    "key": "asyncOperation",
    "doc_count": 616
    },
    {
    "key": "provider",
    "doc_count": 108
    },
    {
    "key": "collection",
    "doc_count": 87
    },
    {
    "key": "reconciliationReport",
    "doc_count": 48
    },
    {
    "key": "rule",
    "doc_count": 7
    }
    ]
    }
    }

    Resuming a reindex

    If a reindex operation did not fully complete it can be resumed using the following command run from the Kibana instance.

    POST _reindex?wait_for_completion=false
    {
    "conflicts": "proceed",
    "source": {
    "index": "cumulus-2020-11-3"
    },
    "dest": {
    "index": "cumulus-2021-3-4",
    "op_type": "create"
    }
    }

    The Cumulus API reindex-status endpoint can be used to monitor completion of this operation.

    - + \ No newline at end of file diff --git a/docs/v10.1.0/troubleshooting/rerunning-workflow-executions/index.html b/docs/v10.1.0/troubleshooting/rerunning-workflow-executions/index.html index ce773b4fa4a..488a405d0f6 100644 --- a/docs/v10.1.0/troubleshooting/rerunning-workflow-executions/index.html +++ b/docs/v10.1.0/troubleshooting/rerunning-workflow-executions/index.html @@ -5,13 +5,13 @@ Re-running workflow executions | Cumulus Documentation - +
    Version: v10.1.0

    Re-running workflow executions

    To re-run a Cumulus workflow execution from the AWS console:

    1. Visit the page for an individual workflow execution

    2. Click the "New execution" button at the top right of the screen

Screenshot of the AWS console for a Step Function execution highlighting the "New execution" button at the top right of the screen

    3. In the "New execution" modal that appears, replace the cumulus_meta.execution_name value in the default input with the value of the new execution ID as seen in the screenshot below

      Screenshot of the AWS console showing the modal window for entering input when running a new Step Function execution

    4. Click the "Start execution" button

    - + \ No newline at end of file diff --git a/docs/v10.1.0/troubleshooting/troubleshooting-deployment/index.html b/docs/v10.1.0/troubleshooting/troubleshooting-deployment/index.html index f5882462ff8..a8043733227 100644 --- a/docs/v10.1.0/troubleshooting/troubleshooting-deployment/index.html +++ b/docs/v10.1.0/troubleshooting/troubleshooting-deployment/index.html @@ -5,7 +5,7 @@ Troubleshooting Deployment | Cumulus Documentation - + @@ -16,7 +16,7 @@ data-persistence modules, but your config is only creating one Elasticsearch instance. To fix the issue, update the elasticsearch_config variable for your data-persistence module to increase the number of instances:

    {
    domain_name = "es"
    instance_count = 2
    instance_type = "t2.small.elasticsearch"
    version = "5.3"
    volume_size = 10
    }

    Install dashboard

    Dashboard configuration

    Issues:

• Problem clearing the cache: "EACCES: permission denied, rmdir '/tmp/gulp-cache/default'". This probably means the files at that location, and/or the folder, are owned by someone else (or some other factor prevents you from writing there).

    It's possible to workaround this by editing the file cumulus-dashboard/node_modules/gulp-cache/index.js and alter the value of the line var fileCache = new Cache({cacheDirName: 'gulp-cache'}); to something like var fileCache = new Cache({cacheDirName: '<prefix>-cache'});. Now gulp-cache will be able to write to /tmp/<prefix>-cache/default, and the error should resolve.

    Dashboard deployment

    Issues:

    • If the dashboard sends you to an Earthdata Login page that has an error reading "Invalid request, please verify the client status or redirect_uri before resubmitting", this means you've either forgotten to update one or more of your EARTHDATA_CLIENT_ID, EARTHDATA_CLIENT_PASSWORD environment variables (from your app/.env file) and re-deploy Cumulus, or you haven't placed the correct values in them, or you've forgotten to add both the "redirect" and "token" URL to the Earthdata Application.
    • There is odd caching behavior associated with the dashboard and Earthdata Login at this point in time that can cause the above error to reappear on the Earthdata Login page loaded by the dashboard even after fixing the cause of the error. If you experience this, attempt to access the dashboard in a new browser window, and it should work.
    - + \ No newline at end of file diff --git a/docs/v10.1.0/upgrade-notes/cumulus_distribution_migration/index.html b/docs/v10.1.0/upgrade-notes/cumulus_distribution_migration/index.html index 71cbb8ba991..c4ee5db199d 100644 --- a/docs/v10.1.0/upgrade-notes/cumulus_distribution_migration/index.html +++ b/docs/v10.1.0/upgrade-notes/cumulus_distribution_migration/index.html @@ -5,14 +5,14 @@ Migrate from TEA deployment to Cumulus Distribution | Cumulus Documentation - +
    Version: v10.1.0

    Migrate from TEA deployment to Cumulus Distribution

    Background

    The Cumulus Distribution API is configured to use the AWS Cognito OAuth client. This API can be used instead of the Thin Egress App, which is the default distribution API if using the Deployment Template.

    Configuring a Cumulus Distribution deployment

    See these instructions for deploying the Cumulus Distribution API.

    Important note if migrating from TEA to Cumulus Distribution

    If you already have a deployment using the TEA distribution and want to switch to Cumulus Distribution, there will be an API Gateway change. This means that there will be downtime while you update your CloudFront endpoint to use the new API gateway.

    - + \ No newline at end of file diff --git a/docs/v10.1.0/upgrade-notes/migrate_tea_standalone/index.html b/docs/v10.1.0/upgrade-notes/migrate_tea_standalone/index.html index 322f705dcc0..b4ea6314973 100644 --- a/docs/v10.1.0/upgrade-notes/migrate_tea_standalone/index.html +++ b/docs/v10.1.0/upgrade-notes/migrate_tea_standalone/index.html @@ -5,13 +5,13 @@ Migrate TEA deployment to standalone module | Cumulus Documentation - +
    Version: v10.1.0

    Migrate TEA deployment to standalone module

    Background

    This document is only relevant for upgrades of Cumulus from versions < 3.x.x to versions > 3.x.x

Previous versions of Cumulus included deployment of the Thin Egress App (TEA) by default in the distribution module. As a result, Cumulus users who wanted to deploy a new version of TEA had to wait for a new release of Cumulus that incorporated that release.

In order to give Cumulus users the flexibility to deploy newer versions of TEA whenever they want, deployment of TEA has been removed from the distribution module and Cumulus users must now add the TEA module to their deployment. Guidance on integrating the TEA module into your deployment is provided, or you can refer to the Cumulus core example deployment code for the thin_egress_app module.

    By default, when upgrading Cumulus and moving from TEA deployed via the distribution module to deployed as a separate module, your API gateway for TEA would be destroyed and re-created, which could cause outages for any Cloudfront endpoints pointing at that API gateway.

    These instructions outline how to modify your state to preserve your existing Thin Egress App (TEA) API gateway when upgrading Cumulus and moving deployment of TEA to a standalone module. If you do not care about preserving your API gateway for TEA when upgrading your Cumulus deployment, you can skip these instructions.

    Prerequisites

    Notes about state management

    These instructions will involve manipulating your Terraform state via terraform state mv commands. These operations are extremely dangerous, since a mistake in editing your Terraform state can leave your stack in a corrupted state where deployment may be impossible or may result in unanticipated resource deletion.

    Since bucket versioning preserves a separate version of your state file each time it is written, and the Terraform state modification commands overwrite the state file, we can mitigate the risk of these operations by downloading the most recent state file before starting the upgrade process. Then, if anything goes wrong during the upgrade, we can restore that previous state version. Guidance on how to perform both operations is provided below.
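
    If you later need to find or retrieve an older version of the state file, the S3 versioning APIs can be used directly. Below is a minimal sketch, using the same BUCKET and KEY placeholders as the commands that follow; the VERSION_ID value comes from the listing output:

     # List the stored versions of the state file
     aws s3api list-object-versions \
        --bucket BUCKET \
        --prefix KEY \
        --query 'Versions[].[VersionId,LastModified,IsLatest]' \
        --output table

     # Download a specific older version
     aws s3api get-object \
        --bucket BUCKET \
        --key KEY \
        --version-id VERSION_ID \
        /path/to/terraform.tfstate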

    Download your most recent state version

    Run this command to download the most recent cumulus deployment state file, replacing BUCKET and KEY with the correct values from cumulus-tf/terraform.tf:

     aws s3 cp s3://BUCKET/KEY /path/to/terraform.tfstate

    Restore a previous state version

    Upload the state file that was previously downloaded to the bucket/key for your state file, replacing BUCKET and KEY with the correct values from cumulus-tf/terraform.tf:

     aws s3 cp /path/to/terraform.tfstate s3://BUCKET/KEY

    Then run terraform plan, which will give an error because we manually overwrote the state file and it is now out of sync with the lock table Terraform uses to track your state file:

    Error: Error loading state: state data in S3 does not have the expected content.

    This may be caused by unusually long delays in S3 processing a previous state
    update. Please wait for a minute or two and try again. If this problem
    persists, and neither S3 nor DynamoDB are experiencing an outage, you may need
    to manually verify the remote state and update the Digest value stored in the
    DynamoDB table to the following value: <some-digest-value>

    To resolve this error, run this command and replace DYNAMO_LOCK_TABLE, BUCKET and KEY with the correct values from cumulus-tf/terraform.tf, and use the digest value from the previous error output:

     aws dynamodb put-item \
        --table-name DYNAMO_LOCK_TABLE \
        --item '{
            "LockID": {"S": "BUCKET/KEY-md5"},
            "Digest": {"S": "some-digest-value"}
        }'

    Now, if you re-run terraform plan, it should work as expected.
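
    Optionally, you can confirm that the digest record was written as expected by reading the item back from the lock table (same placeholders as above):

     aws dynamodb get-item \
        --table-name DYNAMO_LOCK_TABLE \
        --key '{"LockID": {"S": "BUCKET/KEY-md5"}}'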

    Migration instructions

    Please note: These instructions assume that you are deploying the thin_egress_app module as shown in the Cumulus core example deployment code.

    1. Ensure that you have downloaded the latest version of your state file for your cumulus deployment

    2. Find the URL for your <prefix>-thin-egress-app-EgressGateway API gateway. Confirm that you can access it in the browser and that it is functional.

    3. Run terraform plan. You should see output like (edited for readability):

      # module.thin_egress_app.aws_cloudformation_stack.thin_egress_app will be created
      + resource "aws_cloudformation_stack" "thin_egress_app" {

      # module.thin_egress_app.aws_s3_bucket.lambda_source will be created
      + resource "aws_s3_bucket" "lambda_source" {

      # module.thin_egress_app.aws_s3_bucket_object.cloudformation_template will be created
      + resource "aws_s3_bucket_object" "cloudformation_template" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive will be created
      + resource "aws_s3_bucket_object" "lambda_code_dependency_archive" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_source will be created
      + resource "aws_s3_bucket_object" "lambda_source" {

      # module.thin_egress_app.aws_security_group.egress_lambda[0] will be created
      + resource "aws_security_group" "egress_lambda" {

      ...

      # module.cumulus.module.distribution.module.thin_egress_app.aws_cloudformation_stack.thin_egress_app will be destroyed
      - resource "aws_cloudformation_stack" "thin_egress_app" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket.lambda_source will be destroyed
      - resource "aws_s3_bucket" "lambda_source" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.cloudformation_template will be destroyed
      - resource "aws_s3_bucket_object" "cloudformation_template" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive will be destroyed
      - resource "aws_s3_bucket_object" "lambda_code_dependency_archive" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_source will be destroyed
      - resource "aws_s3_bucket_object" "lambda_source" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_security_group.egress_lambda[0] will be destroyed
      - resource "aws_security_group" "egress_lambda" {
    4. Run the state modification commands. The commands must be run in exactly this order:

       # Move security group
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_security_group.egress_lambda module.thin_egress_app.aws_security_group.egress_lambda

      # Move TEA storage bucket
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket.lambda_source module.thin_egress_app.aws_s3_bucket.lambda_source

      # Move TEA lambda source code
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_source module.thin_egress_app.aws_s3_bucket_object.lambda_source

      # Move TEA lambda dependency code
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive

      # Move TEA Cloudformation template
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.cloudformation_template module.thin_egress_app.aws_s3_bucket_object.cloudformation_template

      # Move URS creds secret version
      terraform state mv module.cumulus.module.distribution.aws_secretsmanager_secret_version.thin_egress_urs_creds aws_secretsmanager_secret_version.thin_egress_urs_creds

      # Move URS creds secret
      terraform state mv module.cumulus.module.distribution.aws_secretsmanager_secret.thin_egress_urs_creds aws_secretsmanager_secret.thin_egress_urs_creds

      # Move TEA Cloudformation stack
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_cloudformation_stack.thin_egress_app module.thin_egress_app.aws_cloudformation_stack.thin_egress_app

      Depending on how you were supplying a bucket map to TEA, there may be an additional step. If you were specifying the bucket_map_key variable to the cumulus module to use a custom bucket map, then you can ignore this step and just ensure that the bucket_map_file variable to the TEA module uses that same S3 key. Otherwise, if you were letting Cumulus generate a bucket map for you, then you need to take this step to migrate that bucket map:

      # Move bucket map
      terraform state mv module.cumulus.module.distribution.aws_s3_bucket_object.bucket_map_yaml[0] aws_s3_bucket_object.bucket_map_yaml
    5. Run terraform plan again. You may still see a few additions/modifications pending like below, but you should not see any deletion of Thin Egress App resources pending:

      # module.thin_egress_app.aws_cloudformation_stack.thin_egress_app will be updated in-place
      ~ resource "aws_cloudformation_stack" "thin_egress_app" {

      # module.thin_egress_app.aws_s3_bucket_object.cloudformation_template will be updated in-place
      ~ resource "aws_s3_bucket_object" "cloudformation_template" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive will be updated in-place
      ~ resource "aws_s3_bucket_object" "lambda_code_dependency_archive" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_source will be updated in-place
      ~ resource "aws_s3_bucket_object" "lambda_source" {

      If you still see deletion of module.cumulus.module.distribution.module.thin_egress_app.aws_cloudformation_stack.thin_egress_app pending, then something went wrong and you should restore the previously downloaded state file version and start over from step 1. Otherwise, proceed to step 6.

    6. Once you have confirmed that everything looks as expected, run terraform apply.

    7. Visit the same API gateway from step 1 and confirm that it still works.

    Your TEA deployment has now been migrated to a standalone module, which gives you the ability to upgrade the deployed version of TEA independently of Cumulus releases.
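
    As a final check on the steps above, you can confirm that the Thin Egress App resources are now tracked under the standalone module path and that nothing remains under the old distribution module path. A quick sketch, assuming the module names used in these instructions:

     # Should list the TEA resources under module.thin_egress_app
     terraform state list | grep '^module.thin_egress_app'

     # Should return nothing
     terraform state list | grep 'module.cumulus.module.distribution.module.thin_egress_app'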

    - + \ No newline at end of file diff --git a/docs/v10.1.0/upgrade-notes/update-cma-2.0.2/index.html b/docs/v10.1.0/upgrade-notes/update-cma-2.0.2/index.html index b167e10c7e4..f02c624da4b 100644 --- a/docs/v10.1.0/upgrade-notes/update-cma-2.0.2/index.html +++ b/docs/v10.1.0/upgrade-notes/update-cma-2.0.2/index.html @@ -5,13 +5,13 @@ Upgrade to CMA 2.0.2 | Cumulus Documentation - +
    Version: v10.1.0

    Upgrade to CMA 2.0.2

    Updating a Cumulus Deployment to CMA 2.0.2

    Background

    The Cumulus Message Adapter has been updated in release 2.0.2 to no longer utilize the AWS step function API to look up the defined name of a step function task for population in meta.workflow_tasks, but instead use an incrementing integer field.

    Additionally, a bugfix was released in the form of v2.0.1/v2.0.2 following the initial 2.0.0 release, so all users should update to release 2.0.2.

    The update is not tied to a particular version of Core; however, the update should be done across all task components in order to ensure consistent execution records.

    Changes

    Execution Record Update

    This update functionally means that Cumulus tasks/activities using the CMA will now write an entry that looks like the following in meta.workflow_tasks, and more importantly in the tasks column for an execution record:

    Original

          "DiscoverGranules": {
    "name": "jk-tf-DiscoverGranules",
    "version": "$LATEST",
    "arn": "arn:aws:lambda:us-east-1:xxxxx:function:jk-tf-DiscoverGranules"
    },
    "QueueGranules": {
    "name": "jk-tf-QueueGranules",
    "version": "$LATEST",
    "arn": "arn:aws:lambda:us-east-1:xxxx:function:jk-tf-QueueGranules"
    }

    New

          "0": {
    "name": "jk-tf-DiscoverGranules",
    "version": "$LATEST",
    "arn": "arn:aws:lambda:us-east-1:xxxxx:function:jk-tf-DiscoverGranules"
    },
    "1": {
    "name": "jk-tf-QueueGranules",
    "version": "$LATEST",
    "arn": "arn:aws:lambda:us-east-1:xxxx:function:jk-tf-QueueGranules"
    }

    Actions Required

    The following should be done as part of a Cumulus stack update to utilize cumulus message adapter > 2.0.2:

    • Python tasks that utilize cumulus-message-adapter-python should be updated to use > 2.0.0, their lambdas rebuilt and Cumulus workflows reconfigured to use the updated version.

    • Python activities that utilize cumulus-process-py should be rebuilt using > 1.0.0 with updated dependencies, and have their images deployed/Cumulus configured to use the new version.

    • The cumulus-message-adapter v2.0.2 lambda layer should be made available in the deployment account, and the Cumulus deployment should be reconfigured to use it (via the cumulus_message_adapter_lambda_layer_version_arn variable in the cumulus module). This should address all Core node.js tasks that utilize the CMA, and many contributed node.js/Java components.

    Once the above have been done, redeploy Cumulus to apply the configuration and the updates should be live.
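
    For illustration, publishing the CMA v2.0.2 layer and wiring it into your deployment might look like the following sketch; the layer name and zip filename here are examples rather than requirements:

     # Publish the CMA zip as a Lambda layer and print the resulting version ARN
     aws lambda publish-layer-version \
        --layer-name Cumulus_Message_Adapter \
        --zip-file fileb://cumulus-message-adapter.zip \
        --query 'LayerVersionArn' \
        --output text

     # Then set that ARN in your cumulus-tf terraform.tfvars, for example:
     # cumulus_message_adapter_lambda_layer_version_arn = "arn:aws:lambda:us-east-1:123456789012:layer:Cumulus_Message_Adapter:1"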

    - + \ No newline at end of file diff --git a/docs/v10.1.0/upgrade-notes/update-task-file-schemas/index.html b/docs/v10.1.0/upgrade-notes/update-task-file-schemas/index.html index 03c421f647b..1b405940ec7 100644 --- a/docs/v10.1.0/upgrade-notes/update-task-file-schemas/index.html +++ b/docs/v10.1.0/upgrade-notes/update-task-file-schemas/index.html @@ -5,13 +5,13 @@ Updates to task granule file schemas | Cumulus Documentation - +
    Version: v10.1.0

    Updates to task granule file schemas

    Background

    Most Cumulus workflow tasks expect as input a payload of granule(s) which contain the files for each granule. Most tasks also return this same granule structure as output.

    However, up to this point, there was inconsistency in the schemas for the granule files objects expected by each task. Furthermore, there was no guarantee of consistency between granule files objects as stored in the database and the expectations of any given workflow task.

    Thus, when performing bulk granule operations which pass granules from the database into a Cumulus workflow, it was possible for there to be schema validation failures depending on which task was used to start the workflow and its particular schema.

    In order to rectify this situation, CUMULUS-2388 was filed and addressed to create a common granule files schema between nearly all of the Cumulus tasks (exceptions discussed below) and the Cumulus database. The following documentation explains the manual changes you need to make to your deployment in order to be compatible with the updated files schema.

    Updated files schema

    The updated granule files schema can be found here.

    These former properties were deprecated (with notes about how to derive the same information from the updated schema, if possible):

    • filename - concatenate the bucket and key values with a directory separator (/); see the sketch after this list
    • name - use fileName property
    • etag - ETags are no longer provided as an individual file property. Instead, a separate etags object mapping S3 URIs to ETag values is provided as output from the following workflow tasks (guidance on how to integrate this output with your workflows is provided in the Upgrading your workflows section below):
      • update-granules-cmr-metadata-file-links
      • hyrax-metadata-updates
    • fileStagingDir - no longer supported
    • url_path - no longer supported
    • duplicate_found - This property is no longer supported, however sync-granule and move-granules now produce a separate granuleDuplicates object as part of their output. The granuleDuplicates object is a map of granules by granule ID which includes the files that encountered duplicates during processing. Guidance on how to integrate granuleDuplicates information into your workflow configuration is provided below.
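
    As a small illustration of the filename derivation mentioned above, the deprecated value can be reconstructed from the updated schema's bucket and key properties; the bucket and key values here are made up:

     jq -n --arg bucket "my-protected-bucket" --arg key "path/to/granule-file.hdf" \
        '{filename: ($bucket + "/" + $key)}'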

    Exceptions

    These workflow tasks did not have their schema for granule files updated:

    • discover-granules - no updates
    • queue-granules - no updates
    • parse-pdr - no updates
    • sync-granule - input schema not updated, output schema was updated

    The reason that these task schemas were not updated is that all of these tasks start before the files have been ingested to S3, thus much of the information that is required in the updated files schema like bucket, key, or checksum is not yet known.

    Bulk granule operations

    Since the input schema for the above tasks was not updated, that means you cannot run bulk granule operations against workflows if they start with any of those tasks. Bulk granule operations work by loading the specified granules from the database and sending them as input to a specified workflow, so if the specified workflow begins with a task whose input schema does not conform to what is coming out of the database, there will be schema errors.

    Upgrading your deployment

    Upgrading your workflows

    For any workflows using the update-granules-cmr-metadata-file-links task before the hyrax-metadata-updates and/or post-to-cmr tasks, update the step definition for update-granules-cmr-metadata-file-links as follows:

        "UpdateGranulesCmrMetadataFileLinksStep": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "buckets": "{$.meta.buckets}",
    "distribution_endpoint": "{$.meta.distribution_endpoint}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.etags}",
    "destination": "{$.meta.file_etags}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    ...more configuration...

    hyrax-metadata-updates

    For any workflows using the hyrax-metadata-updates task before a post-to-cmr task, update the definition of the hyrax-metadata-updates step as follows:

        "HyraxMetadataUpdatesTask": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "bucket": "{$.meta.buckets.internal.name}",
    "stack": "{$.meta.stack}",
    "cmr": "{$.meta.cmr}",
    "launchpad": "{$.meta.launchpad}",
    "etags": "{$.meta.file_etags}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.etags}",
    "destination": "{$.meta.file_etags}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    ...more configuration...

    post-to-cmr

    For any workflows using post-to-cmr task after the update-granules-cmr-metadata-file-links or hyrax-metadata-updates tasks, update the post-to-cmr step definition as follows:

        "CmrStep": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "bucket": "{$.meta.buckets.internal.name}",
    "stack": "{$.meta.stack}",
    "cmr": "{$.meta.cmr}",
    "launchpad": "{$.meta.launchpad}",
    "etags": "{$.meta.file_etags}"
    }
    }
    },
    ...more configuration...

    Example workflow

    For an example workflow integrating all of these changes, please see our example ingest and publish workflow.

    Optional - Integrate granuleDuplicates information

    Please note that the granuleDuplicates output is purely informational and does not have any bearing on the separate configuration for how duplicates should be handled.

    You can include granuleDuplicates output from the sync-granule or move-granules tasks in your workflow messages like so:

        "SyncGranule": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    ...other config...
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.granuleDuplicates}",
    "destination": "{$.meta.sync_granule.granule_duplicates}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    }
    ...more configuration...

    The result of this configuration is that the granuleDuplicates output from sync-granule would be placed in meta.sync_granule.granule_duplicates on the workflow message and remain there throughout the rest of the workflow. The same configuration could be replicated for the move-granules task, but be sure to use a different destination in the workflow message for the granuleDuplicates output.

    Updating collection URL path templates

    Collections can specify url_path templates to dynamically generate the final location of files. As part of url_path templates, file object properties can be interpolated to generate the file path. Thus, these url_path templates need to be updated to ensure that they are compatible with the updated files schema and the properties that will actually be available on file objects.

    See the notes on the updated files schema to know which properties are available and which previously existing properties were deprecated.

    As an example, you will want to update any url_path properties in your collections to remove references to file.name and replace them with references to file.fileName like so:

    - "url_path": "{cmrMetadata.CollectionReference.ShortName}___{cmrMetadata.CollectionReference.Version}/{substring(file.name, 0, 3)}",
    + "url_path": "{cmrMetadata.CollectionReference.ShortName}___{cmrMetadata.CollectionReference.Version}/{substring(file.fileName, 0, 3)}",
    - + \ No newline at end of file diff --git a/docs/v10.1.0/upgrade-notes/upgrade-rds/index.html b/docs/v10.1.0/upgrade-notes/upgrade-rds/index.html index d19ab36f12d..7b5c3c53af1 100644 --- a/docs/v10.1.0/upgrade-notes/upgrade-rds/index.html +++ b/docs/v10.1.0/upgrade-notes/upgrade-rds/index.html @@ -5,7 +5,7 @@ Upgrade to RDS release | Cumulus Documentation - + @@ -21,7 +21,7 @@ | cutoffSeconds | number | Number of seconds prior to this execution to 'cutoff' reconciliation queries. This allows in-progress/other in-flight operations time to complete and propagate to Elasticsearch/Dynamo/postgres. | 3600 | | dbConcurrency | number | Sets max number of parallel collections reports the script will run at a time. | 20 | | dbMaxPool | number | Sets the maximum number of connections the database pool has available. Modifying this may result in unexpected failures. | 20 |

    - + \ No newline at end of file diff --git a/docs/v10.1.0/upgrade-notes/upgrade_tf_version_0.13.6/index.html b/docs/v10.1.0/upgrade-notes/upgrade_tf_version_0.13.6/index.html index 8d659de35b9..f4f921fa308 100644 --- a/docs/v10.1.0/upgrade-notes/upgrade_tf_version_0.13.6/index.html +++ b/docs/v10.1.0/upgrade-notes/upgrade_tf_version_0.13.6/index.html @@ -5,13 +5,13 @@ Upgrade to TF version 0.13.6 | Cumulus Documentation - +
    Version: v10.1.0

    Upgrade to TF version 0.13.6

    Background

    Cumulus pins its support to a specific version of Terraform (see the deployment documentation). The reason for only supporting one specific Terraform version at a time is to avoid deployment errors that can be caused by deploying to the same target with different Terraform versions.

    Cumulus is upgrading its supported version of Terraform from 0.12.12 to 0.13.6. This document contains instructions on how to perform the upgrade for your deployments.

    Prerequisites

    • Follow the Terraform guidance for what to do before upgrading, notably ensuring that you have no pending changes to your Cumulus deployments before proceeding.
      • You should do a terraform plan to see if you have any pending changes for your deployment (for both the data-persistence-tf and cumulus-tf modules), and if so, run a terraform apply before doing the upgrade to Terraform 0.13.6
    • Review the Terraform v0.13 release notes to prepare for any breaking changes that may affect your custom deployment code. Cumulus' deployment code has already been updated for compatibility with version 0.13.
    • Install Terraform version 0.13.6. We recommend using Terraform Version Manager tfenv to manage your installed versions of Terraform, but this is not required.
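
    For example, installing and selecting the pinned version with tfenv might look like:

     tfenv install 0.13.6
     tfenv use 0.13.6
     terraform --version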

    Upgrade your deployment code

    Terraform 0.13 does not support some of the syntax from previous Terraform versions, so you need to upgrade your deployment code for compatibility.

    Terraform provides a 0.13upgrade command as part of version 0.13 to handle automatically upgrading your code. Make sure to check out the documentation on batch usage of 0.13upgrade, which will allow you to upgrade all of your Terraform code with one command.

    Run the 0.13upgrade command until you have no more necessary updates to your deployment code.
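
    One possible form of the batch invocation, sketched from the Terraform guidance (adjust the paths to match your repository layout):

     # Run 0.13upgrade non-interactively in every directory containing .tf files
     find . -name '*.tf' -not -path '*/.terraform/*' \
        | xargs -n1 dirname \
        | sort -u \
        | xargs -n1 terraform 0.13upgrade -yes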

    Upgrade your deployment

    1. Ensure that you are running Terraform 0.13.6 by running terraform --version. If you are using tfenv, you can switch versions by running tfenv use 0.13.6.

    2. For the data-persistence-tf and cumulus-tf directories, take the following steps:

      1. Run terraform init --reconfigure. The --reconfigure flag is required, otherwise you might see an error like:

        Error: Failed to decode current backend config

        The backend configuration created by the most recent run of "terraform init"
        could not be decoded: unsupported attribute "lock_table". The configuration
        may have been initialized by an earlier version that used an incompatible
        configuration structure. Run "terraform init -reconfigure" to force
        re-initialization of the backend.
      2. Run terraform apply to perform a deployment.

        WARNING: Even if Terraform says that no resource changes are pending, running the apply using Terraform version 0.13.6 will modify your backend state from version 0.12.12 to version 0.13.6 without requiring approval. Updating the backend state is a necessary part of the version 0.13.6 upgrade, but it is not completely transparent.

    - + \ No newline at end of file diff --git a/docs/v10.1.0/workflow_tasks/discover_granules/index.html b/docs/v10.1.0/workflow_tasks/discover_granules/index.html index 459453b3fc1..58d6f54da31 100644 --- a/docs/v10.1.0/workflow_tasks/discover_granules/index.html +++ b/docs/v10.1.0/workflow_tasks/discover_granules/index.html @@ -5,7 +5,7 @@ Discover Granules | Cumulus Documentation - + @@ -21,7 +21,7 @@ included in a granule's file list. That is, no such filtering based on filename occurs as described above.

    When set on the task configuration, the value applies to all collections during discovery. Otherwise, this property may be set on individual collections.

    Concurrency

    A number property that determines the level of concurrency with which granule duplicate checks are performed when duplicateGranuleHandling is skip or error.

    Limiting concurrency helps to avoid throttling by the AWS Lambda API and helps to avoid encountering account Lambda concurrency limitations.

    We do not recommend increasing this value unless you are seeing Lambda.Timeout errors when discover-granules discovers a large number of granules with skip or error duplicate handling. However, as increasing the concurrency may lead to Lambda API or Lambda concurrency throttling errors, you may wish to consider converting the discover-granules task to an ECS activity, which does not face similar runtime constraints.

    The default value is 3.

    Task Outputs

    This task outputs an assembled array of Cumulus granule objects as the payload for the next task, and returns only the expected payload for the next task.

    - + \ No newline at end of file diff --git a/docs/v10.1.0/workflow_tasks/files_to_granules/index.html b/docs/v10.1.0/workflow_tasks/files_to_granules/index.html index 93b660617f0..6a777daee83 100644 --- a/docs/v10.1.0/workflow_tasks/files_to_granules/index.html +++ b/docs/v10.1.0/workflow_tasks/files_to_granules/index.html @@ -5,13 +5,13 @@ Files To Granules | Cumulus Documentation - +
    Version: v10.1.0

    Files To Granules

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions and more can be found on the auto-generated Cumulus Tasks page.

    Summary

    This task utilizes the incoming config.inputGranules and the task input list of s3 URIs along with the rest of the configuration objects to take the list of incoming files and sort them into a list of granule objects.

    Please note: Files passed in without metadata previously defined for config.inputGranules will be added with the following keys:

    • size
    • bucket
    • key
    • fileName

    It is primarily intended to support compatibility with the standard output of a processing task, and convert that output into a granule object accepted as input by the majority of other Cumulus tasks.

    Task Inputs

    Input

    This task expects an incoming input that contains an array of 'staged' S3 URIs to move to their final archive location.

    For the specifics, see the Cumulus Tasks page entry for the schema.

    Configuration

    This task does expect values to be set in the workflow_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Below are expanded descriptions of selected config keys:

    inputGranules

    An array of Cumulus granule objects.

    This object will be used to define metadata values for the move granules task, and is the basis for the updated object that will be added to the output.

    Task Outputs

    This task outputs an assembled array of Cumulus granule objects as the payload for the next task, and returns only the expected payload for the next task.

    - + \ No newline at end of file diff --git a/docs/v10.1.0/workflow_tasks/lzards_backup/index.html b/docs/v10.1.0/workflow_tasks/lzards_backup/index.html index d9945621107..98665a59783 100644 --- a/docs/v10.1.0/workflow_tasks/lzards_backup/index.html +++ b/docs/v10.1.0/workflow_tasks/lzards_backup/index.html @@ -5,13 +5,13 @@ LZARDS Backup | Cumulus Documentation - +
    Version: v10.1.0

    LZARDS Backup

    The LZARDS backup task takes an array of granules and initiates backup requests to the LZARDS API, which will be handled asynchronously by LZARDS.

    Deployment

    The LZARDS backup task is not automatically deployed with Cumulus. To deploy the task through the Cumulus module, first you must specify a lzards_launchpad_passphrase in your terraform variables (e.g. variables.tf) like so:

    variable "lzards_launchpad_passphrase" {
    type = string
    default = ""
    }

    Then you can specify a value for your lzards_launchpad_passphrase in terraform.tfvars like so:

    lzards_launchpad_passphrase = "your-passphrase"

    Lastly, you need to make sure that the lzards_launchpad_passphrase is passed into the Cumulus module (in main.tf) like so:

    lzards_launchpad_passphrase  = var.lzards_launchpad_passphrase

    In short, deploying the LZARDS task requires configuring a passphrase variable and ensuring that your TF configuration passes that variable into the Cumulus module.

    Additional terraform configuration for the LZARDS task can be found in the cumulus module's variables.tf file, where the relevant variables are prefixed with lzards_. You can add these variables to your deployment using the same process outlined above for lzards_launchpad_passphrase.
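
    If you want to see the full set of LZARDS-related variables locally, one option after running terraform init is to grep the downloaded module source; the module cache path below is an assumption and may differ in your deployment:

     grep -r 'variable "lzards_' .terraform/modules/cumulus/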

    Task Inputs

    Input

    This task expects an array of granules as input.

    For the specifics, see the Cumulus Tasks page entry for the schema.

    Configuration

    This task does expect values to be set in the workflow_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Task Outputs

    Output

    The LZARDS task outputs a composite object containing:

    • the input granules array, and
    • a backupResults object that describes the results of LZARDS backup attempts.

    For the specifics, see the Cumulus Tasks page entry for the schema.

    - + \ No newline at end of file diff --git a/docs/v10.1.0/workflow_tasks/move_granules/index.html b/docs/v10.1.0/workflow_tasks/move_granules/index.html index a17f8c6558f..f1d33958b4b 100644 --- a/docs/v10.1.0/workflow_tasks/move_granules/index.html +++ b/docs/v10.1.0/workflow_tasks/move_granules/index.html @@ -5,13 +5,13 @@ Move Granules | Cumulus Documentation - +
    Version: v10.1.0

    Move Granules

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions and more can be found on the auto-generated Cumulus Tasks page.

    Summary

    This task utilizes the incoming event.input array of Cumulus granule objects to do the following:

    • Move granules from their 'staging' location to the final location (as configured in the Sync Granules task)

    • Update the event.input object with the new file locations.

    • If the granule has an ECHO10/UMM CMR file (.cmr.xml or .cmr.json) included in the event.input:

      • Update that file's access locations

      • Add it to the appropriate access URL category for the CMR filetype as defined by granule CNM filetype.

      • Set the CMR file to 'metadata' in the output granules object and add it to the granule files if it's not already present.

        Please note: Granules without a valid CNM type set in the granule file type field in event.input will be treated as "data" in the updated CMR metadata file

    • The task then outputs an updated list of granule objects.

    Task Inputs

    Input

    This task expects an incoming input that contains a list of 'staged' S3 URIs to move to their final archive location. If CMR metadata is to be updated for a granule, it must also be included in the input.

    For the specifics, see the Cumulus Tasks page entry for the schema.

    Configuration

    This task does expect values to be set in the workflow_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Input

    This task expects event.input to provide an array of Cumulus granule objects. The files listed for each granule represent the files to be acted upon as described in summary.

    Task Outputs

    This task outputs an assembled array of Cumulus granule objects with post-move file locations as the payload for the next task, and returns only the expected payload for the next task. If a CMR file has been specified for a granule object, the CMR resources related to the granule files will be updated according to the updated granule file metadata.

    Examples

    See the SIPS workflow cookbook for an example of this task in a workflow

    - + \ No newline at end of file diff --git a/docs/v10.1.0/workflow_tasks/parse_pdr/index.html b/docs/v10.1.0/workflow_tasks/parse_pdr/index.html index 03b96bc8d3b..baae2225cb5 100644 --- a/docs/v10.1.0/workflow_tasks/parse_pdr/index.html +++ b/docs/v10.1.0/workflow_tasks/parse_pdr/index.html @@ -5,13 +5,13 @@ Parse PDR | Cumulus Documentation - +
    Version: v10.1.0

    Parse PDR

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions and more can be found on the auto-generated Cumulus Tasks page.

    Summary

    The purpose of this task is to do the following with the incoming PDR object:

    • Stage it to an internal S3 bucket

    • Parse the PDR

    • Archive the PDR and remove the staged file if successful

    • Outputs a payload object containing metadata about the parsed PDR (e.g. total size of all files, file counts, etc.) and a granules object

    The constructed granules object is created using PDR metadata to determine values like data type and version, and collection definitions to determine a file storage location based on the extracted data type and version number.

    Granule file types are converted from the PDR spec types to CNM types according to the following translation table:

      HDF: 'data',
      HDF-EOS: 'data',
      SCIENCE: 'data',
      BROWSE: 'browse',
      METADATA: 'metadata',
      BROWSE_METADATA: 'metadata',
      QA_METADATA: 'metadata',
      PRODHIST: 'qa',
      QA: 'metadata',
      TGZ: 'data',
      LINKAGE: 'data'

    Files missing file types will have none assigned; files with invalid types will result in a PDR parse failure.

    Task Inputs

    Input

    This task expects an incoming input that contains name and path information about the PDR to be parsed. For the specifics, see the Cumulus Tasks page entry for the schema.

    Configuration

    This task does expect values to be set in the workflow_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Below are expanded descriptions of selected config keys:

    Provider

    A Cumulus provider object. Used to define connection information for retrieving the PDR.

    Bucket

    Defines the bucket where the 'pdrs' folder for parsed PDRs will be stored.

    Collection

    A Cumulus collection object. Used to define granule file groupings and granule metadata for discovered files.

    Task Outputs

    This task outputs a single payload output object containing metadata about the parsed PDR (e.g. filesCount, totalSize, etc.), a pdr object with information for later steps, and the generated array of granule objects.

    Examples

    See the SIPS workflow cookbook for an example of this task in a workflow

    - + \ No newline at end of file diff --git a/docs/v10.1.0/workflow_tasks/queue_granules/index.html b/docs/v10.1.0/workflow_tasks/queue_granules/index.html index 3c44b98d8f9..c4760675f65 100644 --- a/docs/v10.1.0/workflow_tasks/queue_granules/index.html +++ b/docs/v10.1.0/workflow_tasks/queue_granules/index.html @@ -5,14 +5,14 @@ Queue Granules | Cumulus Documentation - +
    Version: v10.1.0

    Queue Granules

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions, and more can be found on the auto-generated Cumulus Tasks page.

    Summary

    The purpose of this task is to schedule ingest of granules that were discovered on a remote host, whether via the DiscoverGranules task or the ParsePDR task.

    The task utilizes a defined collection in concert with a defined provider, either set on each granule or passed in via config, to queue up ingest executions for each granule or for batches of granules.

    The constructed granules object is defined by the collection passed in the configuration, and has impacts on other provided core Cumulus Tasks.

    Users of this task in a workflow are encouraged to carefully consider their configuration in context of downstream tasks and workflows.

    Task Inputs

    Each of the following sections is a high-level discussion of the intent of the various input/output/config values.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Input

    This task expects an incoming input that contains granules and information about them and their files. For the specifics, see the Cumulus Tasks page entry for the schema.

    This input is most commonly the output from a preceding DiscoverGranules or ParsePDR task.

    Cumulus Configuration

    This task does expect values to be set in the task_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Below are expanded descriptions of selected config keys:

    provider

    A Cumulus provider object for the originating provider. Will be passed along to the ingest workflow. This will be overruled by more specific provider information that may exist on a granule.

    internalBucket

    The Cumulus internal system bucket.

    granuleIngestWorkflow

    A string property that denotes the name of the ingest workflow into which granules should be queued.

    queueUrl

    A string property that denotes the URL of the queue to which scheduled execution messages are sent.

    preferredQueueBatchSize

    A number property that sets an upper bound on the size of each batch of granules queued into the payload of an ingest execution. Setting this property to a value higher than 1 allows queueing of multiple granules per ingest workflow.

    As ingest executions typically expect granules in the payload to have a common collection and common provider, this property only sets an upper bound within which batches will be created based on common collection and provider information.

    This means batches may be smaller than the preferred size if collection or provider information diverge, but never larger.

    The default value if none is specified is 1, which will queue one ingest execution per granule.

    concurrency

    A number property that determines the level of concurrency with which ingest executions are scheduled. Granules or batches of granules will be queued up into executions at this level of concurrency.

    This property is also used to limit concurrency when updating granule status to queued.

    Limiting concurrency helps to avoid throttling by the AWS Lambda API and helps to avoid encountering account Lambda concurrency limitations.

    We do not recommend increasing this value unless you are seeing Lambda.Timeout errors when queue-granules receives a large number of granules as input. However, as increasing the concurrency may lead to Lambda API or Lambda concurrency throttling errors, you may wish to consider converting the queue-granules task to an ECS activity, which does not face similar runtime constraints.

    The default value is 3.

    executionNamePrefix

    A string property that will prefix the names of scheduled executions.

    childWorkflowMeta

    An object property that will be merged into the scheduled execution input's meta field.

    Task Outputs

    This task outputs an assembled array of workflow execution ARNs for all scheduled workflow executions within the payload's running object.

    - + \ No newline at end of file diff --git a/docs/v10.1.0/workflows/cumulus-task-message-flow/index.html b/docs/v10.1.0/workflows/cumulus-task-message-flow/index.html index 7a28a0bf8be..0895f192311 100644 --- a/docs/v10.1.0/workflows/cumulus-task-message-flow/index.html +++ b/docs/v10.1.0/workflows/cumulus-task-message-flow/index.html @@ -5,14 +5,14 @@ Cumulus Tasks: Message Flow | Cumulus Documentation - +
    Version: v10.1.0

    Cumulus Tasks: Message Flow

    Cumulus Tasks make up Cumulus Workflows and are either AWS Lambda tasks or AWS Elastic Container Service (ECS) activities. Cumulus Tasks permit a payload as input to the main task application code. The task payload is additionally wrapped by the Cumulus Message Adapter. The Cumulus Message Adapter supplies additional information supporting message templating and metadata management of these workflows.

    Diagram showing how incoming and outgoing Cumulus messages for workflow steps are handled by the Cumulus Message Adapter

    The steps in this flow are detailed in sections below.

    Cumulus Message Format

    A full Cumulus Message has the following keys:

    • cumulus_meta: System runtime information that should generally not be touched outside of Cumulus library code or the Cumulus Message Adapter. Stores meta information about the workflow such as the state machine name and the current workflow execution's name. This information is used to look up the current active task. The name of the current active task is used to look up the corresponding task's config in task_config.
    • meta: Runtime information captured by the workflow operators. Stores execution-agnostic variables.
    • payload: Payload is runtime information for the tasks.

    In addition to the above keys, it may contain the following keys:

    • replace: A key generated in conjunction with the Cumulus Message adapter. It contains the location on S3 for a message payload and a Target JSON path in the message to extract it to.
    • exception: A key used to track workflow exceptions, should not be modified outside of Cumulus library code.

    Here's a simple example of a Cumulus Message:

    {
      "task_config": {
        "inlinestr": "prefix{meta.foo}suffix",
        "array": "{[$.meta.foo]}",
        "object": "{$.meta}"
      },
      "cumulus_meta": {
        "message_source": "sfn",
        "state_machine": "arn:aws:states:us-east-1:1234:stateMachine:MySfn",
        "execution_name": "MyExecution__id-1234",
        "id": "id-1234"
      },
      "meta": {
        "foo": "bar"
      },
      "payload": {
        "anykey": "anyvalue"
      }
    }

    A message utilizing the Cumulus Remote message functionality must have at least the keys replace and cumulus_meta. Depending on configuration other portions of the message may be present, however the cumulus_meta, meta, and payload keys must be present once extraction is complete.

    {
      "replace": {
        "Bucket": "cumulus-bucket",
        "Key": "my-large-event.json",
        "TargetPath": "$"
      },
      "cumulus_meta": {}
    }
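
    For debugging, the full event referenced by a replace key can be pulled directly from S3. A sketch using the bucket and key from the example above (jq is used only for pretty-printing):

     aws s3 cp s3://cumulus-bucket/my-large-event.json - | jq .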

    Cumulus Message Preparation

    The event coming into a Cumulus Task is assumed to be a Cumulus Message and should first be handled by the functions described below before being passed to the task application code.

    Preparation Step 1: Fetch remote event

    Fetch remote event will fetch the full event from S3 if the cumulus message includes a replace key.

    Once "my-large-event.json" is fetched from S3, it's returned from the fetch remote event function. If no "replace" key is present, the event passed to the fetch remote event function is assumed to be a complete Cumulus Message and returned as-is.

    Preparation Step 2: Parse step function config from CMA configuration parameters

    This step determines what current task is being executed. Note this is different from what lambda or activity is being executed, because the same lambda or activity can be used for different tasks. The current task name is used to load the appropriate configuration from the Cumulus Message's 'task_config' configuration parameter.

    Preparation Step 3: Load nested event

    Using the config returned from the previous step, load nested event resolves templates for the final config and input to send to the task's application code.

    Task Application Code

    After message prep, the message passed to the task application code is of the form:

    {
      "input": {},
      "config": {}
    }

    Create Next Message functions

    Whatever comes out of the task application code is used to construct an outgoing Cumulus Message.

    Create Next Message Step 1: Assign outputs

    The config loaded from the Fetch step function config step may have a cumulus_message key. This can be used to "dispatch" fields from the task's application output to a destination in the final event output (via URL templating). Here's an example where the value of input.anykey would be dispatched as the value of payload.out in the final cumulus message:

    {
      "task_config": {
        "bar": "baz",
        "cumulus_message": {
          "input": "{$.payload.input}",
          "outputs": [
            {
              "source": "{$.input.anykey}",
              "destination": "{$.payload.out}"
            }
          ]
        }
      },
      "cumulus_meta": {
        "task": "Example",
        "message_source": "local",
        "id": "id-1234"
      },
      "meta": {
        "foo": "bar"
      },
      "payload": {
        "input": {
          "anykey": "anyvalue"
        }
      }
    }

    Create Next Message Step 2: Store remote event

    If the ReplaceConfiguration parameter is set, the configured key's value will be stored in S3 and the final output of the task will include a replace key that contains configuration for a future step to extract the payload on S3 back into the Cumulus Message. The replace key identifies where the large event node has been stored in S3.

    - + \ No newline at end of file diff --git a/docs/v10.1.0/workflows/developing-a-cumulus-workflow/index.html b/docs/v10.1.0/workflows/developing-a-cumulus-workflow/index.html index b8dd97a7d9e..4f49097d474 100644 --- a/docs/v10.1.0/workflows/developing-a-cumulus-workflow/index.html +++ b/docs/v10.1.0/workflows/developing-a-cumulus-workflow/index.html @@ -5,13 +5,13 @@ Creating a Cumulus Workflow | Cumulus Documentation - +
    Version: v10.1.0

    Creating a Cumulus Workflow

    The Cumulus workflow module

    To facilitate adding workflows to your deployment, Cumulus provides a workflow module.

    In combination with the Cumulus message, the workflow module provides a way to easily turn a Step Function definition into a Cumulus workflow, complete with:

    Using the module also ensures that your workflows will continue to be compatible with future versions of Cumulus.

    For more on the full set of current available options for the module, please consult the module README.

    Adding a new Cumulus workflow to your deployment

    To add a new Cumulus workflow to your deployment that is using the cumulus module, add a new workflow resource to your deployment directory, either in a new .tf file, or to an existing file.

    The workflow should follow a syntax similar to:

    module "my_workflow" {
    source = "https://github.com/nasa/cumulus/releases/download/vx.x.x/terraform-aws-cumulus-workflow.zip"

    prefix = "my-prefix"
    name = "MyWorkflowName"
    system_bucket = "my-internal-bucket"

    workflow_config = module.cumulus.workflow_config

    tags = { Deployment = var.prefix }

    state_machine_definition = <<JSON
    {}
    JSON
    }

    In the above example, you would add your state_machine_definition using the Amazon States Language, using tasks you've developed and Cumulus core tasks that are made available as part of the cumulus terraform module.

    Please note: Cumulus follows the convention of tagging resources with the prefix variable { Deployment = var.prefix } that you pass to the cumulus module. For resources defined outside of Core, it's recommended that you adopt this convention as it makes resources and/or deployment recovery scenarios much easier to manage.

    Examples

    For a functional example of a basic workflow, please take a look at the hello_world_workflow.

    For more complete/advanced examples, please read the following cookbook entries/topics:

    - + \ No newline at end of file diff --git a/docs/v10.1.0/workflows/developing-workflow-tasks/index.html b/docs/v10.1.0/workflows/developing-workflow-tasks/index.html index ea535b461ca..c7d7c03696c 100644 --- a/docs/v10.1.0/workflows/developing-workflow-tasks/index.html +++ b/docs/v10.1.0/workflows/developing-workflow-tasks/index.html @@ -5,13 +5,13 @@ Developing Workflow Tasks | Cumulus Documentation - +
    Version: v10.1.0

    Developing Workflow Tasks

    Workflow tasks can be either AWS Lambda Functions or ECS Activities.

    Lambda functions

    The full set of available core Lambda functions can be found in the deployed cumulus module zipfile at /tasks, as well as reference documentation here. These Lambdas can be referenced in workflows via the outputs from that module (see the cumulus-template-deploy repo for an example).

    The tasks source is located in the Cumulus repository at cumulus/tasks.

    You can also develop your own Lambda function. See the Lambda Functions page to learn more.

    ECS Activities

    ECS activities are supported via the cumulus_ecs_module available from the Cumulus release page.

    Please read the module README for configuration details.

    For assistance in creating a task definition within the module read the AWS Task Definition Docs.

    For a step-by-step example of using the cumulus_ecs_module, please see the related cookbook entry.

    Cumulus Docker Image

    ECS activities require a docker image. Cumulus provides a docker image (source) for node 12.x+ lambdas on dockerhub: cumuluss/cumulus-ecs-task.

    Alternate Docker Images

    Custom docker images/runtimes are supported as are private registries. For details on configuring a private registry/image see the AWS documentation on Private Registry Authentication for Tasks.

    - + \ No newline at end of file diff --git a/docs/v10.1.0/workflows/docker/index.html b/docs/v10.1.0/workflows/docker/index.html index 60d28617eba..357f64bf5dd 100644 --- a/docs/v10.1.0/workflows/docker/index.html +++ b/docs/v10.1.0/workflows/docker/index.html @@ -5,7 +5,7 @@ Dockerizing Data Processing | Cumulus Documentation - + @@ -14,7 +14,7 @@ 2) validate the output (in this case just check for existence) 3) use 'ncatted' to update the resulting file to be CF-compliant 4) write out metadata generated for this file

    Process Testing

    It is important to have tests for data processing; however, in many cases data files can be large, so it is not practical to store the test data in the repository. Instead, test data is currently stored on AWS S3, and can be retrieved using the AWS CLI.

    aws s3 sync s3://cumulus-ghrc-logs/sample-data/collection-name data

    Where collection-name is the name of the data collection, such as 'avaps', or 'cpl'. For example, an abridged version of the data for CPL includes:

    ├── cpl
    │   ├── input
    │   │   ├── HS3_CPL_ATB_12203a_20120906.hdf5
    │   │   ├── HS3_CPL_OP_12203a_20120906.hdf5
    │   └── output
    │       ├── HS3_CPL_ATB_12203a_20120906.nc
    │       ├── HS3_CPL_ATB_12203a_20120906.nc.meta.xml
    │       ├── HS3_CPL_OP_12203a_20120906.nc
    │       ├── HS3_CPL_OP_12203a_20120906.nc.meta.xml

    Contained in the input directory are all possible sets of data files, while the output directory is the expected result of processing. In this case the hdf5 files are converted to NetCDF files and XML metadata files are generated.

    The docker image for a process can be used on the retrieved test data. First create a test-output directory in the newly created data directory.

    mkdir data/test-output

    Then run the docker image using docker-compose.

    docker-compose run test

    This will process the data in the data/input directory and put the output into data/test-output. Repositories also include Python-based tests which will validate this newly created output against the contents of data/output. Use Python's Nose tool to run the included tests.

    nosetests

    If the data/test-output directory validated against the contents of data/output the tests will be successful, otherwise an error will be reported.
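
    Putting these steps together, a typical local test run for the 'cpl' collection might look like the following, reusing the bucket and directory names from the examples above:

     # Fetch sample input and expected output data for the collection
     aws s3 sync s3://cumulus-ghrc-logs/sample-data/cpl data

     # Create a directory for the processing output
     mkdir -p data/test-output

     # Run the dockerized process against data/input
     docker-compose run test

     # Validate data/test-output against the expected data/output
     nosetests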

    - + \ No newline at end of file diff --git a/docs/v10.1.0/workflows/index.html b/docs/v10.1.0/workflows/index.html index 191cb64668d..146417cbff6 100644 --- a/docs/v10.1.0/workflows/index.html +++ b/docs/v10.1.0/workflows/index.html @@ -5,13 +5,13 @@ Workflows | Cumulus Documentation - +
    Version: v10.1.0

    Workflows

    Workflows are composed of one or more AWS Lambda Functions and ECS Activities to discover, ingest, process, manage and archive data.

    Provider data ingest and GIBS have a set of common needs in getting data from a source system and into the cloud where they can be distributed to end users. These common needs are:

    • Data Discovery - Crawling, polling, or detecting changes from a variety of sources.
    • Data Transformation - Taking data files in their original format and extracting and transforming them into another desired format such as visible browse images.
    • Archival - Storage of the files in a location that's accessible to end users.

    The high level view of the architecture and many of the individual steps are the same but the details of ingesting each type of collection differs. Different collection types and different providers have different needs. The individual boxes of a workflow are not only different. The branching, error handling, and multiplicity of the arrows connecting the boxes are also different. Some need visible images rendered from component data files from multiple collections. Some need to contact the CMR with updated metadata. Some will have different retry strategies to handle availability issues with source data systems.

    AWS and other cloud vendors provide an ideal solution for parts of these problems but there needs to be a higher level solution to allow the composition of AWS components into a full featured solution. The Ingest Workflow Architecture is designed to meet the needs for Earth Science data ingest and transformation.

    Goals

    Flexibility and Composability

    The steps to ingest and process data are different for each collection within a provider. Ingest should be as flexible as possible in the rearranging of steps and configuration.

    We want to use lego-like individual steps that can be composed by an operator.

    Individual steps should ...

    • Be as ignorant as possible of the overall flow. They should not be aware of previous steps.
    • Be runnable on their own.
    • Define their input and output in simple data structures.
    • Be domain agnostic.
    • Not make assumptions of specifics of what goes into a granule for example.

    Scalable

    The ingest architecture needs to be scalable both to handle ingesting hundreds of millions of granules and to interpret dozens of different workflows.

    Data Provenance

    • We should have traceability for how data was produced and where it comes from.
    • Use immutable representations of data. Data once received is not overwritten. Data can be removed for cleanup.
    • All software is versioned. We can trace transformation of data by tracking the immutable source data and the versioned software applied to it.

    Operator Visibility and Control

    • Operators should be able to see and understand everything that is happening in the system.
    • It should be obvious why things are happening and straightforward to diagnose problems.
    • We generally assume that the operators know best in terms of the limits on a provider's infrastructure, how often things need to be done, and details of a collection. The architecture should defer to their decisions and knowledge while providing safety nets to prevent problems.

    A Reconfigurable Workflow Architecture

    The Ingest Workflow Architecture is defined by two entity types, Workflows and Tasks. A Workflow is a set of composed Tasks to complete an objective such as ingesting a granule. Tasks are the individual steps of a Workflow that perform one job. The workflow is responsible for executing the right task based on the current state and response from the last task executed. Tasks are completely decoupled in that they don't call each other or even need to know about the presence of other tasks.

    Workflows and tasks are configured as Terraform resources, which are triggered via configured rules within Cumulus.

    Diagram showing the Step Function execution path through workflow tasks for a collection ingest

    See the Example GIBS Ingest Architecture showing how workflows and tasks are used to define the GIBS Ingest Architecture.

    Workflows

    A workflow is a provider-configured set of steps that describe the process to ingest data. Workflows are defined using AWS Step Functions.

    Benefits of AWS Step Functions

AWS Step Functions are described in detail in the AWS documentation, but they provide several benefits which are applicable to this architecture.

    • Prebuilt solution
    • Operations Visibility
      • Visual diagram
      • Every execution is recorded with both inputs and output for every step.
    • Composability
  • Allow composing AWS Lambdas and code running in other steps. Code can be run in EC2 to interface with it or even on premises if desired.
      • Step functions allow specifying when steps run in parallel or choices between steps based on data from the previous step.
    • Flexibility
  • Step functions are designed to make it easy to build new applications and to reconfigure them. We're exposing that flexibility directly to the provider.
    • Reliability and Error Handling
      • Step functions allow configuration of retries and adding handling of error conditions.
    • Described via data
      • This makes it easy to save the step function in configuration management solutions.
      • We can build simple interfaces on top of the flexibility provided.

    Workflow Scheduler

    The scheduler is responsible for initiating a step function and passing in the relevant data for a collection. This is currently configured as an interval for each collection. The scheduler service creates the initial event by combining the collection configuration with the AWS execution context defined via the cumulus terraform module.

    Tasks

    A workflow is composed of tasks. Each task is responsible for performing a discrete step of the ingest process. These can be activities like:

    • Crawling a provider website for new data.
    • Uploading data from a provider to S3.
    • Executing a process to transform data.

AWS Step Functions permit tasks to be code running anywhere, even on premises. We expect most tasks will be written as Lambda functions in order to take advantage of the easy deployment, scalability, and cost benefits provided by AWS Lambda.

    • Leverages Existing Work
      • The design leverages the existing work of Amazon by defining workflows using the AWS Step Function State Language. This is the language that was created for describing the state machines used in AWS Step Functions.
    • Open for Extension
  • Both meta and task_config, which are used for configuration at the collection and task levels, do not dictate the fields and structure of the configuration. Additional task-specific JSON schemas can be used to extend the validation of individual steps.
    • Data-centric Configuration
  • The use of a single JSON configuration file allows this to be added to a workflow. We can build additional support on top of the configuration file for simpler domain-specific configuration or interactive GUIs.

    For more details on Task Messages and Configuration, visit Cumulus configuration and message protocol documentation.

    Ingest Deploy

    To view deployment documentation, please see the Cumulus deployment documentation.

Tradeoffs and Benefits

    This section documents various tradeoffs and benefits of the Ingest Workflow Architecture.

    Tradeoffs

    Workflow execution is handled completely by AWS

This means we can't add our own code into the orchestration of the workflow. We can't add new features not supported by Step Functions. We can't do things like enforce that the responses from tasks always conform to a schema or extract the configuration for a task ahead of its execution.

If we implemented our own orchestration, we'd be able to add all of these. We save significant amounts of development effort and gain all the features of Step Functions for this trade-off. One workaround is to provide a library of common task capabilities. These would optionally be available to tasks that can be implemented with Node.js and are able to include the library.

    Workflow Configuration is specified in AWS Step Function States Language

The current design combines the states language defined by AWS with Ingest-specific configuration. This means our representation has a tight coupling with their standard. If they make backwards-incompatible changes in the future, we will have to deal with existing projects written against that.

We avoid having to develop our own standard and the code to process it. The design can support new features in AWS Step Functions without needing to change the Ingest library code. It is unlikely they will make a backwards-incompatible change at this point. One mitigation, if that were to happen, is writing data transformations to a new format.

    Collection Configuration Flexibility vs Complexity

The Collections Configuration File is very flexible but requires more knowledge of AWS Step Functions to configure. A person modifying this file directly would need to be comfortable editing a JSON file and configuring AWS Step Functions state transitions which address AWS resources.

The configuration file itself is not necessarily meant to be edited by a human directly. Since we are developing a reconfigurable, composable architecture that is specified entirely in data, additional tools can be developed on top of it. The existing recipes.json files can be mapped to this format. Operational tools like a GUI can be built that provide a usable interface for customizing workflows, but it will take time to develop these tools.

    Benefits

    This section describes benefits of the Ingest Workflow Architecture.

    Simplicity

    The concepts of Workflows and Tasks are simple ones that should make sense to providers. Additionally, the implementation will only consist of a few components because the design leverages existing services and capabilities of AWS. The Ingest implementation will only consist of some reusable task code to make task implementation easier, Ingest deployment, and the Workflow Scheduler.

    Composability

The design aims to satisfy the needs of ingest by integrating different workflows for providers. It's flexible in terms of the ability to arrange tasks to meet the needs of a collection. Providers have developed and incorporated open source tools over the years. All of these are easily integrable into the workflows as tasks.

    There is low coupling between task steps. Failures of one component don't bring the whole system down. Individual tasks can be deployed separately.

    Scalability

AWS Step Functions scale up as needed and aren't limited by a set number of servers. They also easily allow you to leverage the inherent scalability of serverless functions.

    Monitoring and Auditing

    • Every execution is captured.
    • Every task run has captured input and outputs.
• CloudWatch Metrics can be used for monitoring many of the events within Step Functions. It can also generate alarms for the whole process.
    • Visual report of the entire configuration.
      • Errors and success states are highlighted visually in the flow.

    Data Provenance

    • Monitoring and auditing ensures we know the data that was given to a task.
    • Workflows are versioned and the state machines stored in AWS Step Functions are immutable. Once created they cannot change.
    • Versioning of data in S3 or using immutable records in S3 will mean we always know what data was created as the result of a step or fed into a step.

    Appendix

    Example GIBS Ingest Architecture

    This shows the GIBS Ingest Architecture as an example of the use of the Ingest Workflow Architecture.

• The GIBS Ingest Architecture consists of two workflows per collection type. There is one for discovery and one for ingest. The final stage of discovery triggers multiple ingest workflows, one for each MRF granule that needs to be generated.
    • It demonstrates both lambdas as tasks and a container used for MRF generation.

    GIBS Ingest Workflows

    Diagram showing the AWS Step Function execution path for a GIBS ingest workflow

    GIBS Ingest Granules Workflow

This shows a visualization of an execution of the ingest granules workflow in Step Functions. The steps highlighted in green are the ones that executed and completed successfully.

    Diagram showing the AWS Step Function execution path for a GIBS ingest granules workflow

    - + \ No newline at end of file diff --git a/docs/v10.1.0/workflows/input_output/index.html b/docs/v10.1.0/workflows/input_output/index.html index aa18cf9b85f..dc86eb87920 100644 --- a/docs/v10.1.0/workflows/input_output/index.html +++ b/docs/v10.1.0/workflows/input_output/index.html @@ -5,14 +5,14 @@ Workflow Inputs & Outputs | Cumulus Documentation - +
    Version: v10.1.0

    Workflow Inputs & Outputs

    General Structure

    Cumulus uses a common format for all inputs and outputs to workflows. The same format is used for input and output from workflow steps. The common format consists of a JSON object which holds all necessary information about the task execution and AWS environment. Tasks return objects identical in format to their input with the exception of a task-specific payload field. Tasks may also augment their execution metadata.
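
For orientation, here is a trimmed-down sketch of that common format, based on the full Cumulus Message example later on this page; every workflow step receives a message shaped roughly like this and returns one like it:

{
  "cumulus_meta": {
    "execution_name": "MyExecution__id-1234"
  },
  "meta": {},
  "exception": {},
  "payload": {
    "anykey": "anyvalue"
  }
}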

    Cumulus Message Adapter

    The Cumulus Message Adapter and Cumulus Message Adapter libraries help task developers integrate their tasks into a Cumulus workflow. These libraries adapt input and outputs from tasks into the Cumulus Message format. The Scheduler service creates the initial event message by combining the collection configuration, external resource configuration, workflow configuration, and deployment environment settings. The subsequent workflow messages between tasks must conform to the message schema. By using the Cumulus Message Adapter, individual task Lambda functions only receive the input and output specifically configured for the task, and not non-task-related message fields.

    The Cumulus Message Adapter libraries are called by the tasks with a callback function containing the business logic of the task as a parameter. They first adapt the incoming message to a format more easily consumable by Cumulus tasks, then invoke the task, and then adapt the task response back to the Cumulus message protocol to be sent to the next task.

    A task's Lambda function can be configured to include a Cumulus Message Adapter library which constructs input/output messages and resolves task configurations. The CMA can then be included in one of several ways:

    Lambda Layer

In order to make use of this configuration, a Lambda layer must be uploaded to your account. Due to platform restrictions, Core cannot currently support sharable public layers; however, you can deploy the appropriate version from the release page in two ways:

    Once you've deployed the layer, integrate the CMA layer with your Lambdas:

    • If using the cumulus module, set the cumulus_message_adapter_lambda_layer_version_arn in your .tfvars file to integrate the CMA layer with all core Cumulus lambdas.
    • If including your own Lambda or ECS task Terraform modules, specify the CMA layer ARN in the Terraform resource definitions. Also, make sure to set the CUMULUS_MESSAGE_ADAPTER_DIR environment variable for the task to /opt for the CMA integration to work properly.

    In the future if you wish to update/change the CMA version you will need to update the deployed CMA, and update the layer configuration for the impacted Lambdas as needed.

    Please Note: Updating/removing a layer does not change a deployed Lambda, so to update the CMA you should deploy a new version of the CMA layer, update the associated Lambda configuration to reference the new CMA version, and re-deploy your Lambdas.

    Manual Addition

You can include the CMA package in the Lambda code in the cumulus-message-adapter sub-directory of your Lambda .zip, for any Lambda runtime that includes a Python runtime. Python 2 is included in Lambda runtimes that use Amazon Linux; however, Amazon Linux 2 will not support this directly.

    Please note: It is expected that upcoming Cumulus releases will update the CMA layer to include a python runtime.

    If you are manually adding the message adapter to your source and utilizing the CMA, you should set the Lambda's CUMULUS_MESSAGE_ADAPTER_DIR environment variable to target the installation path for the CMA.

    CMA Input/Output

Input to the task application code is a JSON object with the following keys:

    • input: By default, the incoming payload is the payload output from the previous task, or it can be a portion of the payload as configured for the task in the corresponding .tf workflow definition file.
    • config: Task-specific configuration object with URL templates resolved.

Output from the task application code is placed in the payload key by default, but the task configuration can also be used to return just a portion of the task output.
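
Putting those together, the event handed to the task application code looks roughly like the sketch below; the contents of input and config depend entirely on the workflow and task configuration, and the values here are taken from the examples further down this page:

{
  "input": {
    "anykey": "anyvalue"
  },
  "config": {
    "provider": {
      "id": "FOO_DAAC"
    },
    "inlinestr": "prefixbarsuffix"
  }
}

Whatever the task returns is then handled by the CMA as described under "5. Resolve task output" below.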

    CMA configuration

    As of Cumulus > 1.15 and CMA > v1.1.1, configuration of the CMA is expected to be driven by AWS Step Function Parameters.

    Using the CMA package with the Lambda by any of the above mentioned methods (Lambda Layers, manual) requires configuration for its various features via a specific Step Function Parameters configuration format (see sample workflows in the examples cumulus-tf source for more examples):

    {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": "{some config}",
    "task_config": "{some config}"
    }
    }

    The "event.$": "$" parameter is required as it passes the entire incoming message to the CMA client library for parsing, and the CMA itself to convert the incoming message into a Cumulus message for use in the function.

    The following are the CMA's current configuration settings:

    ReplaceConfig (Cumulus Remote Message)

Because of the potential size of a Cumulus message, mainly the payload field, a task can be configured to store a portion of its output on S3, leaving in its place an empty JSON object {} and a remote message (replace) key that defines how to retrieve it. If the portion of the message targeted exceeds the configured MaxSize (defaults to 0 bytes), it will be written to S3.

    The CMA remote message functionality can be configured using parameters in several ways:

    Partial Message

    Setting the Path/Target path in the ReplaceConfig parameter (and optionally a non-default MaxSize)

    {
    "DiscoverGranules": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "MaxSize": 1,
    "Path": "$.payload",
    "TargetPath": "$.payload"
    }
    }
    }
    }
    }

will result in any payload output larger than the MaxSize (in bytes) being written to S3. The CMA will then mark that the key has been replaced via a replace key on the event. When the CMA picks up the replace key in future steps, it will attempt to retrieve the output from S3 and write it back to payload.

    Note that you can optionally use a different TargetPath than Path, however as the target is a JSON path there must be a key to target for replacement in the output of that step. Also note that the JSON path specified must target one node, otherwise the CMA will error, as it does not support multiple replacement targets.

    If TargetPath is omitted, it will default to the value for Path.
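
For example, a sketch of the same DiscoverGranules configuration with TargetPath omitted, so that replaced content is written back to $.payload, would be:

{
  "DiscoverGranules": {
    "Parameters": {
      "cma": {
        "event.$": "$",
        "ReplaceConfig": {
          "MaxSize": 1,
          "Path": "$.payload"
        }
      }
    }
  }
}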

    Full Message

    Setting the following parameters for a lambda:

DiscoverGranules:
  Parameters:
    cma:
      event.$: '$'
      ReplaceConfig:
        FullMessage: true

    will result in the CMA assuming the entire inbound message should be stored to S3 if it exceeds the default max size.

    This is effectively the same as doing:

    {
    "DiscoverGranules": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "MaxSize": 0,
    "Path": "$",
    "TargetPath": "$"
    }
    }
    }
    }
    }

    Cumulus Message example

    {
    "task_config": {
    "inlinestr": "prefix{meta.foo}suffix",
    "array": "{[$.meta.foo]}",
    "object": "{$.meta}"
    },
    "cumulus_meta": {
    "message_source": "sfn",
    "state_machine": "arn:aws:states:us-east-1:1234:stateMachine:MySfn",
    "execution_name": "MyExecution__id-1234",
    "id": "id-1234"
    },
    "meta": {
    "foo": "bar"
    },
    "payload": {
    "anykey": "anyvalue"
    }
    }

    Cumulus Remote Message example

    The message may contain a reference to an S3 Bucket, Key and TargetPath as follows:

    {
    "replace": {
    "Bucket": "cumulus-bucket",
    "Key": "my-large-event.json",
    "TargetPath": "$"
    },
    "cumulus_meta": {}
    }

    task_config

This configuration key contains the input/output configuration values for the definition of inputs/outputs via URL paths. Important: These values are all relative to the JSON object configured for event.$.

    This configuration's behavior is outlined in the CMA step description below.

    The configuration should follow the format:

    {
    "FunctionName": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "other_cma_configuration": "<config object>",
    "task_config": "<task config>"
    }
    }
    }
    }

    Example:

    {
    "StepFunction": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "sfnEnd": true,
    "stack": "{$.meta.stack}",
    "bucket": "{$.meta.buckets.internal.name}",
    "stateMachine": "{$.cumulus_meta.state_machine}",
    "executionName": "{$.cumulus_meta.execution_name}",
    "cumulus_message": {
    "input": "{$}"
    }
    }
    }
    }
    }
    }

    Cumulus Message Adapter Steps

    1. Reformat AWS Step Function message into Cumulus Message

    Due to the way AWS handles Parameterized messages, when Parameters are used the CMA takes an inbound message:

    {
    "resource": "arn:aws:lambda:us-east-1:<lambda arn values>",
    "input": {
    "Other Parameter": {},
    "cma": {
    "ConfigKey": {
    "config values": "some config values"
    },
    "event": {
    "cumulus_meta": {},
    "payload": {},
    "meta": {},
    "exception": {}
    }
    }
    }
    }

    and takes the following actions:

    • Takes the object at input.cma.event and makes it the full input
    • Merges all of the keys except event under input.cma into the parent input object

This results in the incoming message (presumably a Cumulus message), with any cma configuration parameters merged in, being passed to the CMA. All other parameterized values defined outside of the cma key are ignored.
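
Applied to the inbound message above, those two actions would produce a message along the lines of the following sketch (the implied result, with ConfigKey merged into the event at the top level):

{
  "ConfigKey": {
    "config values": "some config values"
  },
  "cumulus_meta": {},
  "meta": {},
  "payload": {},
  "exception": {}
}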

    2. Resolve Remote Messages

If the incoming Cumulus message has a replace key value, the CMA will attempt to pull the payload from S3.

For example, if the incoming message contains the following:

      "meta": {
    "foo": {}
    },
    "replace": {
    "TargetPath": "$.meta.foo",
    "Bucket": "some_bucket",
    "Key": "events/some-event-id"
    }

    The CMA will attempt to pull the file stored at Bucket/Key and replace the value at TargetPath, then remove the replace object entirely and continue.
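
For instance, assuming the object stored at s3://some_bucket/events/some-event-id contained {"anykey": "anyvalue"} (a hypothetical value used purely for illustration), the resolved message would contain:

"meta": {
  "foo": {
    "anykey": "anyvalue"
  }
}

with the replace key removed before the message continues to the next step.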

    3. Resolve URL templates in the task configuration

In the workflow configuration (defined under the task_config key), each task has its own configuration, and it can use URL templates as values to achieve simplicity or for values only available at execution time. The Cumulus Message Adapter resolves the URL templates (relative to the event configuration key) and then passes the message to the next task. For example, given a task which has the following configuration:

    {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "provider": "{$.meta.provider}",
    "inlinestr": "prefix{meta.foo}suffix",
    "array": "{[$.meta.foo]}",
    "object": "{$.meta}"
    }
    }
    }
    }

and an incoming message that contains:

    {
    "meta": {
    "foo": "bar",
    "provider": {
    "id": "FOO_DAAC",
    "anykey": "anyvalue"
    }
    }
    }

    The corresponding Cumulus Message would contain:

    "meta": {
    "foo": "bar",
    "provider": {
    "id": "FOO_DAAC",
    "anykey": "anyvalue"
    }
    },
    "task_config": {
    "provider": "{$.meta.provider}",
    "inlinestr": "prefix{meta.foo}suffix",
    "array": "{[$.meta.foo]}",
    "object": "{$.meta}"
    }

    The message sent to the task would be:

    "config" : {
    "provider": {
    "id": "FOO_DAAC",
    "anykey": "anyvalue"
    },
    "inlinestr": "prefixbarsuffix",
    "array": ["bar"],
    "object": {
    "foo": "bar",
    "provider": {
    "id": "FOO_DAAC",
    "anykey": "anyvalue"
    }
    },
    },
    "input": "{...}"

    URL template variables replace dotted paths inside curly brackets with their corresponding value. If the Cumulus Message Adapter cannot resolve a value, it will ignore the template, leaving it verbatim in the string. While seemingly complex, this allows significant decoupling of Tasks from one another and the data that drives them. Tasks are able to easily receive runtime configuration produced by previously run tasks and domain data.
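
As a small sketch of that fallback behavior, if the task configuration above also contained an entry pointing at a path that does not exist in the message, such as the hypothetical "missing": "{$.meta.doesnotexist}", the resolved config would keep the template text verbatim:

"config": {
  "provider": {
    "id": "FOO_DAAC",
    "anykey": "anyvalue"
  },
  "missing": "{$.meta.doesnotexist}"
}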

    4. Resolve task input

By default, the incoming payload is the payload from the previous task. The task can also be configured to use a portion of the payload as its input message. For example, given that a task specifies cma.task_config.cumulus_message.input:

ExampleTask:
  Parameters:
    cma:
      event.$: '$'
      task_config:
        cumulus_message:
          input: '{$.payload.foo}'

    The task configuration in the message would be:

        {
    "task_config": {
    "cumulus_message": {
    "input": "{$.payload.foo}"
    }
    },
    "payload": {
    "foo": {
    "anykey": "anyvalue"
    }
    }
    }

The Cumulus Message Adapter will resolve the task input; instead of sending the whole payload as task input, the task input would be:

        {
    "input" : {
    "anykey": "anyvalue"
    },
    "config": {...}
    }

    5. Resolve task output

By default, the task's return value is the next payload. However, the workflow task configuration can specify a portion of the return value as the next payload, and can also add values to other fields. Based on the task configuration under cma.task_config.cumulus_message.outputs, the Message Adapter uses a task's return value to output a message as configured by the task-specific config defined under cma.task_config. The Message Adapter dispatches a "source" to a "destination" as defined by URL templates stored in the task-specific cumulus_message.outputs. The value at the "source" URL in the task's return value is used to create or replace the value at the "destination" URL. For example, given that a task specifies cumulus_message.outputs in its workflow configuration as follows:

    {
    "ExampleTask": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$}",
    "destination": "{$.payload}"
    },
    {
    "source": "{$.output.anykey}",
    "destination": "{$.meta.baz}"
    }
    ]
    }
    }
    }
    }
    }
    }

    The corresponding Cumulus Message would be:

        {
    "task_config": {
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$}",
    "destination": "{$.payload}"
    },
    {
    "source": "{$.output.anykey}",
    "destination": "{$.meta.baz}"
    }
    ]
    }
    },
    "meta": {
    "foo": "bar"
    },
    "payload": {
    "anykey": "anyvalue"
    }
    }

    Given the response from the task is:

        {
    "output": {
    "anykey": "boo"
    }
    }

    The Cumulus Message Adapter would output the following Cumulus Message:

        {
    "task_config": {
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$}",
    "destination": "{$.payload}"
    },
    {
    "source": "{$.output.anykey}",
    "destination": "{$.meta.baz}"
    }
    ]
    }
    },
    "meta": {
    "foo": "bar",
    "baz": "boo"
    },
    "payload": {
    "output": {
    "anykey": "boo"
    }
    }
    }

    6. Apply Remote Message Configuration

If the ReplaceConfig configuration parameter is defined, the CMA will evaluate the configuration options provided and, if required, write a portion of the Cumulus Message to S3 and add a replace key to the message for future steps to utilize.

Please Note: the non-user-modifiable field cumulus_meta will always be retained, regardless of the configuration.

For example, if the output Cumulus message (after output configuration is applied) looks like:

        {
    "cumulus_meta": {
    "some_key": "some_value"
    },
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$}",
    "destination": "{$.payload}"
    },
    {
    "source": "{$.output.anykey}",
    "destination": "{$.meta.baz}"
    }
    ]
    }
    },
    "meta": {
    "foo": "bar",
    "baz": "boo"
    },
    "payload": {
    "output": {
    "anykey": "boo"
    }
    }
    }

    the resultant output would look like:

    {
    "cumulus_meta": {
    "some_key": "some_value"
    },
    "replace": {
    "TargetPath": "$",
    "Bucket": "some-internal-bucket",
    "Key": "events/some-event-id"
    }
    }

    Additional features

    Validate task input, output and configuration messages against the schemas provided

    The Cumulus Message Adapter has the capability to validate task input, output and configuration messages against their schemas. The default location of the schemas is the schemas folder in the top level of the task and the default filenames are input.json, output.json, and config.json. The task can also configure a different schema location. If no schema can be found, the Cumulus Message Adapter will not validate the messages.
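
As a minimal, hypothetical illustration (the property names here are invented and are not the schema of any actual Cumulus task), a task's schemas/input.json could look like:

{
  "title": "ExampleTaskInput",
  "description": "Hypothetical input schema, for illustration only",
  "type": "object",
  "required": ["granules"],
  "properties": {
    "granules": {
      "type": "array",
      "items": {
        "type": "object"
      }
    }
  }
}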

    - + \ No newline at end of file diff --git a/docs/v10.1.0/workflows/lambda/index.html b/docs/v10.1.0/workflows/lambda/index.html index a17fcb509a4..c67521354a0 100644 --- a/docs/v10.1.0/workflows/lambda/index.html +++ b/docs/v10.1.0/workflows/lambda/index.html @@ -5,13 +5,13 @@ Develop Lambda Functions | Cumulus Documentation - +
    Version: v10.1.0

    Develop Lambda Functions

    Develop a new Cumulus Lambda

AWS provides a great getting started guide for building Lambdas in its developer guide.

    Cumulus currently supports the following environments for Cumulus Message Adapter enabled functions:

Additionally, you may choose to include any of the other languages AWS supports as a resource, with reduced feature support.

    Deploy a Lambda

    Node.js Lambda

For a new Node.js Lambda, create a new function and add an aws_lambda_function resource to your Cumulus deployment (for examples, see example/lambdas.tf and ingest/lambda-functions.tf in the source), either as a new .tf file or within an existing .tf file:

    resource "aws_lambda_function" "myfunction" {
    function_name = "${var.prefix}-function"
    filename = "/path/to/zip/lambda.zip"
    source_code_hash = filebase64sha256("/path/to/zip/lambda.zip")
    handler = "index.handler"
    role = module.cumulus.lambda_processing_role_arn
    runtime = "nodejs10.x"

    vpc_config {
    subnet_ids = var.subnet_ids
    security_group_ids = var.security_group_ids
    }
    }

    Please note: This example contains the minimum set of required configuration.

Make sure to include a vpc_config that matches the information you've provided to the cumulus module if you intend to integrate the Lambda with a Cumulus deployment.

    Java Lambda

    Java Lambdas are created in much the same way as the Node.js example above.

    The source points to a folder with the compiled .class files and dependency libraries in the Lambda Java zip folder structure (details here), not an uber-jar.

    The deploy folder referenced here would contain a folder 'test_task/task/' which contains Task.class and TaskLogic.class as well as a lib folder containing dependency jars.

    Python Lambda

    Python Lambdas are created the same way as the Node.js example above.

    Cumulus Message Adapter

For Lambdas wishing to utilize the Cumulus Message Adapter (CMA), you should define a layers key on your Lambda resource with the CMA you wish to include. See the input_output docs for more on how to create/use the CMA.

    Other Lambda Options

    Cumulus supports all of the options available to you via the aws_lambda_function Terraform resource. For more information on what's available, check out the Terraform resource docs.

    Cloudwatch log groups

If you want to enable Cloudwatch logging for your Lambda resource, you'll need to add an aws_cloudwatch_log_group resource to your Lambda definition:

    resource "aws_cloudwatch_log_group" "myfunction_log_group" {
    name = "/aws/lambda/${aws_lambda_function.myfunction.function_name}"
    retention_in_days = 30
    tags = { Deployment = var.prefix }
    }
    - + \ No newline at end of file diff --git a/docs/v10.1.0/workflows/protocol/index.html b/docs/v10.1.0/workflows/protocol/index.html index d192cade3d4..3251bdd098d 100644 --- a/docs/v10.1.0/workflows/protocol/index.html +++ b/docs/v10.1.0/workflows/protocol/index.html @@ -5,13 +5,13 @@ Workflow Protocol | Cumulus Documentation - +
    Version: v10.1.0

    Workflow Protocol

    Configuration and Message Use Diagram

    A diagram showing at which point in a workflow the Cumulus message is checked for conformity with the message schema and where the configuration is checked for conformity with the configuration schema

    • Configuration - The Cumulus workflow configuration defines everything needed to describe an instance of Cumulus.
    • Scheduler - This starts ingest of a collection on configured intervals.
    • Input to Step Functions - The Scheduler uses the Configuration as source data to construct the input to the Workflow.
    • AWS Step Functions - Run the workflows as kicked off by the scheduler or other processes.
    • Input to Task - The input for each task is a JSON document that conforms to the message schema.
    • Output from Task - The output of each task must conform to the message schemas as well and is used as the input for the subsequent task.
    - + \ No newline at end of file diff --git a/docs/v10.1.0/workflows/workflow-configuration-how-to/index.html b/docs/v10.1.0/workflows/workflow-configuration-how-to/index.html index 1b24728fc6b..4ca88f9354d 100644 --- a/docs/v10.1.0/workflows/workflow-configuration-how-to/index.html +++ b/docs/v10.1.0/workflows/workflow-configuration-how-to/index.html @@ -5,7 +5,7 @@ Workflow Configuration How To's | Cumulus Documentation - + @@ -24,7 +24,7 @@ To take a subset of any given metadata, use the option substring.

    "url_path": "{cmrMetadata.Granule.Collection.ShortName}/{substring(file.fileName, 0, 3)}"

    This example will populate to "MOD09GQ/MOD"

    In addition to substring, several datetime-specific functions are available, which can parse a datetime string in the metadata and extract a certain part of it:

    "url_path": "{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}"

    or

     "url_path": "{dateFormat(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime, YYYY-MM-DD[T]HH[:]mm[:]ss)}"

    The following functions are implemented:

    • extractYear - returns the year, formatted as YYYY
    • extractMonth - returns the month, formatted as MM
    • extractDate - returns the day of the month, formatted as DD
    • extractHour - returns the hour in 24-hour format, with no leading zero
    • dateFormat - takes a second argument describing how to format the date, and passes the metadata date string and the format argument to moment().format()

    Note: the move-granules step needs to be in the workflow for this template to be populated and the file moved. This cmrMetadata or CMR granule XML needs to have been generated and stored on S3. From there any field could be retrieved and used for a url_path.

    Adding Metadata dates and times to the URL Path

    There are a number of options to pull dates from the CMR file metadata. With this metadata:

    <Granule>
    <Temporal>
    <RangeDateTime>
    <BeginningDateTime>2003-02-19T00:00:00Z</BeginningDateTime>
    <EndingDateTime>2003-02-19T23:59:59Z</EndingDateTime>
    </RangeDateTime>
    </Temporal>
    </Granule>

    The following examples of url_path could be used.

    {extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the year from the full date: 2003.

    {extractMonth(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the month: 2.

    {extractDate(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the day: 19.

    {extractHour(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the hour: 0.

    Different values can be combined to create the url_path. For example

    {
    "bucket": "sample-protected-bucket",
    "name": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}/{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)/extractDate(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}"
    }

    The final file location for the above would be s3://sample-protected-bucket/MOD09GQ/2003/19/MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.

    - + \ No newline at end of file diff --git a/docs/v10.1.0/workflows/workflow-triggers/index.html b/docs/v10.1.0/workflows/workflow-triggers/index.html index 88243c2f887..653ead557d2 100644 --- a/docs/v10.1.0/workflows/workflow-triggers/index.html +++ b/docs/v10.1.0/workflows/workflow-triggers/index.html @@ -5,13 +5,13 @@ Workflow Triggers | Cumulus Documentation - +
    Version: v10.1.0

    Workflow Triggers

    For a workflow to run, it needs to be associated with a rule (see rule configuration). The rule configuration determines how and when a workflow execution is triggered. Rules can be triggered one time, on a schedule, or by new data written to a kinesis stream.
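
As a rough sketch of what a rule record can look like (field names follow the rule configuration referenced above; all values below are hypothetical), a scheduled rule might be defined as:

{
  "name": "my_scheduled_ingest_rule",
  "workflow": "IngestGranuleWorkflow",
  "provider": "my_provider",
  "collection": {
    "name": "MOD09GQ",
    "version": "006"
  },
  "rule": {
    "type": "scheduled",
    "value": "rate(1 hour)"
  },
  "state": "ENABLED"
}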

    There are three lambda functions in the API package responsible for scheduling and starting workflows: SF scheduler, message consumer, and SF starter. Each Cumulus instance comes with a Start SF SQS queue.

The SF scheduler lambda puts a message onto the Start SF queue. This message is picked up by the Start SF lambda, and an execution is started with the body of the message as the input.

When a one-time rule is created, the schedule SF lambda is triggered. Rules that are not one-time are associated with a CloudWatch event, which will manage the triggering of the lambdas that trigger the workflows.

    For a scheduled rule, the Cloudwatch event is triggered on the given schedule which calls directly to the schedule SF lambda.

    For a kinesis rule, when data is added to the kinesis stream, the Cloudwatch event is triggered, which calls the message consumer lambda. The message consumer lambda parses the kinesis message and finds all of the rules associated with that message. For each rule (which corresponds to one workflow), the schedule SF lambda is triggered to queue a message to start the workflow.

    For an sns rule, when a message is published to the SNS topic, the message consumer receives the SNS message (JSON expected), parses it into an object, starts a new execution of the workflow associated with the rule and passes the object in the payload field of the Cumulus message.

    Diagram showing how workflows are scheduled via rules

    - + \ No newline at end of file diff --git a/docs/v11.0.0/adding-a-task/index.html b/docs/v11.0.0/adding-a-task/index.html index d6d98c36788..a136b8809b7 100644 --- a/docs/v11.0.0/adding-a-task/index.html +++ b/docs/v11.0.0/adding-a-task/index.html @@ -5,13 +5,13 @@ Contributing a Task | Cumulus Documentation - +
    Version: v11.0.0

    Contributing a Task

    We're tracking reusable Cumulus tasks in this list and, if you've got one you'd like to share with others, you can add it!

    Right now we're focused on tasks distributed via npm, but are open to including others. For now the script that pulls all the data for each package only supports npm.

    The tasks.md file is generated in the build process

    The tasks list in docs/tasks.md is generated from the list of task package names from the tasks folder.

    Do not edit the docs/tasks.md file directly.

    - + \ No newline at end of file diff --git a/docs/v11.0.0/api/index.html b/docs/v11.0.0/api/index.html index 49838a76894..6cf322c3eee 100644 --- a/docs/v11.0.0/api/index.html +++ b/docs/v11.0.0/api/index.html @@ -5,13 +5,13 @@ Cumulus API | Cumulus Documentation - + - + \ No newline at end of file diff --git a/docs/v11.0.0/architecture/index.html b/docs/v11.0.0/architecture/index.html index 207a0ab5a04..33d639b106d 100644 --- a/docs/v11.0.0/architecture/index.html +++ b/docs/v11.0.0/architecture/index.html @@ -5,14 +5,14 @@ Architecture | Cumulus Documentation - +
    Version: v11.0.0

    Architecture


    Below, find a diagram with the components that comprise an instance of Cumulus.

    Architecture diagram of a Cumulus deployment

    This diagram details all of the major architectural components of a Cumulus deployment.

While the diagram can feel complex, it can easily be digested by breaking it down into several major components:

    Data Distribution

End Users can access data via Cumulus's distribution submodule, which includes ASF's thin egress application; this provides authenticated data egress, temporary S3 links, and other statistics features.

    End user exposure of Cumulus's holdings is expected to be provided by an external service.

    For NASA use, this is assumed to be CMR in this diagram.

    Data ingest

    Workflows

The core of the ingest and processing capabilities in Cumulus is built into the deployed AWS Step Function workflows. Cumulus rules trigger workflows via either CloudWatch rules, Kinesis streams, SNS topics, or SQS queues. The workflows then run with a configured Cumulus message, utilizing built-in processes to report the status of granules, PDRs, executions, etc. to the Data Persistence components.

    Workflows can optionally report granule metadata to CMR, and workflow steps can report metrics information to a shared SNS topic, which could be subscribed to for near real time granule, execution, and PDR status. This could be used for metrics reporting using an external ELK stack, for example.

    Data persistence

Cumulus entity state data is stored in a set of PostgreSQL-compatible database tables and is exported to an Elasticsearch instance for non-authoritative querying/state data for the API and other applications that require more complex queries. Currently the entity state data is replicated in DynamoDB, and this will be removed in a future release.

    Data discovery

    Discovering data for ingest is handled via workflow step components using Cumulus provider and collection configurations and various triggers. Data can be ingested from AWS S3, FTP, HTTPS and more.

    Database

    Cumulus utilizes a user-provided PostgreSQL database backend. For improved API search query efficiency Cumulus provides data replication to an Elasticsearch instance. For legacy reasons, Cumulus is currently also deploying a DynamoDB datastore, and writes are replicated in parallel with the PostgreSQL database writes. The DynamoDB replicated tables and parallel writes will be removed in future releases.

    PostgreSQL Database Schema Diagram

    ERD of the Cumulus Database

    Maintenance

    System maintenance personnel have access to manage ingest and various portions of Cumulus via an AWS API gateway, as well as the operator dashboard.

    Deployment Structure

    Cumulus is deployed via Terraform and is organized internally into two separate top-level modules, as well as several external modules.

    Cumulus

    The Cumulus module, which contains multiple internal submodules, deploys all of the Cumulus components that are not part of the Data Persistence portion of this diagram.

    Data persistence

    The data persistence module provides the Data Persistence portion of the diagram.

    Other modules

Other modules are provided as artifacts on the release page for use by users configuring their own deployment; they contain extracted subcomponents of the cumulus module. For more on these components, see the components documentation.

For more on the specific structure, examples of use, how to deploy, and more, please see the deployment docs as well as the cumulus-template-deploy repo.

    - + \ No newline at end of file diff --git a/docs/v11.0.0/configuration/cloudwatch-retention/index.html b/docs/v11.0.0/configuration/cloudwatch-retention/index.html index 70c2a96c62d..e2568b00a5b 100644 --- a/docs/v11.0.0/configuration/cloudwatch-retention/index.html +++ b/docs/v11.0.0/configuration/cloudwatch-retention/index.html @@ -5,13 +5,13 @@ Cloudwatch Retention | Cumulus Documentation - +
    Version: v11.0.0

    Cloudwatch Retention

    Our lambdas dump logs to AWS CloudWatch. By default, these logs exist indefinitely. However, there are ways to specify a duration for log retention.

    aws-cli

    In addition to getting your aws-cli set-up, there are two values you'll need to acquire.

1. log-group-name: the name of the log group whose retention policy (retention time) you'd like to change. We'll use /aws/lambda/KinesisInboundLogger in our examples.
    2. retention-in-days: the number of days you'd like to retain the logs in the specified log group for. There is a list of possible values available in the aws logs documentation.

    For example, if we wanted to set log retention to 30 days on our KinesisInboundLogger lambda, we would write:

    aws logs put-retention-policy --log-group-name "/aws/lambda/KinesisInboundLogger" --retention-in-days 30

    Note: The aws-cli log command that we're using is explained in detail here.

    AWS Management Console

    Changing the log retention policy in the AWS Management Console is a fairly simple process:

    1. Navigate to the CloudWatch service in the AWS Management Console.
    2. Click on the Logs entry on the sidebar.
3. Find the Log Group whose retention policy you're interested in changing.
    4. Click on the value in the Expire Events After column.
    5. Enter/Select the number of days you'd like to retain logs in that log group for.

    Screenshot of AWS console showing how to configure the retention period for Cloudwatch logs

    - + \ No newline at end of file diff --git a/docs/v11.0.0/configuration/collection-storage-best-practices/index.html b/docs/v11.0.0/configuration/collection-storage-best-practices/index.html index 471fbcbe704..8ae5d543a6b 100644 --- a/docs/v11.0.0/configuration/collection-storage-best-practices/index.html +++ b/docs/v11.0.0/configuration/collection-storage-best-practices/index.html @@ -5,13 +5,13 @@ Collection Cost Tracking and Storage Best Practices | Cumulus Documentation - +
    Version: v11.0.0

    Collection Cost Tracking and Storage Best Practices

    Organizing your data is important for metrics you may want to collect. AWS S3 storage and cost metrics are calculated at the bucket level, so it is easy to get metrics by bucket. You can get storage metrics at the key prefix level, but that is done through the CLI, which can be very slow for large buckets. It is very difficult to estimate costs at the prefix level.

    Calculating Storage By Collection

    By bucket

    Usage by bucket can be obtained in your AWS Billing Dashboard via an S3 Usage Report. You can download your usage report for a period of time and review your storage and requests at the bucket level.

    Bucket metrics can also be found in the AWS CloudWatch Metrics Console (also see Using Amazon CloudWatch Metrics).

    Navigate to Storage Metrics and select the BucketName for all buckets you are interested in. The available metrics are BucketSizeInBytes and NumberOfObjects.

    In the Graphed metrics tab, you can select the type of statistic (i.e. average, minimum, maximum) and the period for the stats. At the top, it's useful to select from the dropdown to view the metrics as a number. You can also select the time period for which you want to see stats.

    Alternatively you can query CloudWatch using the CLI.

    This command will return the average number of bytes in the bucket test-bucket for 7/31/2019:

    aws cloudwatch get-metric-statistics --namespace AWS/S3 --start-time 2019-07-31T00:00:00 --end-time 2019-08-01T00:00:00 --period 86400 --statistics Average --region us-east-1 --metric-name BucketSizeBytes --dimensions Name=BucketName,Value=test-bucket Name=StorageType,Value=StandardStorage

    The result looks like:

    {
    "Datapoints": [
    {
    "Timestamp": "2019-07-31T00:00:00Z",
    "Average": 150996467959.0,
    "Unit": "Bytes"
    }
    ],
    "Label": "BucketSizeBytes"
    }

    By key prefix

    AWS does not offer storage and usage statistics at a key prefix level. Via the AWS CLI, you can get the total storage for a bucket or folder. The following command would get the storage for folder example-folder in bucket sample-bucket:

    aws s3 ls --summarize --human-readable --recursive s3://sample-bucket/example-folder | grep 'Total'

    Note that this can be a long-running operation for large buckets.

    Calculating Cost By Collection

    NASA NGAP Environment

    If using an NGAP account, the cost per bucket can be found in your CloudTamer console, in the Financials section of your account information. This is calculated on a monthly basis.

    There is no easy way to get the cost by folder in the buckets. You could calculate an estimate using the storage per prefix vs. the storage of the bucket.

    Outside of NGAP

You can enable S3 Cost Allocation Tags and tag your buckets. From there, you can view the cost breakdown in your AWS Billing Dashboard via the Cost Explorer. Cost Allocation Tagging is available at the bucket level.

    There is no easy way to get the cost by folder in the buckets. You could calculate an estimate using the storage per prefix vs. the storage of the bucket.

    Storage Configuration

    Cumulus allows for the configuration of many buckets for your files. Buckets are created and added to your deployment as part of the deployment process.

    In your Cumulus collection configuration, you specify where you want the files to be stored post-processing. This is done by matching a regular expression on the file with the configured bucket.

    Note that in the collection configuration, the bucket field is the key to the buckets variable in the deployment's .tfvars file.

    Organizing By Bucket

    You can specify separate groups of buckets for each collection, which could look like the example below.

    {
    "name": "MOD09GQ",
    "version": "006",
    "granuleId": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
    "files": [
    {
    "bucket": "MOD09GQ-006-protected",
    "regex": "^.*\\.hdf$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf"
    },
    {
    "bucket": "MOD09GQ-006-private",
    "regex": "^.*\\.hdf\\.met$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.met"
    },
    {
    "bucket": "MOD09GQ-006-protected",
    "regex": "^.*\\.cmr\\.xml$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.cmr.xml"
    },
    {
    "bucket": "MOD09GQ-006-public",
    "regex": "^*\\.jpg$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_ndvi.jpg"
    }
    ]
    }

    Additional collections would go to different buckets.

    Organizing by Key Prefix

    Different collections can be organized into different folders in the same bucket, using the key prefix, which is specified as the url_path in the collection configuration. In this simplified collection configuration example, the url_path field is set at the top level so that all files go to a path prefixed with the collection name and version.

    {
    "name": "MOD09GQ",
    "version": "006",
    "granuleId": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
    "files": [
    {
    "bucket": "protected",
    "regex": "^.*\\.hdf$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf"
    },
    {
    "bucket": "private",
    "regex": "^.*\\.hdf\\.met$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.met"
    },
    {
    "bucket": "protected",
    "regex": "^.*\\.cmr\\.xml$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.cmr.xml"
    },
    {
    "bucket": "public",
    "regex": "^*\\.jpg$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_ndvi.jpg"
    }
    ]
    }

    In this case, the path to all the files would be: MOD09GQ___006/<filename> in their respective buckets.

The url_path can be overridden directly on the file configuration. The example below produces the same result.

    {
    "name": "MOD09GQ",
    "version": "006",
    "granuleId": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
    "files": [
    {
    "bucket": "protected",
    "regex": "^.*\\.hdf$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
    },
    {
    "bucket": "private",
    "regex": "^.*\\.hdf\\.met$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.met",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
    },
    {
    "bucket": "protected-2",
    "regex": "^.*\\.cmr\\.xml$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.cmr.xml",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
    },
    {
    "bucket": "public",
    "regex": "^*\\.jpg$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_ndvi.jpg",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
    }
    ]
    }
    - + \ No newline at end of file diff --git a/docs/v11.0.0/configuration/data-management-types/index.html b/docs/v11.0.0/configuration/data-management-types/index.html index ad886b6fd75..c5e0ad1b2e1 100644 --- a/docs/v11.0.0/configuration/data-management-types/index.html +++ b/docs/v11.0.0/configuration/data-management-types/index.html @@ -5,13 +5,13 @@ Cumulus Data Management Types | Cumulus Documentation - +
    Version: v11.0.0

    Cumulus Data Management Types

    What Are The Cumulus Data Management Types

    • Collections: Collections are logical sets of data objects of the same data type and version. They provide contextual information used by Cumulus ingest.
    • Granules: Granules are the smallest aggregation of data that can be independently managed. They are always associated with a collection, which is a grouping of granules.
    • Providers: Providers generate and distribute input data that Cumulus obtains and sends to workflows.
    • Rules: Rules tell Cumulus how to associate providers and collections and when/how to start processing a workflow.
    • Workflows: Workflows are composed of one or more AWS Lambda Functions and ECS Activities to discover, ingest, process, manage, and archive data.
    • Executions: Executions are records of a workflow.
• Reconciliation Reports: Reports are a comparison of data sets to check whether they are in agreement and to help Cumulus users detect conflicts.

    Interaction

    • Providers tell Cumulus where to get new data - i.e. S3, HTTPS
    • Collections tell Cumulus where to store the data files
    • Rules tell Cumulus when to trigger a workflow execution and tie providers and collections together

    Managing Data Management Types

    The following are created via the dashboard or API:

    • Providers
    • Collections
    • Rules
    • Reconciliation reports

    Granules are created by workflow executions and then can be managed via the dashboard or API.

    An execution record is created for each workflow execution triggered and can be viewed in the dashboard or data can be retrieved via the API.

    Workflows are created and managed via the Cumulus deployment.

    Configuration Fields

    Schemas

Looking at our API schema definitions can provide us with some insight into collections, providers, rules, and their attributes (and whether those are required or not). The schemas for the different concepts will be referenced throughout this document.

    The schemas are extremely useful for understanding which attributes are configurable and which of those are required. Cumulus uses these schemas for validation.

    Providers

    Please note:

• While connection configuration is defined here, settings that are specific to a particular ingest setup (e.g. 'What target directory should we be pulling from?' or 'How is duplicate handling configured?') are generally defined in a Rule or Collection, not the Provider.
• There is some provider behavior which is controlled by task-specific configuration and not the provider definition. This configuration has to be set on a per-workflow basis; for example, see the httpListTimeout configuration on the discover-granules task (a configuration sketch follows this list).
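
A hedged sketch of that kind of per-workflow, task-specific configuration, using the Step Function Parameters/cma/task_config pattern from the workflow input/output documentation (the timeout value here is purely illustrative):

{
  "DiscoverGranules": {
    "Parameters": {
      "cma": {
        "event.$": "$",
        "task_config": {
          "httpListTimeout": 300
        }
      }
    }
  }
}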

    Provider Configuration

    The Provider configuration is defined by a JSON object that takes different configuration keys depending on the provider type. The following are definitions of typical configuration values relevant for the various providers:
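
For example, a minimal S3 provider record (all values hypothetical) might look like:

{
  "id": "MY_S3_PROVIDER",
  "protocol": "s3",
  "host": "my-provider-bucket",
  "globalConnectionLimit": 10
}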

    Configuration by provider type
    S3
Key | Type | Required | Description
id | string | Yes | Unique identifier for the provider
globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited
protocol | string | Yes | The protocol for this provider. Must be s3 for this provider type.
host | string | Yes | S3 Bucket to pull data from
http
Key | Type | Required | Description
id | string | Yes | Unique identifier for the provider
globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited
protocol | string | Yes | The protocol for this provider. Must be http for this provider type
host | string | Yes | The host to pull data from (e.g. nasa.gov)
username | string | No | Configured username for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication
password | string | Only if username is specified | Configured password for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication
port | integer | No | Port to connect to the provider on. Defaults to 80
allowedRedirects | string[] | No | Only hosts in this list will have the provider username/password forwarded for authentication. Entries should be specified as host.com or host.com:7000 if redirect port is different than the provider port.
certificateUri | string | No | SSL Certificate S3 URI for custom or self-signed SSL (TLS) certificate
https
Key | Type | Required | Description
id | string | Yes | Unique identifier for the provider
globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited
protocol | string | Yes | The protocol for this provider. Must be https for this provider type
host | string | Yes | The host to pull data from (e.g. nasa.gov)
username | string | No | Configured username for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication
password | string | Only if username is specified | Configured password for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication
port | integer | No | Port to connect to the provider on. Defaults to 443
allowedRedirects | string[] | No | Only hosts in this list will have the provider username/password forwarded for authentication. Entries should be specified as host.com or host.com:7000 if redirect port is different than the provider port.
certificateUri | string | No | SSL Certificate S3 URI for custom or self-signed SSL (TLS) certificate
    ftp
| Key | Type | Required | Description |
| --- | --- | --- | --- |
| id | string | Yes | Unique identifier for the provider |
| globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited |
| protocol | string | Yes | The protocol for this provider. Must be ftp for this provider type |
| host | string | Yes | The ftp host to pull data from (e.g. nasa.gov) |
| username | string | No | Username to use to connect to the ftp server. Cumulus encrypts this using KMS. Defaults to anonymous if not defined |
| password | string | No | Password to use to connect to the ftp server. Cumulus encrypts this using KMS. Defaults to password if not defined |
| port | integer | No | Port to connect to the provider on. Defaults to 21 |
    sftp
| Key | Type | Required | Description |
| --- | --- | --- | --- |
| id | string | Yes | Unique identifier for the provider |
| globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited |
| protocol | string | Yes | The protocol for this provider. Must be sftp for this provider type |
| host | string | Yes | The sftp host to pull data from (e.g. nasa.gov) |
| username | string | No | Username to use to connect to the sftp server. |
| password | string | No | Password to use to connect to the sftp server. |
| port | integer | No | Port to connect to the provider on. Defaults to 22 |
| privateKey | string | No | filename assumed to be in s3://bucketInternal/stackName/crypto |
| cmKeyId | string | No | AWS KMS Customer Master Key arn or alias |
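
For reference, here is a minimal sketch of what a provider definition can look like for the s3 and https provider types. The id, host, bucket, credential, and redirect values below are placeholders for illustration, not values from a real deployment:

{
  "id": "MY_S3_PROVIDER",
  "protocol": "s3",
  "host": "my-source-data-bucket",
  "globalConnectionLimit": 10
}

and, for an https provider with basic authentication:

{
  "id": "MY_HTTPS_PROVIDER",
  "protocol": "https",
  "host": "data.example.gov",
  "port": 443,
  "username": "myUsername",
  "password": "myPassword",
  "allowedRedirects": ["data.example.gov:7000"]
}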

    Collections

Breakdown of [s3_MOD09GQ_006.json](https://github.com/nasa/cumulus/blob/master/example/data/collections/s3_MOD09GQ_006/s3_MOD09GQ_006.json):
| Key | Value | Required | Description |
| --- | --- | --- | --- |
| name | "MOD09GQ" | Yes | The name attribute designates the name of the collection. This is the name under which the collection will be displayed on the dashboard |
| version | "006" | Yes | A version tag for the collection |
| granuleId | "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$" | Yes | The regular expression used to validate the granule ID extracted from filenames according to the granuleIdExtraction |
| granuleIdExtraction | "(MOD09GQ\..*)(\.hdf\|\.cmr\|_ndvi\.jpg)" | Yes | The regular expression used to extract the granule ID from filenames. The first capturing group extracted from the filename by the regex will be used as the granule ID. |
| sampleFileName | "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf" | Yes | An example filename belonging to this collection |
| files | <JSON Object> of files defined here | Yes | Describe the individual files that will exist for each granule in this collection (size, browse, meta, etc.) |
| dataType | "MOD09GQ" | No | Can be specified, but this value will default to the collection_name if not |
| duplicateHandling | "replace" | No | ("replace" \| "version" \| "skip") determines granule duplicate handling scheme |
| ignoreFilesConfigForDiscovery | false (default) | No | By default, during discovery only files that match one of the regular expressions in this collection's files attribute (see above) are ingested. Setting this to true will ignore the files attribute during discovery, meaning that all files for a granule (i.e., all files with filenames matching granuleIdExtraction) will be ingested even when they don't match a regular expression in the files attribute at discovery time. (NOTE: this attribute does not appear in the example file, but is listed here for completeness.) |
| process | "modis" | No | Example options for this are found in the ChooseProcess step definition in the IngestAndPublish workflow definition |
| meta | <JSON Object> of MetaData for the collection | No | MetaData for the collection. This metadata will be available to workflows for this collection via the Cumulus Message Adapter. |
| url_path | "{cmrMetadata.Granule.Collection.ShortName}/{substring(file.fileName, 0, 3)}" | No | Filename without extension |

    files-object

| Key | Value | Required | Description |
| --- | --- | --- | --- |
| regex | "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\.hdf$" | Yes | Regular expression used to identify the file |
| sampleFileName | "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf" | Yes | Filename used to validate the provided regex |
| type | "data" | No | Value to be assigned to the Granule File Type. CNM types are used by Cumulus CMR steps, non-CNM values will be treated as 'data' type. Currently only utilized in DiscoverGranules task |
| bucket | "internal" | Yes | Name of the bucket where the file will be stored |
| url_path | "${collectionShortName}/{substring(file.fileName, 0, 3)}" | No | Folder used to save the granule in the bucket. Defaults to the collection url_path |
| checksumFor | "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\.hdf$" | No | If this is a checksum file, set checksumFor to the regex of the target file. |
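
Putting the two tables above together, an abridged collection definition built from these example values would look roughly like the following. This is a sketch for illustration only; see the linked s3_MOD09GQ_006.json for the complete, authoritative example:

{
  "name": "MOD09GQ",
  "version": "006",
  "dataType": "MOD09GQ",
  "process": "modis",
  "duplicateHandling": "replace",
  "granuleId": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$",
  "granuleIdExtraction": "(MOD09GQ\\..*)(\\.hdf|\\.cmr|_ndvi\\.jpg)",
  "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
  "url_path": "{cmrMetadata.Granule.Collection.ShortName}/{substring(file.fileName, 0, 3)}",
  "files": [
    {
      "regex": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\.hdf$",
      "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
      "bucket": "internal",
      "type": "data"
    }
  ]
}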

    Rules

Rules are used to start processing workflows and the transformation process. Rules can be invoked manually, based on a schedule, or can be configured to be triggered by events in Kinesis, SNS messages, or SQS messages.

    Rule configuration
| Key | Value | Required | Description |
| --- | --- | --- | --- |
| name | "L2_HR_PIXC_kinesisRule" | Yes | Name of the rule. This is the name under which the rule will be listed on the dashboard |
| workflow | "CNMExampleWorkflow" | Yes | Name of the workflow to be run. A list of available workflows can be found on the Workflows page |
| provider | "PODAAC_SWOT" | No | Configured provider's ID. This can be found on the Providers dashboard page |
| collection | <JSON Object> collection object shown below | Yes | Name and version of the collection this rule will moderate. Relates to a collection configured and found in the Collections page |
| payload | <JSON Object or Array> | No | The payload to be passed to the workflow |
| meta | <JSON Object> of MetaData for the rule | No | MetaData for the rule. This metadata will be available to workflows for this rule via the Cumulus Message Adapter. |
| rule | <JSON Object> rule type and associated values - discussed below | Yes | Object defining the type and subsequent attributes of the rule |
| state | "ENABLED" | No | ("ENABLED" \| "DISABLED") whether or not the rule will be active. Defaults to "ENABLED". |
| queueUrl | https://sqs.us-east-1.amazonaws.com/1234567890/queue-name | No | URL for SQS queue that will be used to schedule workflows for this rule |
| tags | ["kinesis", "podaac"] | No | An array of strings that can be used to simplify search |

    collection-object

| Key | Value | Required | Description |
| --- | --- | --- | --- |
| name | "L2_HR_PIXC" | Yes | Name of a collection defined/configured in the Collections dashboard page |
| version | "000" | Yes | Version number of a collection defined/configured in the Collections dashboard page |

    meta-object

| Key | Value | Required | Description |
| --- | --- | --- | --- |
| retries | 3 | No | Number of retries on errors, for sqs-type rule only. Defaults to 3. |
| visibilityTimeout | 900 | No | VisibilityTimeout in seconds for the inflight messages, for sqs-type rule only. Defaults to the visibility timeout of the SQS queue when the rule is created. |

    rule-object

| Key | Value | Required | Description |
| --- | --- | --- | --- |
| type | "kinesis" | Yes | ("onetime" \| "scheduled" \| "kinesis" \| "sns" \| "sqs") type of scheduling/workflow kick-off desired |
| value | <String> Object | Depends | Discussion of valid values is below |

    rule-value

The rule value entry depends on the rule type:

    • If this is a onetime rule this can be left blank. Example
    • If this is a scheduled rule this field must hold a valid cron-type expression or rate expression.
    • If this is a kinesis rule, this must be a configured ${Kinesis_stream_ARN}. Example
    • If this is an sns rule, this must be an existing ${SNS_Topic_Arn}. Example
    • If this is an sqs rule, this must be an existing ${SQS_QueueUrl} that your account has permissions to access, and also you must configure a dead-letter queue for this SQS queue. Example
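
As a quick illustration, the rule object for each type might look like the following (the stream, topic, and queue identifiers are placeholders):

"rule": { "type": "onetime" }
"rule": { "type": "scheduled", "value": "rate(30 minutes)" }
"rule": { "type": "kinesis", "value": "arn:aws:kinesis:us-east-1:111122223333:stream/my-cnm-notification-stream" }
"rule": { "type": "sns", "value": "arn:aws:sns:us-east-1:111122223333:my-sns-topic" }
"rule": { "type": "sqs", "value": "https://sqs.us-east-1.amazonaws.com/111122223333/my-ingest-queue" }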

    sqs-type rule features

    • When an SQS rule is triggered, the SQS message remains on the queue.
    • The SQS message is not processed multiple times in parallel when visibility timeout is properly set. You should set the visibility timeout to the maximum expected length of the workflow with padding. Longer is better to avoid parallel processing.
    • The SQS message visibility timeout can be overridden by the rule.
    • Upon successful workflow execution, the SQS message is removed from the queue.
• Upon failed execution(s), the workflow is run 3 times by default, or the configured number of times (see retries in the meta-object above).
    • Upon failed execution(s), the visibility timeout will be set to 5s to allow retries.
    • After configured number of failed retries, the SQS message is moved to the dead-letter queue configured for the SQS queue.
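
A sketch of an sqs-type rule that overrides the defaults described above via the meta object (the queue URL, workflow, and collection values are placeholders):

{
  "name": "my_sqs_rule",
  "workflow": "CNMExampleWorkflow",
  "collection": { "name": "L2_HR_PIXC", "version": "000" },
  "rule": {
    "type": "sqs",
    "value": "https://sqs.us-east-1.amazonaws.com/111122223333/my-ingest-queue"
  },
  "meta": {
    "retries": 1,
    "visibilityTimeout": 1800
  },
  "state": "ENABLED"
}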

    Configuration Via Cumulus Dashboard

    Create A Provider

    • In the Cumulus dashboard, go to the Provider page.

    Screenshot of Create Provider form

    • Click on Add Provider.
    • Fill in the form and then submit it.

    Screenshot of Create Provider form

    Create A Collection

    • Go to the Collections page.

    Screenshot of the Collections page

    • Click on Add Collection.
    • Copy and paste or fill in the collection JSON object form.

    Screenshot of Add Collection form

    • Once you submit the form, you should be able to verify that your new collection is in the list.

    Create A Rule

    1. Go To Rules Page
    • Go to the Cumulus dashboard, click on Rules in the navigation.
    • Click Add Rule.

    Screenshot of Rules page

2. Complete Form
    • Fill out the template form.

    Screenshot of a Rules template for adding a new rule

    For more details regarding the field definitions and required information go to Data Cookbooks.

    Note: If the state field is left blank, it defaults to false.

    Rule Examples

    • A rule form with completed required fields:

    Screenshot of a completed rule form

    • A successfully added Rule:

    Screenshot of created rule

    Version: v11.0.0

    Setting S3 Lifecycle Policies

    This document will outline, in brief, how to set data lifecycle policies so that you are more easily able to control data storage costs while keeping your data accessible. For more information on why you might want to do this, see the 'Additional Information' section at the end of the document.

    Requirements

    • The AWS CLI installed and configured (if you wish to run the CLI example). See AWS's guide to setting up the AWS CLI for more on this. Please ensure the AWS CLI is in your shell path.
• You will need an S3 bucket on AWS. You are strongly encouraged to use a bucket without voluminous amounts of data in it for experimenting/learning.
    • An AWS user with the appropriate roles to access the target bucket as well as modify bucket policies.

    Examples

    Walk-through on setting time-based S3 Infrequent Access (S3IA) bucket policy

    This example will give step-by-step instructions on updating a bucket's lifecycle policy to move all objects in the bucket from the default storage to S3 Infrequent Access (S3IA) after a period of 90 days. Below are instructions for walking through configuration via the command line and the management console.

    Command Line

    Please ensure you have the AWS CLI installed and configured for access prior to attempting this example.

    Create policy

From any directory you choose, open an editor and add the following to a file named exampleRule.json:

    {
    "Rules": [
    {
    "Status": "Enabled",
    "Filter": {
    "Prefix": ""
    },
    "Transitions": [
    {
    "Days": 90,
    "StorageClass": "STANDARD_IA"
    }
    ],
    "NoncurrentVersionTransitions": [
    {
    "NoncurrentDays": 90,
    "StorageClass": "STANDARD_IA"
    }
],
"ID": "90DayS3IAExample"
    }
    ]
    }

    Set policy

    On the command line run the following command (with the bucket you're working with substituted in place of yourBucketNameHere).

    aws s3api put-bucket-lifecycle-configuration --bucket yourBucketNameHere --lifecycle-configuration file://exampleRule.json

    Verify policy has been set

    To obtain all of the existing policies for a bucket, run the following command (again substituting the correct bucket name):

     $ aws s3api get-bucket-lifecycle-configuration --bucket yourBucketNameHere
    {
    "Rules": [
    {
    "Status": "Enabled",
    "Filter": {
    "Prefix": ""
    },
    "Transitions": [
    {
    "Days": 90,
    "StorageClass": "STANDARD_IA"
    }
    ],
    "NoncurrentVersionTransitions": [
    {
    "NoncurrentDays": 90,
    "StorageClass": "STANDARD_IA"
    }
],
"ID": "90DayS3IAExample"
    }
    ]
    }

    You have set a policy that transitions any version of an object in the bucket to S3IA after each object version has not been modified for 90 days.
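
If the bucket is managed with Terraform rather than the AWS CLI, the same rule can be expressed with the AWS provider's aws_s3_bucket_lifecycle_configuration resource. The following is a sketch only; it assumes AWS provider v4 or later and uses yourBucketNameHere as a placeholder:

resource "aws_s3_bucket_lifecycle_configuration" "ninety_day_s3ia" {
  bucket = "yourBucketNameHere"

  rule {
    id     = "90DayS3IAExample"
    status = "Enabled"

    # An empty filter applies the rule to every object in the bucket
    filter {}

    transition {
      days          = 90
      storage_class = "STANDARD_IA"
    }

    noncurrent_version_transition {
      noncurrent_days = 90
      storage_class   = "STANDARD_IA"
    }
  }
}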

    Management Console

    Create Policy

    To create the example policy on a bucket via the management console, go to the following URL (replacing 'yourBucketHere' with the bucket you intend to update):

    https://s3.console.aws.amazon.com/s3/buckets/yourBucketHere/?tab=overview

    You should see a screen similar to:

    Screenshot of AWS console for an S3 bucket

    Click the "Management" Tab, then lifecycle button and press + Add lifecycle rule:

    Screenshot of &quot;Management&quot; tab of AWS console for an S3 bucket

    Give the rule a name (e.g. '90DayRule'), leaving the filter blank:

    Screenshot of window for configuring the name and scope of a lifecycle rule on an S3 bucket in the AWS console

    Click next, and mark Current Version and Previous Versions.

Then, for each, click + Add transition, select Transition to Standard-IA after for the Object creation field, and set 90 for the Days after creation/Days after objects become noncurrent field. Your screen should look similar to:

    Screenshot of window for configuring the storage class transitions of a lifecycle rule on an S3 bucket in the AWS console

    Click next, then next past the Configure expiration screen (we won't be setting this), and on the fourth page, click Save:

    Screenshot of window for reviewing the configuration of a lifecycle rule on an S3 bucket in the AWS console

    You should now see you have a rule configured for your bucket:

    Screenshot of lifecycle rule appearing in the &quot;Management&quot; tab of AWS console for an S3 bucket

    You have now set a policy that transitions any version of an object in the bucket to S3IA after each object has not been modified for 90 days.

    Additional Information

    This section lists information you may want prior to enacting lifecycle policies. It is not required content for working through the examples.

    Strategy Overview

    For a discussion of overall recommended strategy, please review the Methodology for Data Lifecycle Management on the EarthData wiki.

    AWS Documentation

    The examples shown in this document are obviously fairly basic cases. By using object tags, filters and other configuration options you can enact far more complicated policies for various scenarios. For more reading on the topics presented on this page see:

    Version: v11.0.0

    Monitoring Best Practices

    This document intends to provide a set of recommendations and best practices for monitoring the state of a deployed Cumulus and diagnosing any issues.

    Cumulus-provided resources and integrations for monitoring

Cumulus provides a number of resources that are useful for monitoring the system and its operation.

    Cumulus Dashboard

    The primary tool for monitoring the Cumulus system is the Cumulus Dashboard. The dashboard is hosted on Github and includes instructions on how to deploy and link it into your core Cumulus deployment.

    The dashboard displays workflow executions, their status, inputs, outputs, and some diagnostic information such as logs. For further information on the dashboard, its usage, and the information it provides, see the documentation.

    Cumulus-provided AWS resources

    Cumulus sets up CloudWatch log groups for all Core-provided tasks.

    Monitoring Lambda Functions

    Logging for each Lambda Function is available in Lambda-specific CloudWatch log groups.

    Monitoring ECS services

    Each deployed cumulus_ecs_service module also includes a CloudWatch log group for the processes running on ECS.

    Monitoring workflows

For advanced debugging, we also configure dead letter queues on critical system functions. These will allow you to monitor and debug invalid inputs to the functions we use to start workflows, which can be helpful if you find that you are not seeing workflows being started as expected. More information on these can be found in the dead letter queue documentation.

    AWS recommendations

    AWS has a number of recommendations on system monitoring. Rather than reproduce those here and risk providing outdated guidance, we've documented the following links which will take you to available AWS docs on monitoring recommendations and best practices for the services used in Cumulus:

    Example: Setting up email notifications for CloudWatch logs

    Cumulus does not provide out-of-the-box support for email notifications at this time. However, setting up email notifications on AWS is fairly straightforward in that the operative components are an AWS SNS topic and a subscribed email address.

In terms of Cumulus integration, forwarding CloudWatch logs requires creating a mechanism (most likely a Lambda function subscribed to the log group) that will receive, filter, and forward these messages to the SNS topic.

    As a very simple example, we could create a function that filters CloudWatch logs created by the @cumulus/logger package and sends email notifications for error and fatal log levels, adapting the example linked above:

const zlib = require('zlib');
const aws = require('aws-sdk');
const { promisify } = require('util');

const gunzip = promisify(zlib.gunzip);
const sns = new aws.SNS();

exports.handler = async (event) => {
  const payload = Buffer.from(event.awslogs.data, 'base64');
  const decompressedData = await gunzip(payload);
  const logData = JSON.parse(decompressedData.toString('ascii'));
  return await Promise.all(logData.logEvents.map(async (logEvent) => {
    const logMessage = JSON.parse(logEvent.message);
    if (['error', 'fatal'].includes(logMessage.level)) {
      return sns.publish({
        TopicArn: process.env.EmailReportingTopicArn,
        Message: logEvent.message
      }).promise();
    }
    return Promise.resolve();
  }));
};

After creating the SNS topic, we can deploy this code as a lambda function, following the setup steps from Amazon. Make sure to include your SNS topic ARN as an environment variable on the lambda function by using the --environment option on aws lambda create-function.
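
For example, a deployment command might look like the following sketch; the function name, runtime, role ARN, and topic ARN are placeholders for your own values:

aws lambda create-function \
  --function-name cloudwatch-log-email-forwarder \
  --runtime nodejs16.x \
  --handler index.handler \
  --zip-file fileb://function.zip \
  --role arn:aws:iam::111122223333:role/my-lambda-execution-role \
  --environment "Variables={EmailReportingTopicArn=arn:aws:sns:us-east-1:111122223333:email-reporting-topic}"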

    You will need to create subscription filters for each log group you want to receive emails for. We recommend automating this as much as possible, and you could very well handle this via Terraform, such as using a module to deploy filters alongside log groups, or exporting the log group names to an all-in-one email notification module.
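
As a sketch of the Terraform approach, the resources below subscribe the forwarding Lambda to a single log group. The log group and Lambda resource names (aws_cloudwatch_log_group.example_task, aws_lambda_function.log_email_forwarder) are assumptions standing in for your own resources:

# Allow CloudWatch Logs to invoke the forwarding Lambda
resource "aws_lambda_permission" "allow_cloudwatch_logs" {
  statement_id  = "AllowExecutionFromCloudWatchLogs"
  action        = "lambda:InvokeFunction"
  function_name = aws_lambda_function.log_email_forwarder.function_name
  principal     = "logs.amazonaws.com"
  source_arn    = "${aws_cloudwatch_log_group.example_task.arn}:*"
}

# Forward every event in the log group to the Lambda (an empty filter_pattern matches all events)
resource "aws_cloudwatch_log_subscription_filter" "email_forwarding" {
  name            = "email-forwarding-filter"
  log_group_name  = aws_cloudwatch_log_group.example_task.name
  filter_pattern  = ""
  destination_arn = aws_lambda_function.log_email_forwarder.arn
}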

    Version: v11.0.0

    S3 Server Access Logging

    Via AWS Console

    Enable server access logging for an S3 bucket

    Via AWS Command Line Interface

    1. Create a logging.json file with these contents, replacing <stack-internal-bucket> with your stack's internal bucket name, and <stack> with the name of your cumulus stack.

      {
      "LoggingEnabled": {
      "TargetBucket": "<stack-internal-bucket>",
      "TargetPrefix": "<stack>/ems-distribution/s3-server-access-logs/"
      }
      }
    2. Add the logging policy to each of your protected and public buckets by calling this command on each bucket.

      aws s3api put-bucket-logging --bucket <protected/public-bucket-name> --bucket-logging-status file://logging.json
    3. Verify the logging policy exists on your buckets.

      aws s3api get-bucket-logging --bucket <protected/public-bucket-name>
    Version: v11.0.0

    Configuration of Tasks

    The cumulus module exposes values for configuration for some of the provided archive and ingest tasks. Currently the following are available as configurable variables:

    cmr_search_client_config

    Configuration parameters for CMR search client for cumulus archive module tasks in the form:

<lambda_identifier>_report_cmr_limit = <maximum number of records that can be returned from a cmr-client search; this should be greater than cmr_page_size>
    <lambda_identifier>_report_cmr_page_size = <number of records for each page returned from CMR>
    type = map(string)

More information about the CMR limit and CMR page_size can be found in @cumulus/cmr-client and the CMR Search API documentation.

    Currently the following values are supported:

    • create_reconciliation_report_cmr_limit
    • create_reconciliation_report_cmr_page_size

    Example

    cmr_search_client_config = {
    create_reconciliation_report_cmr_limit = 2500
    create_reconciliation_report_cmr_page_size = 250
    }

    elasticsearch_client_config

    Configuration parameters for Elasticsearch client for cumulus archive module tasks in the form:

    <lambda_identifier>_es_scroll_duration = <duration>
    <lambda_identifier>_es_scroll_size = <size>
    type = map(string)

    Currently the following values are supported:

    • create_reconciliation_report_es_scroll_duration
    • create_reconciliation_report_es_scroll_size

    Example

    elasticsearch_client_config = {
    create_reconciliation_report_es_scroll_duration = "15m"
    create_reconciliation_report_es_scroll_size = 2000
    }

    lambda_timeouts

    A configurable map of timeouts (in seconds) for cumulus ingest module task lambdas in the form:

    <lambda_identifier>_timeout: <timeout>
    type = map(string)

    Currently the following values are supported:

    • discover_granules_task_timeout
    • discover_pdrs_task_timeout
    • hyrax_metadata_update_tasks_timeout
    • lzards_backup_task_timeout
    • move_granules_task_timeout
    • parse_pdr_task_timeout
    • pdr_status_check_task_timeout
    • post_to_cmr_task_timeout
    • queue_granules_task_timeout
    • queue_pdrs_task_timeout
    • queue_workflow_task_timeout
    • sync_granule_task_timeout
    • update_granules_cmr_metadata_file_links_task_timeout

    Example

    lambda_timeouts = {
    discover_granules_task_timeout = 300
    }
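
All three of these maps are set as arguments on the cumulus module in your deployment. A sketch follows; other required module arguments and the real module source are omitted for brevity:

module "cumulus" {
  source = "..." # your existing cumulus module source

  # ... other required configuration ...

  cmr_search_client_config = {
    create_reconciliation_report_cmr_limit     = 2500
    create_reconciliation_report_cmr_page_size = 250
  }

  elasticsearch_client_config = {
    create_reconciliation_report_es_scroll_duration = "15m"
    create_reconciliation_report_es_scroll_size     = 2000
  }

  lambda_timeouts = {
    discover_granules_task_timeout = 300
  }
}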
    Version: v11.0.0

    About Cookbooks

    Introduction

The following data cookbooks are documents containing examples and explanations of workflows in the Cumulus framework. Additionally, they should serve to help unify an institution/user group on a set of terms.

    Setup

    The data cookbooks assume you can configure providers, collections, and rules to run workflows. Visit Cumulus data management types for information on how to configure Cumulus data management types.

    Adding a page

    As shown in detail in the "Add a New Page and Sidebars" section in Cumulus Docs: How To's, you can add a new page to the data cookbook by creating a markdown (.md) file in the docs/data-cookbooks directory. The new page can then be linked to the sidebar by adding it to the Data-Cookbooks object in the website/sidebar.json file as data-cookbooks/${id}.

    More about workflows

    Workflow general information

    Input & Output

    Developing Workflow Tasks

    Workflow Configuration How-to's

provider keys with the previously entered values). Note that you need to set the "provider_path" to the path on your bucket (e.g. "/data") where you've staged your mock/test data:

    {
    "name": "TestBrowseGeneration",
    "workflow": "DiscoverGranulesBrowseExample",
    "provider": "{{provider_from_previous_step}}",
    "collection": {
    "name": "MOD09GQ",
    "version": "006"
    },
    "meta": {
    "provider_path": "{{path_to_data}}"
    },
    "rule": {
    "type": "onetime"
    },
    "state": "ENABLED",
    "updatedAt": 1553053438767
    }

    Run Workflows

    Once you've configured the Collection and Provider and added a onetime rule, you're ready to trigger your rule, and watch the ingest workflows process.

    Go to the Rules tab, click the rule you just created:

    Screenshot of the Rules overview page with a list of rules in the Cumulus dashboard

    Then click the gear in the upper right corner and click "Rerun":

    Screenshot of clicking the button to rerun a workflow rule from the rule edit page in the Cumulus dashboard

    Tab over to executions and you should see the DiscoverGranulesBrowseExample workflow run, succeed, and then moments later the CookbookBrowseExample should run and succeed.

    Screenshot of page listing executions in the Cumulus dashboard

    Results

    You can verify your data has ingested by clicking the successful workflow entry:

    Screenshot of individual entry from table listing executions in the Cumulus dashboard

    Select "Show Output" on the next page

    Screenshot of &quot;Show output&quot; button from individual execution page in the Cumulus dashboard

    and you should see in the payload from the workflow something similar to:

    "payload": {
    "process": "modis",
    "granules": [
    {
    "files": [
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "key": "MOD09GQ___006/2017/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "type": "data",
    "bucket": "cumulus-test-sandbox-protected",
    "path": "data",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}/{substring(file.fileName, 0, 3)}",
    "size": 1908635
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "key": "MOD09GQ___006/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "type": "metadata",
    "bucket": "cumulus-test-sandbox-private",
    "path": "data",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{substring(file.fileName, 0, 3)}",
    "size": 21708
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
    "key": "MOD09GQ___006/2017/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
    "type": "browse",
    "bucket": "cumulus-test-sandbox-protected",
    "path": "data",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}/{substring(file.fileName, 0, 3)}",
    "size": 1908635
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.cmr.xml",
    "key": "MOD09GQ___006/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.cmr.xml",
    "type": "metadata",
    "bucket": "cumulus-test-sandbox-protected-2",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{substring(file.fileName, 0, 3)}"
    }
    ],
    "cmrLink": "https://cmr.uat.earthdata.nasa.gov/search/granules.json?concept_id=G1222231611-CUMULUS",
    "cmrConceptId": "G1222231611-CUMULUS",
    "granuleId": "MOD09GQ.A2016358.h13v04.006.2016360104606",
    "cmrMetadataFormat": "echo10",
    "dataType": "MOD09GQ",
    "version": "006",
    "published": true
    }
    ]
    }

You can verify the granules exist within your Cumulus instance (search using the Granules interface, check the S3 buckets, etc.) and validate that the above CMR entry exists.


    Build Processing Lambda

    This section discusses the construction of a custom processing lambda to replace the contrived example from this entry for a real dataset processing task.

    To ingest your own data using this example, you will need to construct your own lambda to replace the source in ProcessingStep that will generate browse imagery and provide or update a CMR metadata export file.

You will then need to add the lambda to your Cumulus deployment as an aws_lambda_function Terraform resource.
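
A sketch of such a resource follows; the function name, deployment package, handler, role, and layer references are placeholders for your own processing code and deployment variables:

resource "aws_lambda_function" "browse_processing_task" {
  function_name = "${var.prefix}-BrowseProcessing"
  filename      = "browse_processing.zip"
  handler       = "index.handler"
  runtime       = "nodejs16.x"
  role          = var.lambda_processing_role_arn # an IAM role with access to your staging buckets
  timeout       = 300
  memory_size   = 1024

  # Include a compatible Cumulus Message Adapter layer if your task uses the CMA
  layers = [var.cumulus_message_adapter_lambda_layer_version_arn]
}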

    The discussion below outlines requirements for this lambda.

    Inputs

    The incoming message to the task defined in the ProcessingStep as configured will have the following configuration values (accessible inside event.config courtesy of the message adapter):

    Configuration

    • event.config.bucket -- the name of the bucket configured in terraform.tfvars as your internal bucket.

    • event.config.collection -- The full collection object we will configure in the Configure Ingest section. You can view the expected collection schema in the docs here or in the source code on github. You need this as available input and output so you can update as needed.

    event.config.additionalUrls, generateFakeBrowse and event.config.cmrMetadataFormat from the example can be ignored as they're configuration flags for the provided example script.

    Payload

    The 'payload' from the previous task is accessible via event.input. The expected payload output schema from SyncGranules can be viewed here.

    In our example, the payload would look like the following. Note: The types are set per-file based on what we configured in our collection, and were initially added as part of the DiscoverGranules step in the DiscoverGranulesBrowseExample workflow.

     "payload": {
    "process": "modis",
    "granules": [
    {
    "granuleId": "MOD09GQ.A2016358.h13v04.006.2016360104606",
    "dataType": "MOD09GQ",
    "version": "006",
    "files": [
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "bucket": "cumulus-test-sandbox-internal",
    "key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "size": 1908635
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "bucket": "cumulus-test-sandbox-internal",
    "key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "size": 21708
    }
    ]
    }
    ]
    }

    Generating Browse Imagery

The example script provided goes through all granules and adds a 'fake' .jpg browse file to the same staging location as the data staged by prior ingest tasks.

    The processing lambda you construct will need to do the following:

    • Create a browse image file based on the input data, and stage it to a location accessible to both this task and the FilesToGranules and MoveGranules tasks in a S3 bucket.
    • Add the browse file to the input granule files, making sure to set the granule file's type to browse.
    • Update meta.input_granules with the updated granules list, as well as provide the files to be integrated by FilesToGranules as output from the task.
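
To make the three requirements above concrete, here is a heavily simplified sketch of a processing handler. It assumes the task receives event.input.granules as shown in the Payload section, writes a placeholder browse file next to the staged data, and returns the "granules" and "files" keys described below; a real implementation would generate actual browse imagery, and the exact input/output shape depends on your cumulus_message configuration:

const aws = require('aws-sdk');

const s3 = new aws.S3();

exports.handler = async (event) => {
  const inputGranules = event.input.granules;
  const files = [];

  const granules = await Promise.all(inputGranules.map(async (granule) => {
    // Placeholder: real browse generation would read the staged data file here
    const dataFile = granule.files[0];
    const browseKey = `${dataFile.key}.browse.jpg`;

    await s3.putObject({
      Bucket: dataFile.bucket,
      Key: browseKey,
      Body: Buffer.from('placeholder browse imagery')
    }).promise();

    const browseFile = {
      fileName: browseKey.split('/').pop(),
      bucket: dataFile.bucket,
      key: browseKey,
      type: 'browse'
    };

    // Collect s3 URIs of all staged files (existing plus new browse) for FilesToGranules
    granule.files.concat([browseFile]).forEach((f) =>
      files.push(`s3://${f.bucket}/${f.key}`)
    );

    return { ...granule, files: granule.files.concat([browseFile]) };
  }));

  return { granules, files };
};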

    Generating/updating CMR metadata

    If you do not already have a CMR file in the granules list, you will need to generate one for valid export. This example's processing script generates and adds it to the FilesToGranules file list via the payload but it can be present in the InputGranules from the DiscoverGranules task as well if you'd prefer to pre-generate it.

The downstream tasks MoveGranules, UpdateGranulesCmrMetadataFileLinks, and PostToCmr all expect a valid CMR file to be available if you want to export to CMR.

    Expected Outputs for processing task/tasks

    In the above example, the critical portion of the output to FilesToGranules is the payload and meta.input_granules.

In the example provided, the processing task is set up to return an object with the keys "files" and "granules". In the cumulus_message configuration, files is mapped to the payload and granules to meta.input_granules:

              "task_config": {
    "inputGranules": "{$.meta.input_granules}",
    "granuleIdExtraction": "{$.meta.collection.granuleIdExtraction}"
    }

    Their expected values from the example above may be useful in constructing a processing task:

    payload

    The payload includes a full list of files to be 'moved' into the cumulus archive. The FilesToGranules task will take this list, merge it with the information from InputGranules, then pass that list to the MoveGranules task. The MoveGranules task will then move the files to their targets. The UpdateGranulesCmrMetadataFileLinks task will update the CMR metadata file if it exists with the updated granule locations and update the CMR file etags.

    In the provided example, a payload being passed to the FilesToGranules task should be expected to look like:

      "payload": [
    "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
    "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.cmr.xml"
    ]

This is the list of file locations that FilesToGranules will act upon to add/merge with the input_granules object.

    The pathing is generated from sync-granules, but in principle the files can be staged wherever you like so long as the processing/MoveGranules task's roles have access and the filename matches the collection configuration.

    input_granules

The FilesToGranules task utilizes the incoming payload to choose which files to move, but pulls all other metadata from meta.input_granules. As such, the output meta.input_granules in the example would look like:

    "input_granules": [
    {
    "granuleId": "MOD09GQ.A2016358.h13v04.006.2016360104606",
    "dataType": "MOD09GQ",
    "version": "006",
    "files": [
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "bucket": "cumulus-test-sandbox-internal",
    "key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "size": 1908635
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "bucket": "cumulus-test-sandbox-internal",
    "key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "size": 21708
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
    "bucket": "cumulus-test-sandbox-internal",
    "key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.jpg"
    }
    ]
    }
    ],
    Version: v11.0.0

    Choice States

    Cumulus supports AWS Step Function Choice states. A Choice state enables branching logic in Cumulus workflows.

    Choice state definitions include a list of Choice Rules. Each Choice Rule defines a logical operation which compares an input value against a value using a comparison operator. For available comparison operators, review the AWS docs.

    If the comparison evaluates to true, the Next state is followed.

    Example

    In examples/cumulus-tf/parse_pdr_workflow.tf the ParsePdr workflow uses a Choice state, CheckAgainChoice, to terminate the workflow once meta.isPdrFinished: true is returned by the CheckStatus state.

    The CheckAgainChoice state definition requires an input object of the following structure:

    {
    "meta": {
    "isPdrFinished": false
    }
    }

    Given the above input to the CheckAgainChoice state, the workflow would transition to the PdrStatusReport state.

    "CheckAgainChoice": {
    "Type": "Choice",
    "Choices": [
    {
    "Variable": "$.meta.isPdrFinished",
    "BooleanEquals": false,
    "Next": "PdrStatusReport"
    },
    {
    "Variable": "$.meta.isPdrFinished",
    "BooleanEquals": true,
    "Next": "WorkflowSucceeded"
    }
    ],
    "Default": "WorkflowSucceeded"
    }

    Advanced: Loops in Cumulus Workflows

    Understanding the complete ParsePdr workflow is not necessary to understanding how Choice states work, but ParsePdr provides an example of how Choice states can be used to create a loop in a Cumulus workflow.

In the complete ParsePdr workflow definition, the state QueueGranules is followed by CheckStatus. From CheckStatus a loop starts: as long as CheckStatus returns meta.isPdrFinished: false, CheckAgainChoice transitions to PdrStatusReport, which is followed by WaitForSomeTime, which returns to CheckStatus. Once CheckStatus returns meta.isPdrFinished: true, CheckAgainChoice proceeds to WorkflowSucceeded.

    Execution graph of SIPS ParsePdr workflow in AWS Step Functions console

    Further documentation

    For complete details on Choice state configuration options, see the Choice state documentation.

    Version: v11.0.0

    CNM Workflow

    This entry documents how to setup a workflow that utilizes the built-in CNM/Kinesis functionality in Cumulus.

    Prior to working through this entry you should be familiar with the Cloud Notification Mechanism.

    Sections


    Prerequisites

    Cumulus

    This entry assumes you have a deployed instance of Cumulus (version >= 1.16.0). The entry assumes you are deploying Cumulus via the cumulus terraform module sourced from the release page.

    AWS CLI

    This entry assumes you have the AWS CLI installed and configured. If you do not, please take a moment to review the documentation - particularly the examples relevant to Kinesis - and install it now.

    Kinesis

This entry assumes you already have two Kinesis data streams created for use as CNM notification and response data streams.

    If you do not have two streams setup, please take a moment to review the Kinesis documentation and setup two basic single-shard streams for this example:

    Using the "Create Data Stream" button on the Kinesis Dashboard, work through the dialogue.

    You should be able to quickly use the "Create Data Stream" button on the Kinesis Dashboard, and setup streams that are similar to the following example:

    Screenshot of AWS console page for creating a Kinesis stream

    Please bear in mind that your {{prefix}}-lambda-processing IAM role will need permissions to write to the response stream for this workflow to succeed if you create the Kinesis stream with a dashboard user. If you are using the cumulus top-level module for your deployment this should be set properly.

If not, the most straightforward approach is to attach the AmazonKinesisFullAccess policy for the stream resource to whatever role your Lambdas are using; however, your environment/security policies may require an approach specific to your deployment environment.

    In operational environments it's likely science data providers would typically be responsible for providing a Kinesis stream with the appropriate permissions.

    For more information on how this process works and how to develop a process that will add records to a stream, read the Kinesis documentation and the developer guide.

    Source Data

    This entry will run the SyncGranule task against a single target data file. To that end it will require a single data file to be present in an S3 bucket matching the Provider configured in the next section.

    Collection and Provider

    Cumulus will need to be configured with a Collection and Provider entry of your choosing. The provider should match the location of the source data from the Ingest Source Data section.

    This can be done via the Cumulus Dashboard if installed or the API. It is strongly recommended to use the dashboard if possible.


    Configure the Workflow

    Provided the prerequisites have been fulfilled, you can begin adding the needed values to your Cumulus configuration to configure the example workflow.

    The following are steps that are required to set up your Cumulus instance to run the example workflow:

    Example CNM Workflow

    In this example, we're going to trigger a workflow by creating a Kinesis rule and sending a record to a Kinesis stream.

    The following workflow definition should be added to a new .tf workflow resource (e.g. cnm_workflow.tf) in your deployment directory. For the complete CNM workflow example, see examples/cumulus-tf/kinesis_trigger_test_workflow.tf.

    Add the following to the new terraform file in your deployment directory, updating the following:

    • Set the response-endpoint key in the CnmResponse task in the workflow JSON to match the name of the Kinesis response stream you configured in the prerequisites section
    • Update the source key to the workflow module to match the Cumulus release associated with your deployment.
    module "cnm_workflow" {
    source = "https://github.com/nasa/cumulus/releases/download/{version}/terraform-aws-cumulus-workflow.zip"

    prefix = var.prefix
    name = "CNMExampleWorkflow"
    workflow_config = module.cumulus.workflow_config
    system_bucket = var.system_bucket

state_machine_definition = <<JSON
{
    "Comment": "CNMExampleWorkflow",
    "StartAt": "TranslateMessage",
    "States": {
    "TranslateMessage": {
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_to_cma_task.arn}",
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "collection": "{$.meta.collection}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.cnm}",
    "destination": "{$.meta.cnm}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "CnmResponse"
    }
    ],
    "Next": "SyncGranule"
    },
    "SyncGranule": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "provider": "{$.meta.provider}",
    "buckets": "{$.meta.buckets}",
    "collection": "{$.meta.collection}",
    "downloadBucket": "{$.meta.buckets.private.name}",
    "stack": "{$.meta.stack}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.granules}",
    "destination": "{$.meta.input_granules}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${module.cumulus.sync_granule_task.task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "IntervalSeconds": 10,
    "MaxAttempts": 3
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "CnmResponse"
    }
    ],
    "Next": "CnmResponse"
    },
    "CnmResponse": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "OriginalCNM": "{$.meta.cnm}",
    "distribution_endpoint": "{$.meta.distribution_endpoint}",
    "response-endpoint": "ADD YOUR RESPONSE STREAM NAME HERE",
    "region": "us-east-1",
    "type": "kinesis",
    "WorkflowException": "{$.exception}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.cnm}",
    "destination": "{$.meta.cnmResponse}"
    },
    {
    "source": "{$.input.input}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_response_task.arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "IntervalSeconds": 5,
    "MaxAttempts": 3
    }
    ],
    "End": true
    }
    }
    }
JSON
}

    Again, please make sure to modify the value response-endpoint to match the stream name (not ARN) for your Kinesis response stream.

    Lambda Configuration

    To execute this workflow, you're required to include several Lambda resources in your deployment. To do this, add the following task (Lambda) definitions to your deployment along with the workflow you created above:

    Please note: To utilize these tasks you need to ensure you have a compatible CMA layer. See the deployment instructions for more details on how to deploy a CMA layer.

    Below is a description of each of these tasks:

    CNMToCMA

    CNMToCMA is meant for the beginning of a workflow: it maps CNM granule information to a payload for downstream tasks. For other CNM workflows, you would need to ensure that downstream tasks in your workflow either understand the CNM message or include a translation task like this one.

    You can also manipulate the data sent to downstream tasks using task_config for various states in your workflow resource configuration. Read more about how to configure data on the Workflow Input & Output page.

    CnmResponse

    The CnmResponse Lambda generates a CNM response message and puts it on the response-endpoint Kinesis stream.

    You can read more about the expected schema of a CnmResponse record in the Cloud Notification Mechanism schema repository.

    Additional Tasks

    Lastly, this entry also makes use of the SyncGranule task from the cumulus module.

    Redeploy

    Once the above configuration changes have been made, redeploy your stack.

    Please refer to Update Cumulus resources in the deployment documentation if you are unfamiliar with redeployment.

    Rule Configuration

    Cumulus includes a messageConsumer Lambda function (message-consumer). Cumulus kinesis-type rules create the event source mappings between Kinesis streams and the messageConsumer Lambda. The messageConsumer Lambda consumes records from one or more Kinesis streams, as defined by enabled kinesis-type rules. When new records are pushed to one of these streams, the messageConsumer triggers workflows associated with the enabled kinesis-type rules.

    To add a rule via the dashboard (if you'd like to use the API, see the docs here), navigate to the Rules page and click Add a rule, then configure the new rule using the following template (substituting correct values for parameters denoted by ${}):

    {
    "collection": {
    "name": "L2_HR_PIXC",
    "version": "000"
    },
    "name": "L2_HR_PIXC_kinesisRule",
    "provider": "PODAAC_SWOT",
    "rule": {
    "type": "kinesis",
    "value": "arn:aws:kinesis:{{awsRegion}}:{{awsAccountId}}:stream/{{streamName}}"
    },
    "state": "ENABLED",
    "workflow": "CNMExampleWorkflow"
    }

    Please Note:

• The rule's value attribute must match the Amazon Resource Name (ARN) of the Kinesis data stream you've preconfigured. You should be able to obtain this ARN from the Kinesis Dashboard entry for the selected stream.
    • The collection and provider should match the collection and provider you setup in the Prerequisites section.

    Once you've clicked on 'submit' a new rule should appear in the dashboard's Rule Overview.


    Execute the Workflow

    Once Cumulus has been redeployed and a rule has been added, we're ready to trigger the workflow and watch it execute.

    How to Trigger the Workflow

    To trigger matching workflows, you will need to put a record on the Kinesis stream that the message-consumer Lambda will recognize as a matching event. Most importantly, it should include a collection name that matches a valid collection.

    For the purpose of this example, the easiest way to accomplish this is using the AWS CLI.

    Create Record JSON

    Construct a JSON file containing an object that matches the values that have been previously setup. This JSON object should be a valid Cloud Notification Mechanism message.

    Please note: this example is somewhat contrived, as the downstream tasks don't care about most of these fields. A 'real' data ingest workflow would.

    The following values (denoted by ${} in the sample below) should be replaced to match values we've previously configured:

    • TEST_DATA_FILE_NAME: The filename of the test data that is available in the S3 (or other) provider we created earlier.
    • TEST_DATA_URI: The full S3 path to the test data (e.g. s3://bucket-name/path/granule)
    • COLLECTION: The collection name defined in the prerequisites for this product
    {
    "product": {
    "files": [
    {
    "checksumType": "md5",
    "name": "${TEST_DATA_FILE_NAME}",
    "checksum": "bogus_checksum_value",
    "uri": "${TEST_DATA_URI}",
    "type": "data",
    "size": 12345678
    }
    ],
    "name": "${TEST_DATA_FILE_NAME}",
    "dataVersion": "006"
    },
    "identifier ": "testIdentifier123456",
    "collection": "${COLLECTION}",
    "provider": "TestProvider",
    "version": "001",
    "submissionTime": "2017-09-30T03:42:29.791198"
    }

    Add Record to Kinesis Data Stream

    Using the JSON file you created, push it to the Kinesis notification stream:

    aws kinesis put-record --stream-name YOUR_KINESIS_NOTIFICATION_STREAM_NAME_HERE --partition-key 1 --data file:///path/to/file.json

    Please note: The above command uses the stream name, not the ARN.

    The command should return output similar to:

    {
    "ShardId": "shardId-000000000000",
    "SequenceNumber": "42356659532578640215890215117033555573986830588739321858"
    }

    This command will put a record containing the JSON from the --data flag onto the Kinesis data stream. The messageConsumer Lambda will consume the record and construct a valid CMA payload to trigger workflows. For this example, the record will trigger the CNMExampleWorkflow workflow as defined by the rule previously configured.

    You can view the current running executions on the Executions dashboard page which presents a list of all executions, their status (running, failed, or completed), to which workflow the execution belongs, along with other information.

    Verify Workflow Execution

As detailed above, once the record is added to the Kinesis data stream, the messageConsumer Lambda will trigger the CNMExampleWorkflow.

    TranslateMessage

    TranslateMessage (which corresponds to the CNMToCMA Lambda) will take the CNM object payload and add a granules object to the CMA payload that's consistent with other Cumulus ingest tasks, and add a meta.cnm key (as well as the payload) to store the original message.

    For more on the Message Adapter, please see the Message Flow documentation.

    An example of what is happening in the CNMToCMA Lambda is as follows:

    Example Input Payload:

    "payload": {
    "identifier ": "testIdentifier123456",
    "product": {
    "files": [
    {
    "checksumType": "md5",
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "checksum": "bogus_checksum_value",
    "uri": "s3://some_bucket/cumulus-test-data/pdrs/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "type": "data",
    "size": 12345678
    }
    ],
    "name": "TestGranuleUR",
    "dataVersion": "006"
    },
    "version": "123456",
    "collection": "MOD09GQ",
    "provider": "TestProvider",
    "submissionTime": "2017-09-30T03:42:29.791198"
    }

    Example Output Payload:

      "payload": {
    "cnm": {
    "identifier ": "testIdentifier123456",
    "product": {
    "files": [
    {
    "checksumType": "md5",
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "checksum": "bogus_checksum_value",
    "uri": "s3://some-bucket/cumulus-test-data/data/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "type": "data",
    "size": 12345678
    }
    ],
    "name": "TestGranuleUR",
    "dataVersion": "006"
    },
    "version": "123456",
    "collection": "MOD09GQ",
    "provider": "TestProvider",
    "submissionTime": "2017-09-30T03:42:29.791198",
    "receivedTime": "2017-09-30T03:42:31.634552"
    },
    "output": {
    "granules": [
    {
    "granuleId": "TestGranuleUR",
    "files": [
    {
    "path": "some-bucket/data",
    "url_path": "s3://some-bucket/cumulus-test-data/data/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "bucket": "some-bucket",
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "size": 12345678
    }
    ]
    }
    ]
    }
    }

    SyncGranules

    This Lambda will take the files listed in the payload and move them to s3://{deployment-private-bucket}/file-staging/{deployment-name}/{COLLECTION}/{file_name}.

    CnmResponse

Assuming a successful execution of the workflow, this task will recover the meta.cnm key from the CMA output, and add a "SUCCESS" record to the response-endpoint Kinesis stream.

    If a prior step in the workflow has failed, this will add a "FAILURE" record to the stream instead.

    The data written to the response-endpoint should adhere to the Response Message Fields schema.

    Example CNM Success Response:

    {
    "provider": "PODAAC_SWOT",
    "collection": "SWOT_Prod_l2:1",
    "processCompleteTime": "2017-09-30T03:45:29.791198",
    "submissionTime": "2017-09-30T03:42:29.791198",
    "receivedTime": "2017-09-30T03:42:31.634552",
    "identifier": "1234-abcd-efg0-9876",
    "response": {
    "status": "SUCCESS"
    }
    }

    Example CNM Error Response:

    {
    "provider": "PODAAC_SWOT",
    "collection": "SWOT_Prod_l2:1",
    "processCompleteTime": "2017-09-30T03:45:29.791198",
    "submissionTime": "2017-09-30T03:42:29.791198",
    "receivedTime": "2017-09-30T03:42:31.634552",
    "identifier": "1234-abcd-efg0-9876",
    "response": {
    "status": "FAILURE",
    "errorCode": "PROCESSING_ERROR",
    "errorMessage": "File [cumulus-dev-a4d38f59-5e57-590c-a2be-58640db02d91/prod_20170926T11:30:36/production_file.nc] did not match gve checksum value."
    }
    }

    Note the CnmResponse state defined in the .tf workflow definition above configures $.exception to be passed to the CnmResponse Lambda keyed under config.WorkflowException. This is required for the CnmResponse code to deliver a failure response.

    To test the failure scenario, send a record missing the product.name key.
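
As a sketch of that failure test, you might send a record like the one below with the AWS CLI. The stream name is a placeholder for your notification stream, the field values are taken from the examples above, and the product object deliberately omits product.name (the --cli-binary-format flag applies to AWS CLI v2; omit it for v1):

aws kinesis put-record \
--cli-binary-format raw-in-base64-out \
--stream-name YOUR_NOTIFICATION_STREAM_NAME \
--partition-key 1 \
--data '{"identifier ": "testIdentifier123456", "collection": "MOD09GQ", "provider": "TestProvider", "version": "123456", "product": {"files": [], "dataVersion": "006"}}'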


    Verify results

    Check for successful execution on the dashboard

    Following the successful execution of this workflow, you should expect to see the workflow complete successfully on the dashboard:

    Screenshot of a successful CNM workflow appearing on the executions page of the Cumulus dashboard

    Check the test granule has been delivered to S3 staging

    The test granule identified in the Kinesis record should be moved to the deployment's private staging area.
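
For example, you can list the staging prefix with the AWS CLI; the bucket, deployment name, and collection below are placeholders that follow the SyncGranules destination path described earlier, so substitute your deployment's values:

aws s3 ls s3://<deployment-private-bucket>/file-staging/<deployment-name>/<COLLECTION>/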

    Check for Kinesis records

    A SUCCESS notification should be present on the response-endpoint Kinesis stream.

You should be able to validate that the notification and response streams have the expected records with the following steps (the AWS CLI Kinesis Basic Stream Operations documentation is useful to review before proceeding):

    Get a shard iterator (substituting your stream name as appropriate):

    aws kinesis get-shard-iterator \
    --shard-id shardId-000000000000 \
    --shard-iterator-type LATEST \
    --stream-name NOTIFICATION_OR_RESPONSE_STREAM_NAME

which should return output similar to:

    {
    "ShardIterator": "VeryLongString=="
    }
    • Re-trigger the workflow by using the put-record command as before.
    • As the workflow completes, use the output from the get-shard-iterator command to request data from the stream:
    aws kinesis get-records --shard-iterator SHARD_ITERATOR_VALUE

    This should result in output similar to:

    {
    "Records": [
    {
    "SequenceNumber": "49586720336541656798369548102057798835250389930873978882",
    "ApproximateArrivalTimestamp": 1532664689.128,
    "Data": "eyJpZGVudGlmaWVyICI6InRlc3RJZGVudGlmaWVyMTIzNDU2IiwidmVyc2lvbiI6IjAwNiIsImNvbGxlY3Rpb24iOiJNT0QwOUdRIiwicHJvdmlkZXIiOiJUZXN0UHJvdmlkZXIiLCJwcm9kdWN0U2l6ZSI6MTkwODYzNS4wLCJyZXNwb25zZSI6eyJzdGF0dXMiOiJTVUNDRVNTIn0sInByb2Nlc3NDb21wbGV0ZVRpbWUiOiIyMDE4LTA3LTI3VDA0OjExOjI4LjkxOSJ9",
    "PartitionKey": "1"
    },
    {
    "SequenceNumber": "49586720336541656798369548102059007761070005796999266306",
    "ApproximateArrivalTimestamp": 1532664707.149,
    "Data": "eyJpZGVudGlmaWVyICI6InRlc3RJZGVudGlmaWVyMTIzNDU2IiwidmVyc2lvbiI6IjAwNiIsImNvbGxlY3Rpb24iOiJNT0QwOUdRIiwicHJvdmlkZXIiOiJUZXN0UHJvdmlkZXIiLCJwcm9kdWN0U2l6ZSI6MTkwODYzNS4wLCJyZXNwb25zZSI6eyJzdGF0dXMiOiJTVUNDRVNTIn0sInByb2Nlc3NDb21wbGV0ZVRpbWUiOiIyMDE4LTA3LTI3VDA0OjExOjQ2Ljk1OCJ9",
    "PartitionKey": "1"
    }
    ],
    "NextShardIterator": "AAAAAAAAAAFo9SkF8RzVYIEmIsTN+1PYuyRRdlj4Gmy3dBzsLEBxLo4OU+2Xj1AFYr8DVBodtAiXbs3KD7tGkOFsilD9R5tA+5w9SkGJZ+DRRXWWCywh+yDPVE0KtzeI0andAXDh9yTvs7fLfHH6R4MN9Gutb82k3lD8ugFUCeBVo0xwJULVqFZEFh3KXWruo6KOG79cz2EF7vFApx+skanQPveIMz/80V72KQvb6XNmg6WBhdjqAA==",
    "MillisBehindLatest": 0
    }

Note that the Data field is base64-encoded and must be decoded/parsed to be interpretable. There are many options for building a Kinesis consumer, such as the KCL.
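
For a quick look at the decoded payloads, one option is to pipe the get-records output through jq (version 1.6 or later, which provides the @base64d filter); this is just a convenience for inspection, not a substitute for a proper consumer:

aws kinesis get-records --shard-iterator SHARD_ITERATOR_VALUE \
| jq -r '.Records[] | .Data | @base64d'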

For the purposes of validating the workflow, it may be simpler to locate the execution in the Step Functions Management Console and verify that the expected output is similar to the examples below.

    Successful CNM Response Object Example:

    {
    "cnmResponse": {
    "provider": "TestProvider",
    "collection": "MOD09GQ",
    "version": "123456",
    "processCompleteTime": "2017-09-30T03:45:29.791198",
    "submissionTime": "2017-09-30T03:42:29.791198",
    "receivedTime": "2017-09-30T03:42:31.634552",
    "identifier ": "testIdentifier123456",
    "response": {
    "status": "SUCCESS"
    }
    }
    }

    Kinesis Record Error Handling

    messageConsumer

    The default Kinesis stream processing in the Cumulus system is configured for record error tolerance.

    When the messageConsumer fails to process a record, the failure is captured and the record is published to the kinesisFallback SNS Topic. The kinesisFallback SNS topic broadcasts the record and a subscribed copy of the messageConsumer Lambda named kinesisFallback consumes these failures.

At this point, the normal Lambda asynchronous invocation retry behavior will attempt to process the record 3 more times. After this, if the record still cannot be processed successfully, it is written to a dead letter queue. Cumulus' dead letter queue is an SQS Queue named kinesisFailure. Operators can use this queue to inspect failed records.

This system ensures that when the messageConsumer fails to process a record and trigger a workflow, the record is retried 3 times. This retry behavior improves system reliability in the case of external service failures outside of Cumulus' control.

The Kinesis error handling system - the kinesisFallback SNS topic, messageConsumer Lambda, and kinesisFailure SQS queue - comes with the API package and does not need to be configured by the operator.

To examine records that could not be processed at any step, look at the dead letter queue {{prefix}}-kinesisFailure in the Simple Queue Service (SQS) console. Select your queue and, under the Queue Actions tab, choose View/Delete Messages. Start polling for messages and you will see records that failed to process through the messageConsumer.
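
If you prefer the command line, the same queue can be polled with the AWS CLI; the queue name below assumes the {{prefix}}-kinesisFailure naming used above, so substitute your deployment prefix:

aws sqs get-queue-url --queue-name <prefix>-kinesisFailure

aws sqs receive-message \
--queue-url <queue URL returned by the previous command> \
--max-number-of-messages 10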

Note that these are only failures that occurred while processing records from Kinesis streams. Workflow failures are handled differently.

    Kinesis Stream logging

    Notification Stream messages

    Cumulus includes two Lambdas (KinesisInboundEventLogger and KinesisOutboundEventLogger) that utilize the same code to take a Kinesis record event as input, deserialize the data field and output the modified event to the logs.

When a Kinesis rule is created, in addition to the messageConsumer event mapping, an event mapping is created to trigger KinesisInboundEventLogger to log the inbound record, allowing for analysis in case of unexpected failure.

    Response Stream messages

    Cumulus also supports this feature for all outbound messages. To take advantage of this feature, you will need to set an event mapping on the KinesisOutboundEventLogger Lambda that targets your response-endpoint. You can do this in the Lambda management page for KinesisOutboundEventLogger. Add a Kinesis trigger, and configure it to target the cnmResponseStream for your workflow:

    Screenshot of the AWS console showing configuration for Kinesis stream trigger on KinesisOutboundEventLogger Lambda

    Once this is done, all records sent to the response-endpoint will also be logged in CloudWatch. For more on configuring Lambdas to trigger on Kinesis events, please see creating an event source mapping.
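
If you prefer to configure this outside the Lambda console, a sketch of the equivalent AWS CLI call is shown below. The function name and stream ARN are placeholders: the deployed Lambda name typically includes your deployment prefix, and the stream ARN should point at your response-endpoint stream, so adjust both to match your deployment:

aws lambda create-event-source-mapping \
--function-name <prefix>-KinesisOutboundEventLogger \
--event-source-arn arn:aws:kinesis:<region>:<account-id>:stream/<your-cnm-response-stream> \
--starting-position LATEST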

Version: v11.0.0

Error Handling in Workflows

See this documentation on configuring your workflow to handle transient Lambda errors (such as Lambda.ServiceException).

    Example state machine definition:

    {
    "Comment": "Tests Workflow from Kinesis Stream",
    "StartAt": "TranslateMessage",
    "States": {
    "TranslateMessage": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.cnm}",
    "destination": "{$.meta.cnm}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_to_cma_task.arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": ["States.ALL"],
    "ResultPath": "$.exception",
    "Next": "CnmResponseFail"
    }
    ],
    "Next": "SyncGranule"
    },
    "SyncGranule": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "Path": "$.payload",
    "TargetPath": "$.payload"
    },
    "task_config": {
    "provider": "{$.meta.provider}",
    "buckets": "{$.meta.buckets}",
    "collection": "{$.meta.collection}",
    "downloadBucket": "{$.meta.buckets.private.name}",
    "stack": "{$.meta.stack}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.granules}",
    "destination": "{$.meta.input_granules}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${module.cumulus.sync_granule_task.task_arn}",
    "Retry": [
    {
    "ErrorEquals": ["States.ALL"],
    "IntervalSeconds": 10,
    "MaxAttempts": 3
    }
    ],
    "Catch": [
    {
    "ErrorEquals": ["States.ALL"],
    "ResultPath": "$.exception",
    "Next": "CnmResponseFail"
    }
    ],
    "Next": "CnmResponse"
    },
    "CnmResponse": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "OriginalCNM": "{$.meta.cnm}",
    "CNMResponseStream": "{$.meta.cnmResponseStream}",
    "region": "us-east-1",
    "WorkflowException": "{$.exception}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$}",
    "destination": "{$.meta.cnmResponse}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_response_task.arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": ["States.ALL"],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "Next": "WorkflowSucceeded"
    },
    "CnmResponseFail": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "OriginalCNM": "{$.meta.cnm}",
    "CNMResponseStream": "{$.meta.cnmResponseStream}",
    "region": "us-east-1",
    "WorkflowException": "{$.exception}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$}",
    "destination": "{$.meta.cnmResponse}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_response_task.arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": ["States.ALL"],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "Next": "WorkflowFailed"
    },
    "WorkflowSucceeded": {
    "Type": "Succeed"
    },
    "WorkflowFailed": {
    "Type": "Fail",
    "Cause": "Workflow failed"
    }
    }
    }

    The above results in a workflow which is visualized in the diagram below:

    Screenshot of a visualization of an AWS Step Function workflow definition with branching logic for failures

    Summary

    Error handling should (mostly) be the domain of workflow configuration.

    Version: v11.0.0

    HelloWorld Workflow

    Example task meant to be a sanity check/introduction to the Cumulus workflows.

    Pre-Deployment Configuration

    Workflow Configuration

    A workflow definition can be found in the template repository hello_world_workflow module.

    {
    "Comment": "Returns Hello World",
    "StartAt": "HelloWorld",
    "States": {
    "HelloWorld": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "buckets": "{$.meta.buckets}",
    "provider": "{$.meta.provider}",
    "collection": "{$.meta.collection}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${module.cumulus.hello_world_task.task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "End": true
    }
    }
    }

    Workflow error-handling can be configured as discussed in the Error-Handling cookbook.

    Task Configuration

The HelloWorld task is provided for you as part of the cumulus terraform module; no configuration is needed.

    If you want to manually deploy your own version of this Lambda for testing, you can copy the Lambda resource definition located in the Cumulus source code at cumulus/tf-modules/ingest/hello-world-task.tf. The Lambda source code is located in the Cumulus source code at 'cumulus/tasks/hello-world'.

    Execution

    We will focus on using the Cumulus dashboard to schedule the execution of a HelloWorld workflow.

    Our goal here is to create a rule through the Cumulus dashboard that will define the scheduling and execution of our HelloWorld workflow. Let's navigate to the Rules page and click Add a rule.

    {
    "collection": { # collection values can be configured and found on the Collections page
    "name": "${collection_name}",
    "version": "${collection_version}"
    },
    "name": "helloworld_rule",
    "provider": "${provider}", # found on the Providers page
    "rule": {
    "type": "onetime"
    },
    "state": "ENABLED",
    "workflow": "HelloWorldWorkflow" # This can be found on the Workflows page
    }

Screenshot of the AWS Step Function execution graph for the HelloWorld workflow, as seen in the AWS Console

    Output/Results

    The Executions page presents a list of all executions, their status (running, failed, or completed), to which workflow the execution belongs, along with other information. The rule defined in the previous section should start an execution of its own accord, and the status of that execution can be tracked here.

    To get some deeper information on the execution, click on the value in the Name column of your execution of interest. This should bring up a visual representation of the workflow similar to that shown above, execution details, and a list of events.

    Summary

    Setting up the HelloWorld workflow on the Cumulus dashboard is the tip of the iceberg, so to speak. The task and step-function need to be configured before Cumulus deployment. A compatible collection and provider must be configured and applied to the rule. Finally, workflow execution status can be viewed via the workflows tab on the dashboard.

    Version: v11.0.0

    Ingest Notification in Workflows

On deployment, an SQS queue and three SNS topics (one each for executions, granules, and PDRs) are created and used for handling notification messages related to the workflow.

    The ingest notification reporting SQS queue is populated via a Cloudwatch rule for any Step Function execution state transitions. The sfEventSqsToDbRecords Lambda consumes this queue. The queue and Lambda are included in the cumulus module and the Cloudwatch rule in the workflow module and are included by default in a Cumulus deployment.

    The sfEventSqsToDbRecords Lambda function reads from the sfEventSqsToDbRecordsInputQueue queue and updates the RDS database records for granules, executions, and PDRs. When the records are updated, messages are posted to the three SNS topics. This Lambda is invoked both when the workflow starts and when it reaches a terminal state (completion or failure).

    Diagram of architecture for reporting workflow ingest notifications from AWS Step Functions

    Sending SQS messages to report status

    Publishing granule/PDR reports directly to the SQS queue

If you have a non-Cumulus workflow or process ingesting data and would like to update the status of your granules or PDRs, you can publish directly to the reporting SQS queue. Publishing messages to this queue will result in those messages being stored as granule/PDR records in the Cumulus database and the status of those granules/PDRs becoming visible on the Cumulus dashboard. The queue does have certain expectations of the message format, however: it expects a Cumulus Message nested within a CloudWatch Step Function Event object.

Posting directly to the queue requires knowing the queue URL. Assuming that you are using the cumulus module for your deployment, you can get the queue URL (and the topic ARNs) by adding them to outputs.tf for your Terraform deployment, as in our example deployment:

    output "stepfunction_event_reporter_queue_url" {
    value = module.cumulus.stepfunction_event_reporter_queue_url
    }

    output "report_executions_sns_topic_arn" {
    value = module.cumulus.report_executions_sns_topic_arn
    }
    output "report_granules_sns_topic_arn" {
value = module.cumulus.report_granules_sns_topic_arn
    }
    output "report_pdrs_sns_topic_arn" {
    value = module.cumulus.report_pdrs_sns_topic_arn
    }

Then, when you run terraform apply, you should see the queue URL and topic ARNs printed to your console:

    Outputs:
    ...
    stepfunction_event_reporter_queue_url = https://sqs.us-east-1.amazonaws.com/xxxxxxxxx/<prefix>-sfEventSqsToDbRecordsInputQueue
    report_executions_sns_topic_arn = arn:aws:sns:us-east-1:xxxxxxxxx:<prefix>-report-executions-topic
report_granules_sns_topic_arn = arn:aws:sns:us-east-1:xxxxxxxxx:<prefix>-report-granules-topic
    report_pdrs_sns_topic_arn = arn:aws:sns:us-east-1:xxxxxxxxx:<prefix>-report-pdrs-topic

Once you have the queue URL, you can use the AWS SDK (or CLI) for your language of choice to publish messages to the queue. The expected format of these messages is that of a CloudWatch Step Function event containing a Cumulus message. For SUCCEEDED events, the Cumulus message is expected to be in detail.output. For all other event statuses, a Cumulus Message is expected in detail.input. The Cumulus Message populating these fields MUST be a JSON string, not an object. Messages that do not conform to the schemas will fail to be created as records.
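
As a rough sketch of what such a message might look like when sent with the AWS CLI, the example below wraps a stringified Cumulus Message in a minimal Step Function event placed in detail.input (appropriate for a non-SUCCEEDED status). The queue URL, execution name, state machine ARN, and the set of Cumulus Message fields are illustrative placeholders only; the exact required fields are governed by the Cumulus record schemas mentioned above:

aws sqs send-message \
--queue-url <stepfunction_event_reporter_queue_url> \
--message-body '{
  "source": "aws.states",
  "detail": {
    "status": "RUNNING",
    "input": "{\"cumulus_meta\": {\"execution_name\": \"my-execution\", \"state_machine\": \"arn:aws:states:us-east-1:111122223333:stateMachine:MyWorkflow\"}, \"meta\": {}, \"payload\": {}}"
  }
}'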

    If you are not seeing records persist to the database or show up in the Cumulus dashboard, you can investigate the Cloudwatch logs of the SQS consumer Lambda:

    • /aws/lambda/<prefix>-sfEventSqsToDbRecords
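
With AWS CLI v2, for example, you can tail that log group directly (substituting your deployment prefix):

aws logs tail "/aws/lambda/<prefix>-sfEventSqsToDbRecords" --since 1h --follow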

    In a workflow

    As described above, ingest notifications will automatically be published to the SNS topics on workflow start and completion/failure, so you should not include a workflow step to publish the initial or final status of your workflows.

    However, if you want to report your ingest status at any point during a workflow execution, you can add a workflow step using the SfSqsReport Lambda. In the following example from cumulus-tf/parse_pdr_workflow.tf, the ParsePdr workflow is configured to use the SfSqsReport Lambda, primarily to update the PDR ingestion status.

    Note: ${sf_sqs_report_task_arn} is an interpolated value referring to a Terraform resource. See the example deployment code for the ParsePdr workflow.

      "PdrStatusReport": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "cumulus_message": {
    "input": "{$}"
    }
    }
    }
    },
    "ResultPath": null,
    "Type": "Task",
    "Resource": "${sf_sqs_report_task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "Next": "WaitForSomeTime"
    },

    Subscribing additional listeners to SNS topics

    Additional listeners to SNS topics can be configured in a .tf file for your Cumulus deployment. Shown below is configuration that subscribes an additional Lambda function (test_lambda) to receive messages from the report_executions SNS topic. To subscribe to the report_granules or report_pdrs SNS topics instead, simply replace report_executions in the code block below with either of those values.

    resource "aws_lambda_function" "test_lambda" {
    function_name = "${var.prefix}-testLambda"
    filename = "./testLambda.zip"
    source_code_hash = filebase64sha256("./testLambda.zip")
    handler = "index.handler"
    role = module.cumulus.lambda_processing_role_arn
    runtime = "nodejs10.x"
    }

    resource "aws_sns_topic_subscription" "test_lambda" {
    topic_arn = module.cumulus.report_executions_sns_topic_arn
    protocol = "lambda"
    endpoint = aws_lambda_function.test_lambda.arn
    }

    resource "aws_lambda_permission" "test_lambda" {
    action = "lambda:InvokeFunction"
    function_name = aws_lambda_function.test_lambda.arn
    principal = "sns.amazonaws.com"
    source_arn = module.cumulus.report_executions_sns_topic_arn
    }

    SNS message format

    Subscribers to the SNS topics can expect to find the published message in the SNS event at Records[0].Sns.Message. The message will be a JSON stringified version of the ingest notification record for an execution or a PDR. For granules, the message will be a JSON stringified object with ingest notification record in the record property and the event type as the event property.

    The ingest notification record of the execution, granule, or PDR should conform to the data model schema for the given record type.

    Summary

    Workflows can be configured to send SQS messages at any point using the sf-sqs-report task.

    Additional listeners can be easily configured to trigger when messages are sent to the SNS topics.

    Version: v11.0.0

    Queue PostToCmr

    In this document, we walk through handling CMR errors in workflows by queueing PostToCmr. We assume that the user already has an ingest workflow setup.

    Overview

    The general concept is that the last task of the ingest workflow will be QueueWorkflow, which queues the publish workflow. The publish workflow contains the PostToCmr task and if a CMR error occurs during PostToCmr, the publish workflow will add itself back onto the queue so that it can be executed when CMR is back online. This is achieved by leveraging the QueueWorkflow task again in the publish workflow. The following diagram demonstrates this queueing process.

    Diagram of workflow queueing

    Ingest Workflow

    The last step should be the QueuePublishWorkflow step. It should be configured with a queueUrl and workflow. In this case, the queueUrl is a throttled queue. Any queueUrl can be specified here which is useful if you would like to use a lower priority queue. The workflow is the unprefixed workflow name that you would like to queue (e.g. PublishWorkflow).

      "QueuePublishWorkflowStep": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "workflow": "{$.meta.workflow}",
    "queueUrl": "${start_sf_queue_url}",
    "provider": "{$.meta.provider}",
    "collection": "{$.meta.collection}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${queue_workflow_task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "End": true
    },

    Publish Workflow

    Configure the Catch section of your PostToCmr task to proceed to QueueWorkflow if a CMRInternalError is caught. Any other error will cause the workflow to fail.

      "Catch": [
    {
    "ErrorEquals": [
    "CMRInternalError"
    ],
    "Next": "RequeueWorkflow"
    },
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "Next": "WorkflowFailed",
    "ResultPath": "$.exception"
    }
    ],

    Then, configure the QueueWorkflow task similarly to its configuration in the ingest workflow. This time, pass the current publish workflow to the task config. This allows for the publish workflow to be requeued when there is a CMR error.

    {
    "RequeueWorkflow": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "buckets": "{$.meta.buckets}",
    "distribution_endpoint": "{$.meta.distribution_endpoint}",
    "workflow": "PublishGranuleQueue",
    "queueUrl": "${start_sf_queue_url}",
    "provider": "{$.meta.provider}",
    "collection": "{$.meta.collection}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${queue_workflow_task_arn}",
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "Next": "WorkflowFailed",
    "ResultPath": "$.exception"
    }
    ],
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "End": true
    }
    }
    Version: v11.0.0

    Run Step Function Tasks in AWS Lambda or Docker

    Overview

    AWS Step Function Tasks can run tasks on AWS Lambda or on AWS Elastic Container Service (ECS) as a Docker container.

Lambda provides a serverless architecture and is the best option for minimizing cost and server management. ECS provides the full extent of AWS EC2 resources, with the flexibility to execute arbitrary code on any AWS EC2 instance type.

    When to use Lambda

    You should use AWS Lambda whenever all of the following are true:

    • The task runs on one of the supported Lambda Runtimes. At the time of this writing, supported runtimes include versions of Python, Java, Ruby, Node.js, Go, and .NET.
    • The lambda package is less than 50 MB in size, zipped.
    • The task consumes less than each of the following resources:
      • 3008 MB memory allocation
      • 512 MB disk storage (must be written to /tmp)
      • 15 minutes of execution time

    See this page for a complete and up-to-date list of AWS Lambda limits.

    If your task requires more than any of these resources or an unsupported runtime, creating a Docker image which can be run on ECS is the way to go. Cumulus supports running any lambda package (and its configured layers) as a Docker container with cumulus-ecs-task.

    Step Function Activities and cumulus-ecs-task

    Step Function Activities enable a state machine task to "publish" an activity task which can be picked up by any activity worker. Activity workers can run pretty much anywhere, but Cumulus workflows support the cumulus-ecs-task activity worker. The cumulus-ecs-task worker runs as a Docker container on the Cumulus ECS cluster.

    The cumulus-ecs-task container takes an AWS Lambda Amazon Resource Name (ARN) as an argument (see --lambdaArn in the example below). This ARN argument is defined at deployment time. The cumulus-ecs-task worker polls for new Step Function Activity Tasks. When a Step Function executes, the worker (container) picks up the activity task and runs the code contained in the lambda package defined on deployment.

    Example: Replacing AWS Lambda with a Docker container run on ECS

    This example will use an already-defined workflow from the cumulus module that includes the QueueGranules task in its configuration.

    The following example is an excerpt from the Discover Granules workflow containing the step definition for the QueueGranules step:

    Note: ${ingest_granule_workflow_name} and ${queue_granules_task_arn} are interpolated values that refer to Terraform resources. See the example deployment code for the Discover Granules workflow.

      "QueueGranules": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "provider": "{$.meta.provider}",
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "granuleIngestWorkflow": "${ingest_granule_workflow_name}",
    "queueUrl": "{$.meta.queues.startSF}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${queue_granules_task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "End": true
    },

Assuming it has been determined that this task can no longer run in AWS Lambda, you can instead run it on the Cumulus ECS cluster by adding the following resources to your Terraform deployment (either in a new .tf file or by updating an existing one):

    • An aws_sfn_activity resource:
    resource "aws_sfn_activity" "queue_granules" {
    name = "${var.prefix}-QueueGranules"
    }
    • An instance of the cumulus_ecs_service module (found on the Cumulus releases page) configured to provide the QueueGranules task:

    module "queue_granules_service" {
    source = "https://github.com/nasa/cumulus/releases/download/{version}/terraform-aws-cumulus-ecs-service.zip"

    prefix = var.prefix
    name = "QueueGranules"

    cluster_arn = module.cumulus.ecs_cluster_arn
    desired_count = 1
    image = "cumuluss/cumulus-ecs-task:1.7.0"

    cpu = 400
    memory_reservation = 700

    environment = {
    AWS_DEFAULT_REGION = data.aws_region.current.name
    }
    command = [
    "cumulus-ecs-task",
    "--activityArn",
    aws_sfn_activity.queue_granules.id,
    "--lambdaArn",
    module.cumulus.queue_granules_task.task_arn
    ]
    alarms = {
    MemoryUtilizationHigh = {
    comparison_operator = "GreaterThanThreshold"
    evaluation_periods = 1
    metric_name = "MemoryUtilization"
    statistic = "SampleCount"
    threshold = 75
    }
    }
    }

    Please note: If you have updated the code for the Lambda specified by --lambdaArn, you will have to manually restart the tasks in your ECS service before invocation of the Step Function activity will use the updated Lambda code.

    • An updated Discover Granules workflow to utilize the new resource. The Resource key in the QueueGranules step has been updated to:

    "Resource": "${aws_sfn_activity.queue_granules.id}")`

If you then run this workflow in place of the DiscoverGranules workflow, the QueueGranules step would run as an ECS task instead of a Lambda function.

    Final note

    Step Function Activities and AWS Lambda are not the only ways to run tasks in an AWS Step Function. Learn more about other service integrations, including direct ECS integration via the AWS Service Integrations page.

Version: v11.0.0

Science Investigator-led Processing Systems (SIPS)

For this example, we're just going to create a onetime throw-away rule that will be easy to test with. This rule will kick off the DiscoverAndQueuePdrs workflow, which is the beginning of a Cumulus SIPS workflow:

    Screenshot of a Cumulus rule configuration

Note: A list of configured workflows exists under "Workflows" in the navigation bar on the Cumulus dashboard. Additionally, a list of executions and their respective statuses can be found under the "Executions" tab in the navigation bar.

    DiscoverAndQueuePdrs Workflow

    This workflow will discover PDRs and queue them to be processed. Duplicate PDRs will be dealt with according to the configured duplicate handling setting in the collection. The lambdas below are included in the cumulus terraform module for use in your workflows:

    1. DiscoverPdrs - source
    2. QueuePdrs - source

    Screenshot of execution graph for discover and queue PDRs workflow in the AWS Step Functions console

    An example workflow module configuration can be viewed in the Cumulus source for the discover_and_queue_pdrs_workflow.

Please note: To use this example workflow module as a template for a new workflow in your deployment, the source key for the workflow module would need to point to a release of the cumulus-workflow (terraform-aws-cumulus-workflow.zip) module on our release page, as all of the provided Cumulus workflows are internally self-referential.

    ParsePdr Workflow

    The ParsePdr workflow will parse a PDR, queue the specified granules (duplicates are handled according to the duplicate handling setting) and periodically check the status of those queued granules. This workflow will not succeed until all the granules included in the PDR are successfully ingested. If one of those fails, the ParsePdr workflow will fail. NOTE that ParsePdr may spin up multiple IngestGranule workflows in parallel, depending on the granules included in the PDR.

    The lambdas below are included in the cumulus terraform module for use in your workflows:

    1. ParsePdr - source
    2. QueueGranules - source
    3. CheckStatus - source

    Screenshot of execution graph for SIPS Parse PDR workflow in AWS Step Functions console

    An example workflow module configuration can be viewed in the Cumulus source for the parse_pdr_workflow.

Please note: To use this example workflow module as a template for a new workflow in your deployment, the source key for the workflow module would need to point to a release of the cumulus-workflow (terraform-aws-cumulus-workflow.zip) module on our release page, as all of the provided Cumulus workflows are internally self-referential.

    IngestGranule Workflow

    The IngestGranule workflow processes and ingests a granule and posts the granule metadata to CMR.

    The lambdas below are included in the cumulus terraform module for use in your workflows:

    1. SyncGranule - source.
    2. CmrStep - source

Additionally, this workflow requires a processing step that you must provide. The ProcessingStep step in the workflow diagram below is an example of a custom processing step.

    Note: Using the CmrStep is not required and can be left out of the processing trajectory if desired (for example, in testing situations).

    Screenshot of execution graph for SIPS IngestGranule workflow in AWS Step Functions console

    An example workflow module configuration can be viewed in the Cumulus source for the ingest_and_publish_granule_workflow.

Please note: To use this example workflow module as a template for a new workflow in your deployment, the source key for the workflow module would need to point to a release of the cumulus-workflow (terraform-aws-cumulus-workflow.zip) module on our release page, as all of the provided Cumulus workflows are internally self-referential.

    Summary

    In this cookbook we went over setting up a collection, rule, and provider for a SIPS workflow. Once we had the setup completed, we looked over the Cumulus workflows that participate in parsing PDRs, ingesting and processing granules, and updating CMR.

    Version: v11.0.0

    Throttling queued executions

In this entry, we will walk through how to create an SQS queue for scheduling executions, which will be used to limit those executions to a maximum concurrency, and how to configure our Cumulus workflows/rules to use this queue.

    We will also review the architecture of this feature and highlight some implementation notes.

    Limiting the number of executions that can be running from a given queue is useful for controlling the cloud resource usage of workflows that may be lower priority, such as granule reingestion or reprocessing campaigns. It could also be useful for preventing workflows from exceeding known resource limits, such as a maximum number of open connections to a data provider.

    Implementing the queue

    Create and deploy the queue

    Add a new queue

    In a .tf file for your Cumulus deployment, add a new SQS queue:

    resource "aws_sqs_queue" "background_job_queue" {
    name = "${var.prefix}-backgroundJobQueue"
    receive_wait_time_seconds = 20
    visibility_timeout_seconds = 60
    }

    Set maximum executions for the queue

    Define the throttled_queues variable for the cumulus module in your Cumulus deployment to specify the maximum concurrent executions for the queue.

    module "cumulus" {
    # ... other variables

    throttled_queues = [{
    url = aws_sqs_queue.background_job_queue.id,
    execution_limit = 5
    }]
    }

    Setup consumer for the queue

    Add the sqs2sfThrottle Lambda as the consumer for the queue and add a Cloudwatch event rule/target to read from the queue on a scheduled basis.

    Please note: You must use the sqs2sfThrottle Lambda as the consumer for any queue with a queue execution limit or else the execution throttling will not work correctly. Additionally, please allow at least 60 seconds after creation before using the queue while associated infrastructure and triggers are set up and made ready.

    aws_sqs_queue.background_job_queue.id refers to the queue resource defined above.

    resource "aws_cloudwatch_event_rule" "background_job_queue_watcher" {
    schedule_expression = "rate(1 minute)"
    }

    resource "aws_cloudwatch_event_target" "background_job_queue_watcher" {
    rule = aws_cloudwatch_event_rule.background_job_queue_watcher.name
    arn = module.cumulus.sqs2sfThrottle_lambda_function_arn
    input = jsonencode({
    messageLimit = 500
    queueUrl = aws_sqs_queue.background_job_queue.id
    timeLimit = 60
    })
    }

    resource "aws_lambda_permission" "background_job_queue_watcher" {
    action = "lambda:InvokeFunction"
    function_name = module.cumulus.sqs2sfThrottle_lambda_function_arn
    principal = "events.amazonaws.com"
    source_arn = aws_cloudwatch_event_rule.background_job_queue_watcher.arn
    }

    Re-deploy your Cumulus application

Follow the instructions to re-deploy your Cumulus application. After you have re-deployed, your workflow template will be updated to include information about the queue (the output below is partial output from an expected workflow template):

    {
    "cumulus_meta": {
    "queueExecutionLimits": {
    "<backgroundJobQueue_SQS_URL>": 5
    }
    }
    }

    Integrate your queue with workflows and/or rules

    Integrate queue with queuing steps in workflows

    For any workflows using QueueGranules or QueuePdrs that you want to use your new queue, update the Cumulus configuration of those steps in your workflows.

    As seen in this partial configuration for a QueueGranules step, update the queueUrl to reference the new throttled queue:

    Note: ${ingest_granule_workflow_name} is an interpolated value referring to a Terraform resource. See the example deployment code for the DiscoverGranules workflow.

    {
    "QueueGranules": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "queueUrl": "${aws_sqs_queue.background_job_queue.id}",
    "provider": "{$.meta.provider}",
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "granuleIngestWorkflow": "${ingest_granule_workflow_name}"
    }
    }
    }
    }
    }

    Similarly, for a QueuePdrs step:

    Note: ${parse_pdr_workflow_name} is an interpolated value referring to a Terraform resource. See the example deployment code for the DiscoverPdrs workflow.

    {
    "QueuePdrs": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "queueUrl": "${aws_sqs_queue.background_job_queue.id}",
    "provider": "{$.meta.provider}",
    "collection": "{$.meta.collection}",
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "parsePdrWorkflow": "${parse_pdr_workflow_name}"
    }
    }
    }
    }
    }

    After making these changes, re-deploy your Cumulus application for the execution throttling to take effect on workflow executions queued by these workflows.

    Create/update a rule to use your new queue

    Create or update a rule definition to include a queueUrl property that refers to your new queue:

    {
    "name": "s3_provider_rule",
    "workflow": "DiscoverAndQueuePdrs",
    "provider": "s3_provider",
    "collection": {
    "name": "MOD09GQ",
    "version": "006"
    },
    "rule": {
    "type": "onetime"
    },
    "state": "ENABLED",
    "queueUrl": "<backgroundJobQueue_SQS_URL>" // configure rule to use your queue URL
    }

    After creating/updating the rule, any subsequent invocations of the rule should respect the maximum number of executions when starting workflows from the queue.

    Architecture

    Architecture diagram showing how executions started from a queue are throttled to a maximum concurrent limit

    Execution throttling based on the queue works by manually keeping a count (semaphore) of how many executions are running for the queue at a time. The key operation that prevents the number of executions from exceeding the maximum for the queue is that before starting new executions, the sqs2sfThrottle Lambda attempts to increment the semaphore and responds as follows:

    • If the increment operation is successful, then the count was not at the maximum and an execution is started
    • If the increment operation fails, then the count was already at the maximum so no execution is started
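
To make the semaphore check described above concrete, the sketch below shows the kind of conditional, atomic increment this relies on, expressed as a DynamoDB update. The table name and attribute names are illustrative assumptions, not the actual Cumulus schema; the point is that the update fails (and no execution is started) when the stored count is already at the maximum:

aws dynamodb update-item \
--table-name <prefix>-semaphores-table \
--key '{"key": {"S": "<backgroundJobQueue_SQS_URL>"}}' \
--update-expression "ADD #count :one" \
--condition-expression "attribute_not_exists(#count) OR #count < :max" \
--expression-attribute-names '{"#count": "semvalue"}' \
--expression-attribute-values '{":one": {"N": "1"}, ":max": {"N": "5"}}'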

    Final notes

    Limiting the number of concurrent executions for work scheduled via a queue has several consequences worth noting:

    • The number of executions that are running for a given queue will be limited to the maximum for that queue regardless of which workflow(s) are started.
    • If you use the same queue to schedule executions across multiple workflows/rules, then the limit on the total number of executions running concurrently will be applied to all of the executions scheduled across all of those workflows/rules.
    • If you are scheduling the same workflow both via a queue with a maxExecutions value and a queue without a maxExecutions value, only the executions scheduled via the queue with the maxExecutions value will be limited to the maximum.
Version: v11.0.0

Tracking Ancillary Files

The UMM-G column in the table below reflects the RelatedURL's Type derived from the CNM type, whereas the ECHO10 column shows how the CNM type affects the destination element.

CNM Type  | UMM-G RelatedUrl.Type                                           | ECHO10 Location
ancillary | 'VIEW RELATED INFORMATION'                                      | OnlineResource
data      | 'GET DATA' (HTTPS URL) or 'GET DATA VIA DIRECT ACCESS' (S3 URI) | OnlineAccessURL
browse    | 'GET RELATED VISUALIZATION'                                     | AssociatedBrowseImage
linkage   | 'EXTENDED METADATA'                                             | OnlineResource
metadata  | 'EXTENDED METADATA'                                             | OnlineResource
qa        | 'EXTENDED METADATA'                                             | OnlineResource

    Common Use Cases

    This section briefly documents some common use cases and the recommended configuration for the file. The examples shown here are for the DiscoverGranules use case, which allows configuration at the Cumulus dashboard level. The other two cases covered in the ancillary metadata documentation require configuration at the provider notification level (either CNM message or PDR) and are not covered here.

    Configuring browse imagery:

    {
    "bucket": "public",
    "regex": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\_[\\d]{1}.jpg$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_1.jpg",
    "type": "browse"
    }

    Configuring a documentation entry:

    {
    "bucket": "protected",
    "regex": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\_README.pdf$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_README.pdf",
    "type": "metadata"
    }

    Configuring other associated files (use types metadata or qa as appropriate):

    {
    "bucket": "protected",
    "regex": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\_QA.txt$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_QA.txt",
    "type": "qa"
    }
    Version: v11.0.0

    API Gateway Logging

    Enabling API Gateway logging

    In order to enable distribution API Access and execution logging, configure the TEA deployment by setting log_api_gateway_to_cloudwatch on the thin_egress_app module:

    log_api_gateway_to_cloudwatch = true

    This enables the distribution API to send its logs to the default CloudWatch location: API-Gateway-Execution-Logs_<RESTAPI_ID>/<STAGE>
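
Once logging is enabled (and the account-level permissions described below are configured), you can confirm that logs are arriving with AWS CLI v2, substituting your distribution API's REST API ID and stage:

aws logs tail "API-Gateway-Execution-Logs_<RESTAPI_ID>/<STAGE>" --since 1h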

    Configure Permissions for API Gateway Logging to CloudWatch

    Instructions for enabling account level logging from API Gateway to CloudWatch

    This is a one time operation that must be performed on each AWS account to allow API Gateway to push logs to CloudWatch.

    Create a policy document

    The AmazonAPIGatewayPushToCloudWatchLogs managed policy, with an ARN of arn:aws:iam::aws:policy/service-role/AmazonAPIGatewayPushToCloudWatchLogs, has all the required permissions to enable API Gateway logging to CloudWatch. To grant these permissions to your account, first create an IAM role with apigateway.amazonaws.com as its trusted entity.

    Save this snippet as apigateway-policy.json.

    {
    "Version": "2012-10-17",
    "Statement": [
    {
    "Sid": "",
    "Effect": "Allow",
    "Principal": {
    "Service": "apigateway.amazonaws.com"
    },
    "Action": "sts:AssumeRole"
    }
    ]
    }

    Create an account role to act as ApiGateway and write to CloudWatchLogs

    NASA users in NGAP: be sure to use your account's permission boundary.

    aws iam create-role \
    --role-name ApiGatewayToCloudWatchLogs \
    [--permissions-boundary <permissionBoundaryArn>] \
    --assume-role-policy-document file://apigateway-policy.json

    Note the ARN of the returned role for the last step.

    Attach correct permissions to role

    Next attach the AmazonAPIGatewayPushToCloudWatchLogs policy to the IAM role.

    aws iam attach-role-policy \
    --role-name ApiGatewayToCloudWatchLogs \
    --policy-arn "arn:aws:iam::aws:policy/service-role/AmazonAPIGatewayPushToCloudWatchLogs"

    Update Account API Gateway settings with correct permissions

    Finally, set the IAM role ARN on the cloudWatchRoleArn property on your API Gateway Account settings.

    aws apigateway update-account \
    --patch-operations op='replace',path='/cloudwatchRoleArn',value='<ApiGatewayToCloudWatchLogs ARN>'
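
You can confirm the setting took effect by checking the account settings; the response should include the cloudwatchRoleArn you just set:

aws apigateway get-account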

    Configure API Gateway CloudWatch Logs Delivery

    See Configure Cloudwatch Logs Delivery

    Version: v11.0.0

    Configure Cloudwatch Logs Delivery

    As an optional configuration step, it is possible to deliver CloudWatch logs to a cross-account shared AWS::Logs::Destination. An operator does this by configuring the cumulus module for your deployment as shown below. The value of the log_destination_arn variable is the ARN of a writeable log destination.

    The value can be either an AWS::Logs::Destination or a Kinesis Stream ARN to which your account can write.

    log_destination_arn           = arn:aws:[kinesis|logs]:us-east-1:123456789012:[streamName|destination:logDestinationName]

    Logs Sent

By default, the following logs will be sent to the destination when one is given.

    • Ingest logs
    • Async Operation logs
    • Thin Egress App API Gateway logs (if configured)

    Additional Logs

If additional logs are needed, you can configure additional_log_groups_to_elk with the CloudWatch log groups you want to send to the destination. additional_log_groups_to_elk is a map whose keys are descriptors and whose values are the CloudWatch log group names.

    additional_log_groups_to_elk = {
    "HelloWorldTask" = "/aws/lambda/cumulus-example-HelloWorld"
    "MyCustomTask" = "my-custom-task-log-group"
    }
Version: v11.0.0

Component-based Cumulus Deployment

    With remote state, Terraform writes the state data to a remote data store, which can then be shared between all members of a team.

    The recommended approach for handling remote state with Cumulus is to use the S3 backend. This backend stores state in S3 and uses a DynamoDB table for locking.

    See the deployment documentation for a walk-through of creating resources for your remote state using an S3 backend.

    Version: v11.0.0

    Creating an S3 Bucket

    Buckets can be created on the command line with AWS CLI or via the web interface on the AWS console.

    When creating a protected bucket (a bucket containing data which will be served through the distribution API), make sure to enable S3 server access logging. See S3 Server Access Logging for more details.

    Command line

    Using the AWS command line tool create-bucket s3api subcommand:

    $ aws s3api create-bucket \
    --bucket foobar-internal \
    --region us-west-2 \
    --create-bucket-configuration LocationConstraint=us-west-2
    {
    "Location": "/foobar-internal"
    }

    Note: The region and create-bucket-configuration arguments are only necessary if you are creating a bucket outside of the us-east-1 region.

Please note that security settings and other bucket options can be set via the options listed in the s3api documentation.

    Repeat the above step for each bucket to be created.
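
For example, to turn on the S3 server access logging mentioned above for a protected bucket, one possible s3api invocation is sketched below. The bucket names and log prefix are placeholders, and the target bucket must already be configured to accept S3 access logs:

aws s3api put-bucket-logging \
--bucket foobar-protected \
--bucket-logging-status '{
  "LoggingEnabled": {
    "TargetBucket": "foobar-internal",
    "TargetPrefix": "s3-access-logs/foobar-protected/"
  }
}'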

    Web interface

    See: AWS "Creating a Bucket" documentation

    Version: v11.0.0

    Using the Cumulus Distribution API

    The Cumulus Distribution API is a set of endpoints that can be used to enable AWS Cognito authentication when downloading data from S3.

    Configuring a Cumulus Distribution deployment

    The Cumulus Distribution API is included in the main Cumulus repo. It is available as part of the terraform-aws-cumulus.zip archive in the latest release.

    These steps assume you're using the Cumulus Deployment Template but can also be used for custom deployments.

    To configure a deployment to use Cumulus Distribution:

    1. Remove or comment the "Thin Egress App Settings" in the Cumulus Template Deploy and enable the Cumulus Distribution settings.
    2. Delete or comment the contents of thin_egress_app.tf and the corresponding Thin Egress App outputs in outputs.tf. These are not necessary for a Cumulus Distribution deployment.
    3. Uncomment the Cumulus Distribution outputs in outputs.tf.
    4. Rename cumulus-template-deploy/cumulus-tf/cumulus_distribution.tf.example to cumulus-template-deploy/cumulus-tf/cumulus_distribution.tf.

    Cognito Application and User Credentials

    The major prerequisite for using the Cumulus Distribution API is to set up Cognito. If operating within NGAP, this should already be done for you. If operating outside of NGAP, you must set up Cognito yourself, which is beyond the scope of this documentation.

    Given that Cognito is set up, in order to be able to download granule files via the Cumulus Distribution API, you must obtain Cognito user credentials, because any attempt to download such files (that will be, or have been, published to the CMR via your Cumulus deployment) will result in a prompt for you to supply Cognito user credentials. To obtain your own user credentials, talk to your product owner or scrum master for additional information. They should either know how to create the credentials, know who can create them for the team, or be the liaison to the Cognito team.

Further, whoever helps to obtain your Cognito user credentials should also be able to supply you with the values for the following new variables that you must add to your cumulus-tf/terraform.tfvars file (a sample sketch follows this list):

    • csdap_host_url: The URL of the Cognito service to which your Cumulus deployment will make Cognito API calls during a distribution (download) event
    • csdap_client_id: The client ID for the Cumulus application registered within the Cognito service
    • csdap_client_password: The client password for the Cumulus application registered within the Cognito service
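A minimal terraform.tfvars sketch for these variables (the host URL and credential values below are placeholders, not real endpoints or secrets):

csdap_host_url        = "https://auth.csdap.example.gov"
csdap_client_id       = "<client id supplied by the Cognito team>"
csdap_client_password = "<client password supplied by the Cognito team>"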

    Although you might have to wait a bit for your Cognito user credentials, the remaining instructions do not depend upon having them, so you may continue with these instructions while waiting for your credentials.

    Cumulus Distribution URL

    Your Cumulus Distribution URL is used by Cumulus to generate download URLs as part of the granule metadata generated and published to the CMR. For example, a granule download URL will be of the form <distribution url>/<protected bucket>/<key> (or <distribution url>/path/to/file, if using a custom bucket map, as explained further below).

    By default, the value of your distribution URL is the URL of your private Cumulus Distribution API Gateway (the API Gateway named <prefix>-distribution, once you deploy the Cumulus Distribution module). Therefore, by default, the generated download URLs are private, and thus inaccessible directly, but there are 2 ways to address this issue (both of which are detailed below): (a) use tunneling (typically in development) or (b) put a CloudFront URL in front of your API Gateway (typically in production, and perhaps UAT and/or SIT).

    In either case, you must first know the default URL (i.e., the URL for the private Cumulus Distribution API Gateway). In order to obtain this default URL, you must first deploy your cumulus-tf module with the new Cumulus Distribution module, and once your initial deployment is complete, one of the Terraform outputs will be cumulus_distribution_api_uri, which is the URL for the private API Gateway.

    You may override this default URL by adding a cumulus_distribution_url variable to your cumulus-tf/terraform.tfvars file, and setting it to one of the following values (both of which are explained below):

    1. The default URL, but with a port added to it, in order to allow you to configure tunneling (typically only in development)
    2. A CloudFront URL placed in front of your Cumulus Distribution API Gateway (typically only for Production, but perhaps also for a UAT or SIT environment)

    The following subsections explain these approaches, in turn.

    Using your Cumulus Distribution API Gateway URL as your distribution URL

    Since your Cumulus Distribution API Gateway URL is private, the only way you can use it to confirm that your integration with Cognito is working is by using tunneling (again, generally for development), as described here. Here is an outline of the required steps, with details provided further below:

    1. Create/import a key pair into your AWS EC2 service (if you haven't already done so)
    2. Add a reference to the name of the key pair to your Terraform variables (we'll set the key_name Terraform variable)
    3. Choose an open local port on your machine (we'll use 9000 in the following details)
    4. Add a reference to the value of your cumulus_distribution_api_uri (mentioned earlier), including your chosen port (we'll set the cumulus_distribution_url Terraform variable)
    5. Redeploy Cumulus
    6. Add an entry to your /etc/hosts file
    7. Add a redirect URI to Cognito, via the Cognito API
    8. Install the Session Manager Plugin for the AWS CLI (if you haven't already done so; assuming you have already installed the AWS CLI)
    9. Add a sample file to S3 to test downloading via Cognito

    To create or import an existing key pair, you can use the AWS CLI (see aws ec2 import-key-pair), or the AWS Console (see Amazon EC2 key pairs and Linux instances).
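For example, assuming you already have a public key at ~/.ssh/id_rsa.pub, an import via the AWS CLI might look like the following sketch (the key name and profile are placeholders):

aws --profile <profile> ec2 import-key-pair \
  --key-name "<name>" \
  --public-key-material fileb://~/.ssh/id_rsa.pub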

    Once your key pair is added to AWS, add the following to your cumulus-tf/terraform.tfvars file:

    key_name = "<name>"
    cumulus_distribution_url = "https://<id>.execute-api.<region>.amazonaws.com:<port>/dev/"

    where:

    • <name> is the name of the key pair you just added to AWS
    • <id> and <region> are the corresponding parts from your cumulus_distribution_api_uri output variable
    • <port> is your open local port of choice (9000 is typically a good choice)

    Once you save your variable changes, redeploy your cumulus-tf module.

    While your deployment runs, add the following entry to your /etc/hosts file, replacing <hostname> with the host name of the cumulus_distribution_url Terraform variable you just added above:

127.0.0.1 <hostname>

    Next, you'll need to use the Cognito API to add the value of your cumulus_distribution_url Terraform variable as a Cognito redirect URI. To do so, use your favorite tool (e.g., curl, wget, Postman, etc.) to make a BasicAuth request to the Cognito API, using the following details:

    • method: POST
    • base URL: the value of your csdap_host_url Terraform variable
    • path: /authclient/updateRedirectUri
    • username: the value of your csdap_client_id Terraform variable
    • password: the value of your csdap_client_password Terraform variable
    • headers: Content-Type='application/x-www-form-urlencoded'
    • body: redirect_uri=<cumulus_distribution_url>/login

    where <cumulus_distribution_url> is the value of your cumulus_distribution_url Terraform variable. Note the /login path at the end of the redirect_uri value.
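For example, using curl, the request described above might look like the following sketch (the angle-bracketed values are your own Terraform variable values):

curl -X POST "<csdap_host_url>/authclient/updateRedirectUri" \
  -u "<csdap_client_id>:<csdap_client_password>" \
  -H "Content-Type: application/x-www-form-urlencoded" \
  --data-urlencode "redirect_uri=<cumulus_distribution_url>/login"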

    For reference, see the Cognito Authentication Service API.

    Next, install the Session Manager Plugin for the AWS CLI. If running on macOS, and you use Homebrew, you can install it simply as follows:

    brew install --cask session-manager-plugin --no-quarantine

    As your final setup step, add a sample file to one of the protected buckets listed in your buckets Terraform variable in your cumulus-tf/terraform.tfvars file. The key for the S3 object doesn't matter, nor does it matter what file you use. All that matters is that the file is an S3 object in one of your protected buckets, because Cognito is triggered when attempting to download from one of those buckets.
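For example (the bucket name and key below are placeholders; use one of your own protected buckets):

echo "hello cumulus" > sample.txt
aws s3 cp sample.txt s3://<prefix>-protected/test/sample.txt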

    At this point, you should be ready to open a tunnel and attempt to download your sample file via your browser, summarized as follows:

    1. Determine your ec2 instance ID
    2. Connect to the NASA VPN
    3. Start an AWS SSM session
    4. Open an ssh tunnel
    5. Use a browser to navigate to your file

To determine your ec2 instance ID for your Cumulus deployment, run the following command, where <profile> is the name of the appropriate AWS profile to use, and <prefix> is the value of your prefix Terraform variable:

    aws --profile <profile> ec2 describe-instances --filters Name=tag:Deployment,Values=<prefix> Name=instance-state-name,Values=running --query "Reservations[0].Instances[].InstanceId" --output text

    IMPORTANT: Before proceeding with the remaining steps, make sure you're connected to the NASA VPN.

    Use the value output from the command above in place of <id> in the following command, which will start an SSM session:

    aws ssm start-session --target <id> --document-name AWS-StartPortForwardingSession --parameters portNumber=22,localPortNumber=6000

    If successful, you should see output similar to the following:

    Starting session with SessionId: NGAPShApplicationDeveloper-***
    Port 6000 opened for sessionId NGAPShApplicationDeveloper-***.
    Waiting for connections...

    Open another terminal window, and open a tunnel with port forwarding, using your chosen port from above (e.g., 9000):

    ssh -4 -p 6000 -N -L <port>:<api-gateway-host>:443 ec2-user@127.0.0.1

    where:

    • <port> is the open local port you chose earlier (e.g., 9000)
    • <api-gateway-host> is the hostname of your private API Gateway (i.e., the host portion of the URL you used as the value of your cumulus_distribution_url Terraform variable above)

    Finally, use your chosen browser to navigate to <cumulus_distribution_url>/<bucket>/<key>, where <bucket> and <key> reference the sample file you added to S3 above.

    If all goes well, you should be prompted for your Cognito username and password. If you have obtained your Cognito user credentials, enter them, followed by entering a code generated by the authenticator application you registered at the time you completed your Cognito registration process. Once your credentials and auth code are correctly supplied, after a few moments, the download process will begin.

    Once you're finished testing, clean up as follows:

    1. Kill your ssh tunnel (Ctrl-C)
    2. Kill your AWS SSM session (Ctrl-C)
3. If you like, disconnect from the NASA VPN

    While this is a relatively lengthy process, things are much easier when using CloudFront, such as in Production (OPS), SIT, or UAT, as explained next.

    Using a CloudFront URL as your distribution URL

    In Production (OPS), and perhaps in other environments, such as UAT and SIT, you'll need to provide a publicly accessible URL for users to use for downloading (distributing) granule files.

    This is generally done by placing a CloudFront URL in front of your private Cumulus Distribution API Gateway. In order to create such a CloudFront URL, contact the person who helped you obtain your Cognito credentials, and request a CloudFront URL with the following details:

    • The private, backing URL, which is the value of your cumulus_distribution_api_uri Terraform output value
    • A request to add the AWS account's VPC to the whitelist

    Once this request is completed, and you obtain the new CloudFront URL, override your default distribution URL with the CloudFront URL by adding the following to your cumulus-tf/terraform.tfvars file:

cumulus_distribution_url = "<cloudfront_url>"

    In addition, add a Cognito redirect URI, as detailed in the previous section. Note that in this case, the value you'll use for redirect_uri is <cloudfront_url>/login since the value of your cumulus_distribution_url is now your CloudFront URL.

    At this point, it is assumed that you have added the appropriate values for this environment for the variables described at the top (csdap_host_url, csdap_client_id, and csdap_client_password).

    Redeploy Cumulus with your new/updated Terraform variables.

    As your final setup step, add a sample file to one of the protected buckets listed in your buckets Terraform variable in your cumulus-tf/terraform.tfvars file. The key for the S3 object doesn't matter, nor does it matter what file you use. All that matters is that the file is an S3 object in one of your protected buckets, because Cognito is triggered when attempting to download from one of those buckets.

    Finally, use your chosen browser to navigate to <cumulus_distribution_url>/<bucket>/<key>, where <bucket> and <key> reference the sample file you added to S3.

    If all goes well, you should be prompted for your Cognito username and password. If you have obtained your Cognito user credentials, enter them, followed by entering a code generated by the authenticator application you registered at the time you completed your Cognito registration process. Once your credentials and auth code are correctly supplied, after a few moments, the download process will begin.

    S3 Bucket Mapping

    An S3 Bucket map allows users to abstract bucket names. If the bucket names change at any point, only the bucket map would need to be updated instead of every S3 link.

    The Cumulus Distribution API uses a bucket_map.yaml or bucket_map.yaml.tmpl file to determine which buckets to serve. See the examples.

    The default Cumulus module generates a file at s3://${system_bucket}/distribution_bucket_map.json.

    The configuration file is a simple json mapping of the form:

    {
    "daac-public-data-bucket": "/path/to/this/kind/of/data"
    }

    Note: Cumulus only supports a one-to-one mapping of bucket -> Cumulus Distribution path for 'distribution' buckets. Also, the bucket map must include mappings for all of the protected and public buckets specified in the buckets variable in cumulus-tf/terraform.tfvars, otherwise Cumulus may not be able to determine the correct distribution URL for ingested files and you may encounter errors.

    Switching from the Thin Egress App to Cumulus Distribution

    If you have previously deployed the Thin Egress App (TEA) as your distribution app, you can switch to Cumulus Distribution by following the steps above.

    Note, however, that the cumulus_distribution module will generate a bucket map cache and overwrite any existing bucket map caches created by TEA.

    There will also be downtime while your API gateway is updated.

    - + \ No newline at end of file diff --git a/docs/v11.0.0/deployment/index.html b/docs/v11.0.0/deployment/index.html index 770c80d0a95..48a11f1628d 100644 --- a/docs/v11.0.0/deployment/index.html +++ b/docs/v11.0.0/deployment/index.html @@ -5,7 +5,7 @@ How to Deploy Cumulus | Cumulus Documentation - + @@ -21,7 +21,7 @@ for deployment's EC2 instances and allows you to connect to them via SSH/SSM.

    Consider the sizing of your Cumulus instance when configuring your variables.

    Choose a distribution API

    Cumulus can be configured to use either the Thin Egress App (TEA) or the Cumulus Distribution API. The default selection is the Thin Egress App if you're using the Deployment Template.

    IMPORTANT! If you already have a deployment using the TEA distribution and want to switch to Cumulus Distribution, there will be an API Gateway change. This means that there will be downtime while you update your CloudFront endpoint to use the new API gateway.

    Configure the Thin Egress App

    The Thin Egress App can be used for Cumulus distribution and is the default selection. It allows authentication using Earthdata Login. Follow the steps in the documentation to configure distribution in your cumulus-tf deployment.

    Configure the Cumulus Distribution API (optional)

    If you would prefer to use the Cumulus Distribution API, which supports AWS Cognito authentication, follow these steps to configure distribution in your cumulus-tf deployment.

    Initialize Terraform

Follow the above instructions to initialize Terraform using terraform init [1].

    Deploy

    Run terraform apply to deploy the resources. Type yes when prompted to confirm that you want to create the resources. Assuming the operation is successful, you should see output like this:

    Apply complete! Resources: 292 added, 0 changed, 0 destroyed.

    Outputs:

    archive_api_redirect_uri = https://abc123.execute-api.us-east-1.amazonaws.com/dev/token
    archive_api_uri = https://abc123.execute-api.us-east-1.amazonaws.com/dev/
    distribution_redirect_uri = https://abc123.execute-api.us-east-1.amazonaws.com/DEV/login
    distribution_url = https://abc123.execute-api.us-east-1.amazonaws.com/DEV/

    Note: Be sure to copy the redirect URLs, as you will use them to update your Earthdata application.

    Update Earthdata Application

    You will need to add two redirect URLs to your EarthData login application.

    1. Login to URS.
    2. Under My Applications -> Application Administration -> use the edit icon of your application.
    3. Under Manage -> redirect URIs, add the Archive API url returned from the stack deployment
      • e.g. archive_api_redirect_uri = https://<czbbkscuy6>.execute-api.us-east-1.amazonaws.com/dev/token.
    4. Also add the Distribution url
  • e.g. distribution_redirect_uri = https://<kido2r7kji>.execute-api.us-east-1.amazonaws.com/dev/login [2].
    5. You may delete the placeholder url you used to create the application.

If you've lost track of the needed redirect URIs, they can be located on the API Gateway. Once there, select <prefix>-archive and/or <prefix>-thin-egress-app-EgressGateway, then Dashboard, and use the base URL at the top of the page accompanied by the text Invoke this API at:. Make sure to append /token for the archive URL and /login for the thin egress app URL.


    Deploy Cumulus dashboard

    Dashboard Requirements

    Please note that the requirements are similar to the Cumulus stack deployment requirements. The installation instructions below include a step that will install/use the required node version referenced in the .nvmrc file in the dashboard repository.

    Prepare AWS

    Create S3 bucket for dashboard:

    • Create it, e.g. <prefix>-dashboard. Use the command line or console as you did when preparing AWS configuration.
    • Configure the bucket to host a website:
      • AWS S3 console: Select <prefix>-dashboard bucket then, "Properties" -> "Static Website Hosting", point to index.html
      • CLI: aws s3 website s3://<prefix>-dashboard --index-document index.html
    • The bucket's url will be http://<prefix>-dashboard.s3-website-<region>.amazonaws.com or you can find it on the AWS console via "Properties" -> "Static website hosting" -> "Endpoint"
    • Ensure the bucket's access permissions allow your deployment user access to write to the bucket

    Install dashboard

    To install the dashboard, clone the Cumulus dashboard repository into the root deploy directory and install dependencies with npm install:

      git clone https://github.com/nasa/cumulus-dashboard
    cd cumulus-dashboard
    nvm use
    npm install

    If you do not have the correct version of node installed, replace nvm use with nvm install $(cat .nvmrc) in the above example.

    Dashboard versioning

    By default, the master branch will be used for dashboard deployments. The master branch of the dashboard repo contains the most recent stable release of the dashboard.

    If you want to test unreleased changes to the dashboard, use the develop branch.

    Each release/version of the dashboard will have a tag in the dashboard repo. Release/version numbers will use semantic versioning (major/minor/patch).

    To checkout and install a specific version of the dashboard:

      git fetch --tags
    git checkout <version-number> # e.g. v1.2.0
    nvm use
    npm install

    If you do not have the correct version of node installed, replace nvm use with nvm install $(cat .nvmrc) in the above example.

    Building the dashboard

    Note: These environment variables are available during the build: APIROOT, DAAC_NAME, STAGE, HIDE_PDR. Any of these can be set on the command line to override the values contained in config.js when running the build below.

To configure your dashboard for deployment, set the APIROOT environment variable to your app's API root [3].

    Build the dashboard from the dashboard repository root directory, cumulus-dashboard:

      APIROOT=<your_api_root> npm run build
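If you want to override several of the build-time variables noted above in a single invocation, the command might look like this sketch (all values are placeholders):

  APIROOT=https://<id>.execute-api.<region>.amazonaws.com/dev DAAC_NAME=MY-DAAC STAGE=production HIDE_PDR=false npm run build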

    Dashboard deployment

    Deploy dashboard to s3 bucket from the cumulus-dashboard directory:

    Using AWS CLI:

      aws s3 sync dist s3://<prefix>-dashboard --acl public-read

    From the S3 Console:

    • Open the <prefix>-dashboard bucket, click 'upload'. Add the contents of the 'dist' subdirectory to the upload. Then select 'Next'. On the permissions window allow the public to view. Select 'Upload'.

You should be able to visit the dashboard website at http://<prefix>-dashboard.s3-website-<region>.amazonaws.com or find the URL via <prefix>-dashboard -> "Properties" -> "Static website hosting" -> "Endpoint", and log in with a user that you configured for access in the Configure and Deploy the Cumulus Stack step.


    Cumulus Instance Sizing

The Cumulus deployment default sizing for Elasticsearch instances, EC2 instances, and Autoscaling Groups is small and designed for testing and cost savings. The default settings are likely not suitable for production workloads. Sizing is highly individual and dependent on expected load and archive size.

    Please be cognizant of costs as any change in size will affect your AWS bill. AWS provides a pricing calculator for estimating costs.

    Elasticsearch

    The mappings file contains all of the data types that will be indexed into Elasticsearch. Elasticsearch sizing is tied to your archive size, including your collections, granules, and workflow executions that will be stored.

    AWS provides documentation on calculating and configuring for sizing.

In addition to size, you'll want to consider the number of nodes, which determines how the system reacts in the event of a failure.

    Configuration can be done in the data persistence module in elasticsearch_config and the cumulus module in es_index_shards.

    If you make changes to your Elasticsearch configuration you will need to reindex for those changes to take effect.

    EC2 instances and autoscaling groups

    EC2 instances are used for long-running operations (i.e. generating a reconciliation report) and long-running workflow tasks. Configuration for your ECS cluster is achieved via Cumulus deployment variables.

When configuring your ECS cluster, consider the following (a sample variable sketch follows this list):

    • The EC2 instance type and EBS volume size needed to accommodate your workloads. Configured as ecs_cluster_instance_type and ecs_cluster_instance_docker_volume_size.
    • The minimum and desired number of instances on hand to accommodate your workloads. Configured as ecs_cluster_min_size and ecs_cluster_desired_size.
    • The maximum number of instances you will need and are willing to pay for to accommodate your heaviest workloads. Configured as ecs_cluster_max_size.
    • Your autoscaling parameters: ecs_cluster_scale_in_adjustment_percent, ecs_cluster_scale_out_adjustment_percent, ecs_cluster_scale_in_threshold_percent, and ecs_cluster_scale_out_threshold_percent.
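A terraform.tfvars sketch for these variables might look like the following (the instance type and numbers are illustrative placeholders, not sizing recommendations):

ecs_cluster_instance_type                = "t3.medium"
ecs_cluster_instance_docker_volume_size  = 50
ecs_cluster_min_size                     = 1
ecs_cluster_desired_size                 = 1
ecs_cluster_max_size                     = 2
ecs_cluster_scale_in_threshold_percent   = 25
ecs_cluster_scale_in_adjustment_percent  = -5
ecs_cluster_scale_out_threshold_percent  = 75
ecs_cluster_scale_out_adjustment_percent = 10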

    Footnotes


    1. Run terraform init if:

      • This is the first time deploying the module
      • You have added any additional child modules, including Cumulus components
      • You have updated the source for any of the child modules

2. To add another redirect URI to your application: on the Earthdata home page, select "My Applications", scroll down to "Application Administration", and use the edit icon for your application. Then go to Manage -> Redirect URIs.

3. The API root can be found a number of ways. The easiest is to note it in the output of the app deployment step. But you can also find it from the AWS console -> Amazon API Gateway -> APIs -> <prefix>-archive -> Dashboard, and read the URL at the top after "Invoke this API at".

    - + \ No newline at end of file diff --git a/docs/v11.0.0/deployment/postgres_database_deployment/index.html b/docs/v11.0.0/deployment/postgres_database_deployment/index.html index 181cead672d..c953f8fe923 100644 --- a/docs/v11.0.0/deployment/postgres_database_deployment/index.html +++ b/docs/v11.0.0/deployment/postgres_database_deployment/index.html @@ -5,7 +5,7 @@ PostgreSQL Database Deployment | Cumulus Documentation - + @@ -16,7 +16,7 @@ cumulus-rds-tf that will deploy an AWS RDS Aurora Serverless PostgreSQL 10.2 compatible database cluster, and optionally provision a single deployment database with credentialed secrets for use with Cumulus.

    We have provided an example terraform deployment using this module in the Cumulus template-deploy repository on github.

    Use of this example involves:

    • Creating/configuring a Terraform module directory
    • Using Terraform to deploy resources to AWS

    Requirements

    Configuration/installation of this module requires the following:

    • Terraform
    • git
    • A VPC configured for use with Cumulus Core. This should match the subnets you provide when Deploying Cumulus to allow Core's lambdas to properly access the database.
    • At least two subnets across multiple AZs. These should match the subnets you provide as configuration when Deploying Cumulus, and should be within the same VPC.

    Needed Git Repositories

    Assumptions

    OS/Environment

The instructions in this module require Linux/macOS. While deployment via Windows is possible, it is unsupported.

    Terraform

    This document assumes knowledge of Terraform. If you are not comfortable working with Terraform, the following links should bring you up to speed:

    For Cumulus specific instructions on installation of Terraform, refer to the main Cumulus Installation Documentation

    Aurora/RDS

    This document also assumes some basic familiarity with PostgreSQL databases, and Amazon Aurora/RDS. If you're unfamiliar consider perusing the AWS docs, and the Aurora Serverless V1 docs.

    Prepare deployment repository

If you are already working with an existing repository that has a configured rds-cluster-tf deployment for the version of Cumulus you intend to deploy or update, or if you just need to configure this module for your repository, skip to Prepare AWS configuration.

    Clone the cumulus-template-deploy repo and name appropriately for your organization:

      git clone https://github.com/nasa/cumulus-template-deploy <repository-name>

    We will return to configuring this repo and using it for deployment below.

    Optional: Create a new repository

    Create a new repository on Github so that you can add your workflows and other modules to source control:

      git remote set-url origin https://github.com/<org>/<repository-name>
    git push origin master

    You can then add/commit changes as needed.

    Note: If you are pushing your deployment code to a git repo, make sure to add terraform.tf and terraform.tfvars to .gitignore, as these files will contain sensitive data related to your AWS account.


    Prepare AWS configuration

To deploy this module, make sure you have completed the following steps from the Cumulus deployment instructions, in similar fashion for this module:

    --

    Configure and deploy the module

When configuring this module, please keep in mind that, unlike the Cumulus deployment, this module should be deployed once to create the database cluster and redeployed thereafter only to change that configuration, upgrade, etc. This module does not need to be re-deployed for each Core update.

    These steps should be executed in the rds-cluster-tf directory of the template deploy repo that you previously cloned. Run the following to copy the example files:

    cd rds-cluster-tf/
    cp terraform.tf.example terraform.tf
    cp terraform.tfvars.example terraform.tfvars

In terraform.tf, configure the remote state settings by substituting the appropriate values for the following (an example sketch follows this list):

    • bucket
    • dynamodb_table
    • PREFIX (whatever prefix you've chosen for your deployment)
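A minimal terraform.tf sketch (the key path below is an assumption; adjust it to your own conventions):

terraform {
  backend "s3" {
    region         = "us-east-1"
    bucket         = "PREFIX-state"
    key            = "PREFIX/rds-cluster/terraform.tfstate"
    dynamodb_table = "PREFIX-tf-locks"
  }
}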

Fill in the appropriate values in terraform.tfvars. See the rds-cluster-tf module variable definitions for more detail on all of the configuration options. A few notable configuration options are documented in the next section, followed by a sample terraform.tfvars sketch.

    Configuration Options

    • deletion_protection -- defaults to true. Set it to false if you want to be able to delete your cluster with a terraform destroy without manually updating the cluster.
    • db_admin_username -- cluster database administration username. Defaults to postgres.
    • db_admin_password -- required variable that specifies the admin user password for the cluster. To randomize this on each deployment, consider using a random_string resource as input.
    • region -- defaults to us-east-1.
    • subnets -- requires at least 2 across different AZs. For use with Cumulus, these AZs should match the values you configure for your lambda_subnet_ids.
    • max_capacity -- the max ACUs the cluster is allowed to use. Carefully consider cost/performance concerns when setting this value.
    • min_capacity -- the minimum ACUs the cluster will scale to
    • provision_user_database -- Optional flag to allow module to provision a user database in addition to creating the cluster. Described in the next section.
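A terraform.tfvars sketch covering some of these options (the subnet IDs, capacities, and password below are placeholders; choose values appropriate to your environment):

region              = "us-east-1"
deletion_protection = true
db_admin_username   = "postgres"
db_admin_password   = "<a strong admin password>"
subnets             = ["subnet-xxxxxxxxx", "subnet-yyyyyyyyy"]
min_capacity        = 2
max_capacity        = 4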

    Provision user and user database

If you wish for the module to provision a PostgreSQL database on your new cluster and provide a secret for access in the module output, in addition to managing the cluster itself, the following configuration keys are required (see the sketch after this list):

    • provision_user_database -- must be set to true; this configures the module to deploy a lambda that will create the user database and update the provided configuration on deploy.
    • permissions_boundary_arn -- the permissions boundary to use when creating the roles the provisioning lambda will need for access. In most cases this should be the same one used for the Cumulus Core deployment.
    • rds_user_password -- the value to set the user password to.
    • prefix -- this value will be used to set a unique identifier for the ProvisionDatabase lambda, as well as to name the provisioned user/database.
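Continuing the terraform.tfvars sketch above, enabling user-database provisioning might add the following (all values are placeholders):

provision_user_database  = true
permissions_boundary_arn = "arn:aws:iam::<account id>:policy/<permissions boundary>"
rds_user_password        = "<a strong user password>"
prefix                   = "PREFIX"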

    Once configured, the module will deploy the lambda, and run it on each provision, creating the configured database if it does not exist, updating the user password if that value has been changed, and updating the output user database secret.

    Setting provision_user_database to false after provisioning will not result in removal of the configured database, as the lambda is non-destructive as configured in this module.

    Please Note: This functionality is limited in that it will only provision a single database/user and configure a basic database, and should not be used in scenarios where more complex configuration is required.

    Initialize Terraform

    Run terraform init

    You should see output like:

    * provider.aws: version = "~> 2.32"

    Terraform has been successfully initialized!

    Deploy

    Run terraform apply to deploy the resources.

    If re-applying this module, variables (e.g. engine_version, snapshot_identifier ) that force a recreation of the database cluster may result in data loss if deletion protection is disabled. Examine the changeset carefully for resources that will be re-created/destroyed before applying.

    Review the changeset, and assuming it looks correct, type yes when prompted to confirm that you want to create all of the resources.

    Assuming the operation is successful, you should see output similar to the following (this example omits the creation of a user database/lambdas/security groups):

    terraform apply

    An execution plan has been generated and is shown below.
    Resource actions are indicated with the following symbols:
    + create

    Terraform will perform the following actions:

    # module.rds_cluster.aws_db_subnet_group.default will be created
    + resource "aws_db_subnet_group" "default" {
    + arn = (known after apply)
    + description = "Managed by Terraform"
    + id = (known after apply)
    + name = (known after apply)
    + name_prefix = "xxxxxxxxx"
    + subnet_ids = [
    + "subnet-xxxxxxxxx",
    + "subnet-xxxxxxxxx",
    ]
    + tags = {
    + "Deployment" = "xxxxxxxxx"
    }
    }

    # module.rds_cluster.aws_rds_cluster.cumulus will be created
    + resource "aws_rds_cluster" "cumulus" {
    + apply_immediately = true
    + arn = (known after apply)
    + availability_zones = (known after apply)
    + backup_retention_period = 1
    + cluster_identifier = "xxxxxxxxx"
    + cluster_identifier_prefix = (known after apply)
    + cluster_members = (known after apply)
    + cluster_resource_id = (known after apply)
    + copy_tags_to_snapshot = false
    + database_name = "xxxxxxxxx"
    + db_cluster_parameter_group_name = (known after apply)
    + db_subnet_group_name = (known after apply)
    + deletion_protection = true
    + enable_http_endpoint = true
    + endpoint = (known after apply)
    + engine = "aurora-postgresql"
    + engine_mode = "serverless"
    + engine_version = "10.12"
    + final_snapshot_identifier = "xxxxxxxxx"
    + hosted_zone_id = (known after apply)
    + id = (known after apply)
    + kms_key_id = (known after apply)
    + master_password = (sensitive value)
    + master_username = "xxxxxxxxx"
    + port = (known after apply)
    + preferred_backup_window = "07:00-09:00"
    + preferred_maintenance_window = (known after apply)
    + reader_endpoint = (known after apply)
    + skip_final_snapshot = false
    + storage_encrypted = (known after apply)
    + tags = {
    + "Deployment" = "xxxxxxxxx"
    }
    + vpc_security_group_ids = (known after apply)

    + scaling_configuration {
    + auto_pause = true
    + max_capacity = 4
    + min_capacity = 2
    + seconds_until_auto_pause = 300
    + timeout_action = "RollbackCapacityChange"
    }
    }

    # module.rds_cluster.aws_secretsmanager_secret.rds_login will be created
    + resource "aws_secretsmanager_secret" "rds_login" {
    + arn = (known after apply)
    + id = (known after apply)
    + name = (known after apply)
    + name_prefix = "xxxxxxxxx"
    + policy = (known after apply)
    + recovery_window_in_days = 30
    + rotation_enabled = (known after apply)
    + rotation_lambda_arn = (known after apply)
    + tags = {
    + "Deployment" = "xxxxxxxxx"
    }

    + rotation_rules {
    + automatically_after_days = (known after apply)
    }
    }

    # module.rds_cluster.aws_secretsmanager_secret_version.rds_login will be created
    + resource "aws_secretsmanager_secret_version" "rds_login" {
    + arn = (known after apply)
    + id = (known after apply)
    + secret_id = (known after apply)
    + secret_string = (sensitive value)
    + version_id = (known after apply)
    + version_stages = (known after apply)
    }

    # module.rds_cluster.aws_security_group.rds_cluster_access will be created
    + resource "aws_security_group" "rds_cluster_access" {
    + arn = (known after apply)
    + description = "Managed by Terraform"
    + egress = (known after apply)
    + id = (known after apply)
    + ingress = (known after apply)
    + name = (known after apply)
    + name_prefix = "cumulus_rds_cluster_access_ingress"
    + owner_id = (known after apply)
    + revoke_rules_on_delete = false
    + tags = {
    + "Deployment" = "xxxxxxxxx"
    }
    + vpc_id = "vpc-xxxxxxxxx"
    }

    # module.rds_cluster.aws_security_group_rule.rds_security_group_allow_PostgreSQL will be created
    + resource "aws_security_group_rule" "rds_security_group_allow_postgres" {
    + from_port = 5432
    + id = (known after apply)
    + protocol = "tcp"
    + security_group_id = (known after apply)
    + self = true
    + source_security_group_id = (known after apply)
    + to_port = 5432
    + type = "ingress"
    }

    Plan: 6 to add, 0 to change, 0 to destroy.

    Do you want to perform these actions?
    Terraform will perform the actions described above.
    Only 'yes' will be accepted to approve.

    Enter a value: yes

    module.rds_cluster.aws_db_subnet_group.default: Creating...
    module.rds_cluster.aws_security_group.rds_cluster_access: Creating...
    module.rds_cluster.aws_secretsmanager_secret.rds_login: Creating...

    Then, after the resources are created:

    Apply complete! Resources: X added, 0 changed, 0 destroyed.
    Releasing state lock. This may take a few moments...

    Outputs:

    admin_db_login_secret_arn = arn:aws:secretsmanager:us-east-1:xxxxxxxxx:secret:xxxxxxxxxx20210407182709367700000002-dpmdR
    admin_db_login_secret_version = xxxxxxxxx
    rds_endpoint = xxxxxxxxx.us-east-1.rds.amazonaws.com
    security_group_id = xxxxxxxxx
    user_credentials_secret_arn = arn:aws:secretsmanager:us-east-1:xxxxx:secret:xxxxxxxxxx20210407182709367700000002-dpmpXA

    Note the output values for admin_db_login_secret_arn (and optionally user_credentials_secret_arn) as these provide the AWS Secrets Manager secret required to access the database as the administrative user and, optionally, the user database credentials Cumulus requires as well.
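If helpful, each secret's value can be retrieved with the AWS CLI, for example (substitute the ARN noted above):

aws secretsmanager get-secret-value \
  --secret-id <admin_db_login_secret_arn> \
  --query SecretString \
  --output text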

The content of each of these secrets is of the form:

    {
    "database": "postgres",
    "dbClusterIdentifier": "clusterName",
    "engine": "postgres",
    "host": "xxx",
    "password": "defaultPassword",
    "port": 5432,
    "username": "xxx"
    }
    • database -- the PostgreSQL database used by the configured user
    • dbClusterIdentifier -- the value set by the cluster_identifier variable in the terraform module
    • engine -- the Aurora/RDS database engine
    • host -- the RDS service host for the database in the form (dbClusterIdentifier)-(AWS ID string).(region).rds.amazonaws.com
    • password -- the database password
    • username -- the account username
    • port -- The database connection port, should always be 5432

    Next Steps

    The database cluster has been created/updated! From here you can continue to add additional user accounts, databases and other database configuration.

    - + \ No newline at end of file diff --git a/docs/v11.0.0/deployment/share-s3-access-logs/index.html b/docs/v11.0.0/deployment/share-s3-access-logs/index.html index 7b42e296eab..9ebc02e190d 100644 --- a/docs/v11.0.0/deployment/share-s3-access-logs/index.html +++ b/docs/v11.0.0/deployment/share-s3-access-logs/index.html @@ -5,14 +5,14 @@ Share S3 Access Logs | Cumulus Documentation - +
    Version: v11.0.0

    Share S3 Access Logs

    It is possible through Cumulus to share S3 access logs across multiple S3 packages using the S3 replicator package.

    S3 Replicator

    The S3 Replicator is a node package that contains a simple lambda function, associated permissions, and the Terraform instructions to replicate create-object events from one S3 bucket to another.

    First ensure that you have enabled S3 Server Access Logging.

    Next configure your config.tfvars as described in the s3-replicator/README.md to correspond to your deployment. The source_bucket and source_prefix are determined by how you enabled the S3 Server Access Logging.
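As a sketch, the replication-specific values might look like the following (bucket names and prefixes are placeholders; your config.tfvars will also include the deployment values described in the README, and the target values come from the metrics team if you are delivering logs to them):

source_bucket = "<bucket receiving your S3 server access logs>"
source_prefix = "<prefix configured for your access logs>"
target_bucket = "<destination bucket>"
target_prefix = "<destination prefix>"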

In order to deploy the s3-replicator with Cumulus, you will need to add the module to your Terraform main.tf definition, e.g.:

    module "s3-replicator" {
    source = "<path to s3-replicator.zip>"
    prefix = var.prefix
    vpc_id = var.vpc_id
    subnet_ids = var.subnet_ids
    permissions_boundary = var.permissions_boundary_arn
    source_bucket = var.s3_replicator_config.source_bucket
    source_prefix = var.s3_replicator_config.source_prefix
    target_bucket = var.s3_replicator_config.target_bucket
    target_prefix = var.s3_replicator_config.target_prefix
    }

    The terraform source package can be found on the Cumulus github release page under the asset tab terraform-aws-cumulus-s3-replicator.zip.

    ESDIS Metrics

    In the NGAP environment, the ESDIS Metrics team has set up an ELK stack to process logs from Cumulus instances. To use this system, you must deliver any S3 Server Access logs that Cumulus creates.

    Configure the S3 replicator as described above using the target_bucket and target_prefix provided by the metrics team.

    The metrics team has taken care of setting up Logstash to ingest the files that get delivered to their bucket into their Elasticsearch instance.

    - + \ No newline at end of file diff --git a/docs/v11.0.0/deployment/terraform-best-practices/index.html b/docs/v11.0.0/deployment/terraform-best-practices/index.html index 5cbad0784c1..80fbc716ac7 100644 --- a/docs/v11.0.0/deployment/terraform-best-practices/index.html +++ b/docs/v11.0.0/deployment/terraform-best-practices/index.html @@ -5,7 +5,7 @@ Terraform Best Practices | Cumulus Documentation - + @@ -88,7 +88,7 @@ AWS CLI command, replacing PREFIX with your deployment prefix name:

    aws resourcegroupstaggingapi get-resources \
    --query "ResourceTagMappingList[].ResourceARN" \
    --tag-filters Key=Deployment,Values=PREFIX

    Ideally, the output should be an empty list, but if it is not, then you may need to manually delete the listed resources.

Configuring the Cumulus deployment: link
Restoring a previous version: link

    - + \ No newline at end of file diff --git a/docs/v11.0.0/deployment/thin_egress_app/index.html b/docs/v11.0.0/deployment/thin_egress_app/index.html index e5b2de9dbcf..d9e2afd2499 100644 --- a/docs/v11.0.0/deployment/thin_egress_app/index.html +++ b/docs/v11.0.0/deployment/thin_egress_app/index.html @@ -5,7 +5,7 @@ Using the Thin Egress App for Cumulus distribution | Cumulus Documentation - + @@ -13,7 +13,7 @@
    Version: v11.0.0

    Using the Thin Egress App for Cumulus distribution

    The Thin Egress App (TEA) is an app running in Lambda that allows retrieving data from S3 using temporary links and provides URS integration.

    Configuring a TEA deployment

    TEA is deployed using Terraform modules. Refer to these instructions for guidance on how to integrate new components with your deployment.

    The cumulus-template-deploy repository cumulus-tf/main.tf contains a thin_egress_app for distribution.

The TEA module provides these instructions for adding it to your deployment; the following sections describe how to configure the thin_egress_app module in your Cumulus deployment.

    Create a secret for signing Thin Egress App JWTs

    The Thin Egress App uses JWTs internally to authenticate requests and requires a secret stored in AWS Secrets Manager containing SSH keys that are used to sign the JWTs.

    See the Thin Egress App documentation on how to create this secret with the correct values. It will be used later to set the thin_egress_jwt_secret_name variable when deploying the Cumulus module.

    bucket_map.yaml

    The Thin Egress App uses a bucket_map.yaml file to determine which buckets to serve. Documentation of the file format is available here.

    The default Cumulus module generates a file at s3://${system_bucket}/distribution_bucket_map.json.

    The configuration file is a simple json mapping of the form:

    {
    "daac-public-data-bucket": "/path/to/this/kind/of/data"
    }

    Please note: Cumulus only supports a one-to-one mapping of bucket->TEA path for 'distribution' buckets.

    Optionally configure a custom bucket map

    A simple config would look something like this:

    bucket_map.yaml
    MAP:
    my-protected: my-protected
    my-public: my-public

    PUBLIC_BUCKETS:
    - my-public

    Please note: your custom bucket map must include mappings for all of the protected and public buckets specified in the buckets variable in cumulus-tf/terraform.tfvars, otherwise Cumulus may not be able to determine the correct distribution URL for ingested files and you may encounter errors.

    Optionally configure shared variables

    The cumulus module deploys certain components that interact with TEA. As a result, the cumulus module requires that if you are specifying a value for the stage_name variable to the TEA module, you must use the same value for the tea_api_gateway_stage variable to the cumulus module.

    One way to keep these variable values in sync across the modules is to use Terraform local values to define values to use for the variables for both modules. This approach is shown in the Cumulus core example deployment code.
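As a sketch, a single local value could be defined once and passed to both modules (module bodies abbreviated; the stage name is only an example):

locals {
  tea_stage_name = "DEV"
}

module "thin_egress_app" {
  # ... other TEA configuration ...
  stage_name = local.tea_stage_name
}

module "cumulus" {
  # ... other Cumulus configuration ...
  tea_api_gateway_stage = local.tea_stage_name
}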

    - + \ No newline at end of file diff --git a/docs/v11.0.0/deployment/upgrade-readme/index.html b/docs/v11.0.0/deployment/upgrade-readme/index.html index 6ae22ff27b1..09d8381405a 100644 --- a/docs/v11.0.0/deployment/upgrade-readme/index.html +++ b/docs/v11.0.0/deployment/upgrade-readme/index.html @@ -5,7 +5,7 @@ Upgrading Cumulus | Cumulus Documentation - + @@ -15,7 +15,7 @@ deployment functions correctly. Please refer to some recommended smoke tests given above, and consider additional tests appropriate for your particular deployment and environment.

    Update Cumulus Dashboard

    If there are breaking (or otherwise significant) changes to the Cumulus API, you should also upgrade your Cumulus Dashboard deployment to use the version of the Cumulus API matching the version of Cumulus to which you are migrating.

    - + \ No newline at end of file diff --git a/docs/v11.0.0/development/forked-pr/index.html b/docs/v11.0.0/development/forked-pr/index.html index 9b09c391791..dd8bd0d43b1 100644 --- a/docs/v11.0.0/development/forked-pr/index.html +++ b/docs/v11.0.0/development/forked-pr/index.html @@ -5,13 +5,13 @@ Issuing PR From Forked Repos | Cumulus Documentation - +
    Version: v11.0.0

    Issuing PR From Forked Repos

    Fork the Repo

    • Fork the Cumulus repo
    • Create a new branch from the branch you'd like to contribute to
• If an issue doesn't already exist, submit one (see above)

    Create a Pull Request

    Reviewing PRs from Forked Repos

    Upon submission of a pull request, the Cumulus development team will review the code.

    Once the code passes an initial review, the team will run the CI tests against the proposed update.

    The request will then either be merged, declined, or an adjustment to the code will be requested via the issue opened with the original PR request.

PRs from forked repos cannot be merged directly to master. Cumulus reviewers must follow these steps before completing the review process:

    1. Create a new branch:

        git checkout -b from-<name-of-the-branch> master
    2. Push the new branch to GitHub

    3. Change the destination of the forked PR to the new branch that was just pushed

      Screenshot of Github interface showing how to change the base branch of a pull request

    4. After code review and approval, merge the forked PR to the new branch.

    5. Create a PR for the new branch to master.

6. If the CI tests pass, merge the new branch to master and close the issue. If the CI tests do not pass, request an amended PR from the original author, or resolve failures as appropriate.

    - + \ No newline at end of file diff --git a/docs/v11.0.0/development/integration-tests/index.html b/docs/v11.0.0/development/integration-tests/index.html index 9868da2265f..51e61492080 100644 --- a/docs/v11.0.0/development/integration-tests/index.html +++ b/docs/v11.0.0/development/integration-tests/index.html @@ -5,7 +5,7 @@ Integration Tests | Cumulus Documentation - + @@ -19,7 +19,7 @@ in the commit message.

    If you create a new stack and want to be able to run integration tests against it in CI, you will need to add it to bamboo/select-stack.js.

    - + \ No newline at end of file diff --git a/docs/v11.0.0/development/quality-and-coverage/index.html b/docs/v11.0.0/development/quality-and-coverage/index.html index 8ecf0dadf88..5eb4a421546 100644 --- a/docs/v11.0.0/development/quality-and-coverage/index.html +++ b/docs/v11.0.0/development/quality-and-coverage/index.html @@ -5,7 +5,7 @@ Code Coverage and Quality | Cumulus Documentation - + @@ -23,7 +23,7 @@ here.

    To run linting on the markdown files, run npm run lint-md.

    Audit

    This project uses audit-ci to run a security audit on the package dependency tree. This must pass prior to merge. The configured rules for audit-ci can be found here.

    To execute an audit, run npm run audit.

    - + \ No newline at end of file diff --git a/docs/v11.0.0/development/release/index.html b/docs/v11.0.0/development/release/index.html index 81da20e0962..23cf8a6642f 100644 --- a/docs/v11.0.0/development/release/index.html +++ b/docs/v11.0.0/development/release/index.html @@ -5,7 +5,7 @@ Versioning and Releases | Cumulus Documentation - + @@ -15,7 +15,7 @@ It's useful to use the search feature of your code editor or grep to see if there any references to the old package versions. In bash shell you can run

    find . -name package.json -exec grep -nH "@cumulus/.*MAJOR\.MINOR\.PATCH.*" {} \;

Verify that each of those is updated to the new MAJOR.MINOR.PATCH version you are trying to release.

    A similar search for alpha and beta versions should be run on the release version and any problems should be fixed.

    find . -name package.json -exec grep -nHE "MAJOR\.MINOR\.PATCH.*(alpha|beta)" {} \;

    3. Check Cumulus Dashboard PRs for Version Bump

    There may be unreleased changes in the Cumulus Dashboard project that rely on this unreleased Cumulus Core version.

If a PR exists in the cumulus-dashboard repo with a name containing "Version Bump for Next Cumulus API Release":

    • There will be a placeholder change-me value that should be replaced with the Cumulus Core to-be-released-version.
    • Mark that PR as ready to be reviewed.

    4. Update CHANGELOG.md

    Update the CHANGELOG.md. Put a header under the Unreleased section with the new version number and the date.

    Add a link reference for the github "compare" view at the bottom of the CHANGELOG.md, following the existing pattern. This link reference should create a link in the CHANGELOG's release header to changes in the corresponding release.

    5. Update DATA_MODEL_CHANGELOG.md

    Similar to #4, make sure the DATA_MODEL_CHANGELOG is updated if there are data model changes in the release, and the link reference at the end of the document is updated as appropriate.

    6. Update CONTRIBUTORS.md

    ./bin/update-contributors.sh
    git add CONTRIBUTORS.md

    Commit and push these changes, if any.

    7. Update Cumulus package API documentation

    Update auto-generated API documentation for any Cumulus packages that have it:

    npm run docs-build-packages

    Commit and push these changes, if any.

    8. Cut new version of Cumulus Documentation

    If this is a backport, do not create a new version of the documentation. For various reasons, we do not merge backports back to master, other than changelog notes. Documentation changes for backports will not be published to our documentation website.

    cd website
    npm run version ${release_version}
    git add .

    Where ${release_version} corresponds to the version tag v1.2.3, for example.

    Commit and push these changes.

    9. Create a pull request against the minor version branch

    1. Push the release branch (e.g. release-1.2.3) to GitHub.

    2. Create a PR against the minor version base branch (e.g. release-1.2.x).

    3. Configure Bamboo to run automated tests against this PR by finding the branch plan for the release branch (release-1.2.3) and setting only these variables:

      • GIT_PR: true
      • SKIP_AUDIT: true

      IMPORTANT: Do NOT set the PUBLISH_FLAG variable to true for this branch plan. The actual publishing of the release will be handled by a separate, manually triggered branch plan.

      Screenshot of Bamboo CI interface showing the configuration of the GIT_PR branch variable to have a value of &quot;true&quot;

    4. Verify that the Bamboo build for the PR succeeds and then merge to the minor version base branch (release-1.2.x).

      • It is safe to do a squash merge in this instance, but not required
    5. You may delete your release branch (release-1.2.3) after merging to the base branch.

    10. Create a git tag for the release

    Check out the minor version base branch (release-1.2.x) now that your changes are merged in and do a git pull.

    Ensure you are on the latest commit.

    Create and push a new git tag:

        git tag -a vMAJOR.MINOR.PATCH -m "Release MAJOR.MINOR.PATCH"
    git push origin vMAJOR.MINOR.PATCH

    e.g.:
    git tag -a v9.1.0 -m "Release 9.1.0"
    git push origin v9.1.0

    11. Publishing the release

    Publishing of new releases is handled by a custom Bamboo branch plan and is manually triggered.

    The reasons for using a separate branch plan to handle releases instead of the branch plan for the minor version (e.g. release-1.2.x) are:

    • The Bamboo build for the minor version release branch is triggered automatically on any commits to that branch, whereas we want to manually control when the release is published.
    • We want to verify that integration tests have passed on the Bamboo build for the minor version release branch before we manually trigger the release, so that we can be sure that our code is safe to release.

    If this is a new minor version branch, then you will need to create a new Bamboo branch plan for publishing the release following the instructions below:

    Creating a Bamboo branch plan for the release

    • In the Cumulus Core project (https://ci.earthdata.nasa.gov/browse/CUM-CBA), click Actions -> Configure Plan in the top right.

    • Next to Plan branch click the rightmost button that displays Create Plan Branch upon hover.

    • Click Create plan branch manually.

• Add the values in that list. Choose a display name that makes it very clear this is a deployment branch plan. Release (minor version branch name) seems to work well (e.g. Release (1.2.x)).

      • Make sure you enter the correct branch name (e.g. release-1.2.x).
• Important: Deselect Enable Branch - if you do not do this, it will immediately fire off a build.

• Do Immediately: On the Branch Details page, enable Change trigger. Set the Trigger type to manual; this will prevent commits to the branch from triggering the build plan. (You should have been redirected to the Branch Details tab after creating the plan. If not, navigate to the branch from the list where you clicked Create Plan Branch in the previous step.)

• Go to the Variables tab. Ensure that you are on your branch plan and not the master plan: you should not see a large list of configured variables, but instead a dropdown allowing you to select variables to override, and the tab title will be Branch Variables. Then set the branch variables as follows:

      • DEPLOYMENT: cumulus-from-npm-tf (except in special cases such as incompatible backport branches)
        • If this variable is not set, it will default to the deployment name for the last committer on the branch
      • USE_CACHED_BOOTSTRAP: false
      • USE_TERRAFORM_ZIPS: true (IMPORTANT: MUST be set in order to run integration tests against the .zip files published during the build so that we are actually testing our released files)
      • GIT_PR: true
      • SKIP_AUDIT: true
      • PUBLISH_FLAG: true
    • Enable the branch from the Branch Details page.

    • Run the branch using the Run button in the top right.

    Bamboo will build and run lint and unit tests against that tagged release, publish the new packages to NPM, and then run the integration tests using those newly released packages.

    12. Create a new Cumulus release on github

    The CI release scripts will automatically create a GitHub release based on the release version tag, as well as upload artifacts to the Github release for the Terraform modules provided by Cumulus. The Terraform release artifacts include:

    • A multi-module Terraform .zip artifact containing filtered copies of the tf-modules, packages, and tasks directories for use as Terraform module sources.
• An S3 replicator module
    • A workflow module
    • A distribution API module
    • An ECS service module

    Just make sure to verify the appropriate .zip files are present on Github after the release process is complete.

    13. Merge base branch back to master

Finally, you need to bring the version update changes back to master.

    If this is the latest version, you can simply create a PR to merge the minor version base branch back to master.

    Do not merge master back into the release branch since we want the release branch to just have the code from the release. Instead, create a new branch off of the release branch and merge that to master. You can freely merge master into this branch and delete it when it is merged to master.

    If this is a backport, you will need to create a PR that ports the changelog updates back to master. It is important in this changelog note to call it out as a backport. For example, fixes in backport version 1.14.5 may not be available in 1.15.0 because the fix was introduced in 1.15.3.

    Troubleshooting

    Delete and regenerate the tag

    To delete a published tag to re-tag, follow these steps:

      git tag -d vMAJOR.MINOR.PATCH
    git push -d origin vMAJOR.MINOR.PATCH

    e.g.:
    git tag -d v9.1.0
    git push -d origin v9.1.0
    - + \ No newline at end of file diff --git a/docs/v11.0.0/docs-how-to/index.html b/docs/v11.0.0/docs-how-to/index.html index 7c929323271..1b826c846c5 100644 --- a/docs/v11.0.0/docs-how-to/index.html +++ b/docs/v11.0.0/docs-how-to/index.html @@ -5,13 +5,13 @@ Cumulus Documentation: How To's | Cumulus Documentation - +
    Version: v11.0.0

    Cumulus Documentation: How To's

    Cumulus Docs Installation

    Run a Local Server

    Environment variables DOCSEARCH_API_KEY and DOCSEARCH_INDEX_NAME must be set for search to work. At the moment, search is only truly functional on prod because that is the only website we have registered to be indexed with DocSearch (see below on search).

    git clone git@github.com:nasa/cumulus
    cd cumulus
    npm run docs-install
    npm run docs-serve

    Note: docs-build will build the documents into website/build.
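    For example, a local build might look like the following (the DocSearch values shown are placeholders; use the values provisioned for your site):

      export DOCSEARCH_API_KEY=replace-with-your-api-key
      export DOCSEARCH_INDEX_NAME=replace-with-your-index-name
      npm run docs-build   # output is written to website/build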

    Cumulus Documentation

    Our project documentation is hosted on GitHub Pages. The resources published to this website are housed in the docs/ directory at the top of the Cumulus repository. Those resources primarily consist of markdown files and images.

    We use the open-source static website generator Docusaurus to build html files from our markdown documentation, add some organization and navigation, and provide some other niceties in the final website (search, easy templating, etc.).

    Add a New Page and Sidebars

    Adding a new page should be as simple as writing some documentation in markdown, placing it under the correct directory in the docs/ folder and adding some configuration values wrapped by --- at the top of the file. There are many files that already have this header which can be used as reference.

    ---
    id: doc-unique-id # unique id for this document. This must be unique across ALL documentation under docs/
    title: Title Of Doc # Whatever title you feel like adding. This will show up as the index to this page on the sidebar.
    hide_title: false
    ---

    Note: To have the new page show up in a sidebar, the designated id must be added to a sidebar in the website/sidebars.js file. Docusaurus has an in-depth explanation of sidebars here.

    Versioning Docs

    We lean heavily on Docusaurus for versioning. Their suggestions and walk-through can be found here. It is worth noting that we would like the Documentation versions to match up directly with release versions. Cumulus versioning is explained in the Versioning Docs.

    Search

    Search on our documentation site is taken care of by DocSearch. We have been provided with an apiKey and an indexName by DocSearch that we include in our website/siteConfig.js file. The rest, indexing and actual searching, we leave to DocSearch. Our builds expect environment variables for both these values to exist - DOCSEARCH_API_KEY and DOCSEARCH_INDEX_NAME.

    Add a new task

    The tasks list in docs/tasks.md is generated from the list of task packages in the tasks folder. Do not edit the docs/tasks.md file directly.

    Read more about adding a new task.

    Editing the tasks.md header or template

    Look at the bin/build-tasks-doc.js and bin/tasks-header.md files to edit the output of the tasks build script.

    Editing diagrams

    For some diagrams included in the documentation, the raw source is included in the docs/assets/raw directory to allow for easy updating in the future:

    • assets/interfaces.svg -> assets/raw/interfaces.drawio (generated using draw.io)

    Deployment

    The master branch is automatically built and deployed to the gh-pages branch. The gh-pages branch is served by GitHub Pages. Do not make edits to the gh-pages branch.

    - + \ No newline at end of file diff --git a/docs/v11.0.0/external-contributions/index.html b/docs/v11.0.0/external-contributions/index.html index 4a72e0b6d6e..e29eaffd192 100644 --- a/docs/v11.0.0/external-contributions/index.html +++ b/docs/v11.0.0/external-contributions/index.html @@ -5,13 +5,13 @@ External Contributions | Cumulus Documentation - +
    Version: v11.0.0

    External Contributions

    Contributions to Cumulus may be made in the form of PRs to the repositories directly or through externally developed tasks and components. Cumulus is designed as an ecosystem that leverages Terraform deployments and AWS Step Functions to easily integrate external components.

    This list may not be exhaustive and represents components that are open source, owned externally, and that have been tested with the Cumulus system. For more information and contributing guidelines, visit the respective GitHub repositories.

    Distribution

    The ASF Thin Egress App is used by Cumulus for distribution. TEA can be deployed with Cumulus or as part of other applications to distribute data.

    Operational Cloud Recovery Archive (ORCA)

    ORCA can be deployed with Cumulus to provide a customizable baseline for creating and managing operational backups.

    Workflow Tasks

    CNM

    PO.DAAC provides two workflow tasks to be used with the Cloud Notification Mechanism (CNM) Schema: CNM to Granule and CNM Response.

    See the CNM workflow data cookbook for an example of how these can be used in a Cumulus ingest workflow.

    DMR++ Generation

    GHRC has provided a DMR++ Generation workflow task. This task is meant to be used in conjunction with Cumulus' Hyrax Metadata Updates workflow task.

    - + \ No newline at end of file diff --git a/docs/v11.0.0/faqs/index.html b/docs/v11.0.0/faqs/index.html index 5a1637a9072..fda6ef12397 100644 --- a/docs/v11.0.0/faqs/index.html +++ b/docs/v11.0.0/faqs/index.html @@ -5,13 +5,13 @@ Frequently Asked Questions | Cumulus Documentation - +
    Version: v11.0.0

    Frequently Asked Questions

    Below are answers to some commonly asked questions that can assist you along the way when working with Cumulus.

    General

    How do I deploy a new instance in Cumulus?

    Answer: For steps on the Cumulus deployment process go to How to Deploy Cumulus.

    What prerequisites are needed to setup Cumulus?

    Answer: You will need access to the AWS console and an Earthdata login before you can deploy Cumulus.

    What is the preferred web browser for the Cumulus environment?

    Answer: Our preferred web browser is the latest version of Google Chrome.

    How do I quickly troubleshoot an issue in Cumulus?

    Answer: To troubleshoot and fix issues in Cumulus reference our recommended solutions in Troubleshooting Cumulus.

    Where can I get support help?

    Answer: The following options are available for assistance:

    • Cumulus: Users outside NASA should file a GitHub issue and users inside NASA should file a JIRA issue.
    • AWS: You can create a case in the AWS Support Center, accessible via your AWS Console.

    Integrators & Developers

    What is a Cumulus integrator?

    Answer: Those who are working within Cumulus and AWS for deployments and to manage workflows. They may perform the following functions:

    • Configure and deploy Cumulus to the AWS environment
    • Configure Cumulus workflows
    • Write custom workflow tasks
    What are the steps if I run into an issue during deployment?

    Answer: If you encounter an issue with your deployment go to the Troubleshooting Deployment guide.

    Is Cumulus customizable and flexible?

    Answer: Yes. Cumulus is a modular architecture that allows you to decide which components you want/need to deploy. These components are maintained as Terraform modules.

    What are Terraform modules?

    Answer: They are modules that are composed to create a Cumulus deployment, which gives integrators the flexibility to choose the components of Cumulus that they want/need. To view Cumulus-maintained modules or steps on how to create a module, go to Terraform modules.

    Where do I find Terraform module variables?

    Answer: Go here for a list of Cumulus maintained variables.

    What is a Cumulus workflow?

    Answer: A workflow is a provider-configured set of steps that describe the process to ingest data. Workflows are defined using AWS Step Functions. For more details, we suggest visiting here.

    How do I set up a Cumulus workflow?

    Answer: You will need to create a provider, have an associated collection (add a new one), and generate a new rule first. Then you can set up a Cumulus workflow by following these steps here.

    What are the common use cases that a Cumulus integrator encounters?

    Answer: The following are some examples of possible use cases you may see:


    Operators

    What is a Cumulus operator?

    Answer: Those who ingest, archive, and troubleshoot datasets (called collections in Cumulus). Your daily activities might include, but are not limited to, the following:

    • Ingesting datasets
    • Maintaining historical data ingest
    • Starting and stopping data handlers
    • Managing collections
    • Managing provider definitions
    • Creating, enabling, and disabling rules
    • Investigating errors for granules and deleting or re-ingesting granules
    • Investigating errors in executions and isolating failed workflow step(s)
    What are the common use cases that a Cumulus operator encounters?

    Answer: The following are some examples of possible use cases you may see:

    Can you re-run a workflow execution in AWS?

    Answer: Yes. For steps on how to re-run a workflow execution go to Re-running workflow executions in the Cumulus Operator Docs.

    - + \ No newline at end of file diff --git a/docs/v11.0.0/features/ancillary_metadata/index.html b/docs/v11.0.0/features/ancillary_metadata/index.html index e0b4472f0f8..ddcee3e194c 100644 --- a/docs/v11.0.0/features/ancillary_metadata/index.html +++ b/docs/v11.0.0/features/ancillary_metadata/index.html @@ -5,7 +5,7 @@ Ancillary Metadata Export | Cumulus Documentation - + @@ -13,7 +13,7 @@
    Version: v11.0.0

    Ancillary Metadata Export

    This feature utilizes the type key on a files object in a Cumulus granule. It uses the key to provide a mechanism where granule discovery, processing and other tasks can set and use this value to facilitate metadata export to CMR.

    Tasks setting type

    Discover Granules

    Uses the Collection type key to set the value for files on discovered granules in its output.

    Parse PDR

    Uses a task-specific mapping to map PDR 'FILE_TYPE' to a CNM type to set type on granules from the PDR.

    CNMToCMALambdaFunction

    Natively supports types that are included in incoming messages to a CNM Workflow.

    Tasks using type

    Move Granules

    Uses the granule file type key to update UMM/ECHO 10 CMR files passed in as candidates to the task. This task adds the external facing URLs to the CMR metadata file based on the type. See the file tracking data cookbook for a detailed mapping. If a non-CNM type is specified, the task assumes it is a 'data' file.

    - + \ No newline at end of file diff --git a/docs/v11.0.0/features/backup_and_restore/index.html b/docs/v11.0.0/features/backup_and_restore/index.html index f537b470663..53cb272fd1c 100644 --- a/docs/v11.0.0/features/backup_and_restore/index.html +++ b/docs/v11.0.0/features/backup_and_restore/index.html @@ -5,7 +5,7 @@ Cumulus Backup and Restore | Cumulus Documentation - + @@ -52,7 +52,7 @@ writing to the old cluster.

  • Set the snapshot_identifier variable to the snapshot you wish to restore from, and configure the module like a new deployment, with a unique cluster_identifier

  • Deploy the module using terraform apply

  • Once deployed, verify the cluster has the expected data

  • Redeploy the data persistence and Cumulus deployments - You should not need to reconfigure either, as the secret ARN and the security group should not change; however, double-check that the configured values are as expected

  • - + \ No newline at end of file diff --git a/docs/v11.0.0/features/dead_letter_archive/index.html b/docs/v11.0.0/features/dead_letter_archive/index.html index 5c655436d28..695e1fcc5e2 100644 --- a/docs/v11.0.0/features/dead_letter_archive/index.html +++ b/docs/v11.0.0/features/dead_letter_archive/index.html @@ -5,13 +5,13 @@ Cumulus Dead Letter Archive | Cumulus Documentation - +
    Version: v11.0.0

    Cumulus Dead Letter Archive

    This documentation explains the Cumulus dead letter archive and associated functionality.

    DB Records DLQ Archive

    The Cumulus system contains a number of dead letter queues. Perhaps the most important system lambda function supported by a DLQ is the sfEventSqsToDbRecords lambda function which parses Cumulus messages from workflow executions to generate and write database records to the Cumulus database.

    As of Cumulus v9+, the dead letter queue for this lambda (named sfEventSqsToDbRecordsDeadLetterQueue) has been updated with a consumer lambda that will automatically write any incoming records to the S3 system bucket, under the path <stackName>/dead-letter-archive/sqs/. This will allow integrators and operators engaged in debugging missing records to inspect any Cumulus messages which failed to process and did not result in the successful creation of database records.
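    For example, to inspect what has accumulated in the archive for a given stack, you can list the path directly (bucket and stack names here are hypothetical):

      aws s3 ls s3://my-internal-bucket/my-cumulus-stack/dead-letter-archive/sqs/ --recursive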

    Dead Letter Archive recovery

    In addition to the above, as of Cumulus v9+, the Cumulus API also contains a new endpoint at /deadLetterArchive/recoverCumulusMessages.

    Sending a POST request to this endpoint will trigger a Cumulus AsyncOperation that will attempt to reprocess (and if successful delete) all Cumulus messages in the dead letter archive, using the same underlying logic as the existing sfEventSqsToDbRecords.

    This endpoint may prove particularly useful when recovering from extended or unexpected database outage, where messages failed to process due to external outage and there is no essential malformation of each Cumulus message.
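    A sketch of triggering the recovery operation, following the same API root and token conventions used by the other API examples in these docs:

      $ curl --request POST https://example.com/deadLetterArchive/recoverCumulusMessages \
      --header 'Authorization: Bearer ReplaceWithTheToken'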

    - + \ No newline at end of file diff --git a/docs/v11.0.0/features/dead_letter_queues/index.html b/docs/v11.0.0/features/dead_letter_queues/index.html index e850b91d122..20cadcf9b28 100644 --- a/docs/v11.0.0/features/dead_letter_queues/index.html +++ b/docs/v11.0.0/features/dead_letter_queues/index.html @@ -5,13 +5,13 @@ Dead Letter Queues | Cumulus Documentation - +
    Version: v11.0.0

    Dead Letter Queues

    startSF SQS queue

    The workflow-trigger for the startSF queue has a Redrive Policy set up that directs any failed attempts to pull from the workflow start queue to an SQS Dead Letter Queue.

    This queue can then be monitored for failures to initiate a workflow. Please note that workflow failures will not show up in this queue, only repeated failure to trigger a workflow.

    Named Lambda Dead Letter Queues

    Cumulus provides configured Dead Letter Queues (DLQ) for non-workflow Lambdas (such as ScheduleSF) to capture Lambda failures for further processing.

    These DLQs are set up with the following configuration:

      receive_wait_time_seconds  = 20
    message_retention_seconds = 1209600
    visibility_timeout_seconds = 60

    Default Lambda Configuration

    The following built-in Cumulus Lambdas are set up with DLQs to allow handling of process failures:

    • dbIndexer (Updates Elasticsearch)
    • JobsLambda (writes logs outputs to Elasticsearch)
    • ScheduleSF (the SF Scheduler Lambda that places messages on the queue that is used to start workflows, see Workflow Triggers)
    • publishReports (Lambda that publishes messages to the SNS topics for execution, granule and PDR reporting)
    • reportGranules, reportExecutions, reportPdrs (Lambdas responsible for updating records based on messages in the queues published by publishReports)

    Troubleshooting/Utilizing messages in a Dead Letter Queue

    Ideally an automated process should be configured to poll the queue and process messages off a dead letter queue.

    To aid in manual troubleshooting, you can utilize the SQS Management console to view messages available in the queues set up for a particular stack. The dead letter queues will have a Message Body containing the Lambda payload, as well as Message Attributes that reference both the error returned and a RequestID which can be cross-referenced to the associated Lambda's CloudWatch logs for more information:

    Screenshot of the AWS SQS console showing how to view SQS message attributes
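    As an alternative to the console, messages can also be inspected from the command line. A sketch using the AWS CLI (the queue URL is hypothetical; received messages are not deleted and will become visible again after the visibility timeout):

      aws sqs receive-message \
      --queue-url https://sqs.us-east-1.amazonaws.com/123456789012/my-stack-ScheduleSFDeadLetterQueue \
      --max-number-of-messages 5 \
      --attribute-names All \
      --message-attribute-names All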

    - + \ No newline at end of file diff --git a/docs/v11.0.0/features/distribution-metrics/index.html b/docs/v11.0.0/features/distribution-metrics/index.html index a40ac4a6c3e..5165eb09053 100644 --- a/docs/v11.0.0/features/distribution-metrics/index.html +++ b/docs/v11.0.0/features/distribution-metrics/index.html @@ -5,13 +5,13 @@ Cumulus Distribution Metrics | Cumulus Documentation - +
    Version: v11.0.0

    Cumulus Distribution Metrics

    It is possible to configure Cumulus and the Cumulus Dashboard to display information about the successes and failures of requests for data. This requires the Cumulus instance to deliver Cloudwatch Logs and S3 Server Access logs to an ELK stack.

    ESDIS Metrics in NGAP

    Work with the ESDIS metrics team to set up permissions and access to forward Cloudwatch Logs to a shared AWS:Logs:Destination, as well as to transfer your S3 Server Access logs to a metrics team bucket.

    The metrics team has taken care of setting up logstash to ingest the files that get delivered to their bucket into their Elasticsearch instance.

    Once Cumulus has been configured to deliver Cloudwatch logs to the ESDIS Metrics team, you can use the Elasticsearch indexes to create the necessary target patterns on the dashboard. These are often <daac>-cloudwatch-cumulus-<env>-* and <daac>-distribution-<env>-*, but they will depend on your specific Elasticsearch setup.

    Cumulus / ESDIS Metrics distribution system

    Architecture diagram showing how logs are replicated from a Cumulus instance to the ESDIS Metrics account and accessed by the Cumulus dashboard

    - + \ No newline at end of file diff --git a/docs/v11.0.0/features/execution_payload_retention/index.html b/docs/v11.0.0/features/execution_payload_retention/index.html index e73db8d1aae..bf244de27c5 100644 --- a/docs/v11.0.0/features/execution_payload_retention/index.html +++ b/docs/v11.0.0/features/execution_payload_retention/index.html @@ -5,13 +5,13 @@ Execution Payload Retention | Cumulus Documentation - +
    Version: v11.0.0

    Execution Payload Retention

    In addition to CloudWatch logs and AWS StepFunction API records, Cumulus automatically stores the initial and 'final' (the last update to the execution record) payload values as part of the Execution record in your RDS database and Elasticsearch.

    This allows access via the API (or optionally direct DB/Elasticsearch querying) for debugging/reporting purposes. The data is stored in the "originalPayload" and "finalPayload" fields.

    Payload record cleanup

    To reduce storage requirements, a CloudWatch rule ({stack-name}-dailyExecutionPayloadCleanupRule) triggering a daily run of the provided cleanExecutions lambda has been added. This lambda will remove all 'completed' and 'non-completed' payload records in the database that are older than the specified configuration.

    Configuration

    The following configuration flags have been made available in the cumulus module. They may be overridden in your deployment's instance of the cumulus module by adding the following configuration options:

    daily_execution_payload_cleanup_schedule_expression (string)

    This configuration option sets the execution times for this Lambda to run, using a Cloudwatch cron expression.

    Default value is "cron(0 4 * * ? *)".

    complete_execution_payload_timeout_disable (bool)

    This configuration option, when set to true, will disable all cleanup of completed execution payloads.

    Default value is false.

    complete_execution_payload_timeout (number)

    This flag defines the cleanup threshold for executions with a 'completed' status in days. Records with updatedAt values older than this with payload information will have that information removed.

    Default value is 10.

    non_complete_execution_payload_timeout_disable (bool)

    This configuration option, when set to true, will disable all cleanup of "non-complete" (any status other than completed) execution payloads.

    Default value is false.

    non_complete_execution_payload_timeout (number)

    This flag defines the cleanup threshold for executions with a status other than 'complete' in days. Records with updateTime values older than this with payload information will have that information removed.

    Default value is 30 days.

    • complete_execution_payload_disable/non_complete_execution_payload_disable

    These flags (true/false) determine if the cleanup script's logic for 'complete' and 'non-complete' executions will run. Default value is false for both.

    - + \ No newline at end of file diff --git a/docs/v11.0.0/features/logging-esdis-metrics/index.html b/docs/v11.0.0/features/logging-esdis-metrics/index.html index a01cbfa1d10..aaaec5c4ce1 100644 --- a/docs/v11.0.0/features/logging-esdis-metrics/index.html +++ b/docs/v11.0.0/features/logging-esdis-metrics/index.html @@ -5,13 +5,13 @@ Writing logs for ESDIS Metrics | Cumulus Documentation - +
    Version: v11.0.0

    Writing logs for ESDIS Metrics

    Note: This feature is only available for Cumulus deployments in NGAP environments.

    Prerequisite: You must configure your Cumulus deployment to deliver your logs to the correct shared logs destination for ESDIS metrics.

    Log messages delivered to the ESDIS metrics logs destination conforming to an expected format will be automatically ingested and parsed to enable helpful searching/filtering of your logs via the ESDIS metrics Kibana dashboard.

    Expected log format

    The ESDIS metrics pipeline expects a log message to be a JSON string representation of an object (dict in Python or map in Java). An example log message might look like:

    {
    "level": "info",
    "executions": "arn:aws:states:us-east-1:000000000000:execution:MySfn:abcd1234",
    "granules": "[\"granule-1\",\"granule-2\"]",
    "message": "hello world",
    "sender": "greetingFunction",
    "stackName": "myCumulus",
    "timestamp": "2018-10-19T19:12:47.501Z"
    }

    A log message can contain the following properties:

    • executions: The AWS Step Function execution name in which this task is executing, if any
    • granules: A JSON string of the array of granule IDs being processed by this code, if any
    • level: A string identifier for the type of message being logged. Possible values:
      • debug
      • error
      • fatal
      • info
      • warn
      • trace
    • message: String containing your actual log message
    • parentArn: The parent AWS Step Function execution ARN that triggered the current execution, if any
    • sender: The name of the resource generating the log message (e.g. a library name, a Lambda function name, an ECS activity name)
    • stackName: The unique prefix for your Cumulus deployment
    • timestamp: An ISO-8601 formatted timestamp
    • version: The version of the resource generating the log message, if any

    None of these properties are explicitly required for ESDIS metrics to parse your log correctly. However, a log without a message has no informational content. And having level, sender, and timestamp properties is very useful for filtering your logs. Including a stackName in your logs is helpful as it allows you to distinguish between logs generated by different deployments.

    Using Cumulus Message Adapter libraries

    If you are writing a custom task that is integrated with the Cumulus Message Adapter, then some of the language-specific client libraries can be used to write logs compatible with ESDIS metrics.

    The usage of each library differs slightly, but in general a logger is initialized with a Cumulus workflow message to determine the contextual information for the task (e.g. granules, executions). Then, after the logger is initialized, writing logs only requires specifying a message, but the logged output will include the contextual information as well.

    Writing logs using custom code

    Any code that produces logs matching the expected log format can be processed by ESDIS metrics.
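    For example, any process that prints one JSON object per line can produce a conforming log message. A minimal shell sketch (all field values are placeholders):

      echo '{"level":"info","message":"hello world","sender":"myScript","stackName":"myCumulus","timestamp":"2018-10-19T19:12:47.501Z"}'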

    Node.js

    Cumulus core provides a @cumulus/logger library that writes logs in the expected format for ESDIS metrics.

    - + \ No newline at end of file diff --git a/docs/v11.0.0/features/replay-archived-sqs-messages/index.html b/docs/v11.0.0/features/replay-archived-sqs-messages/index.html index d7986f77922..611d31cadad 100644 --- a/docs/v11.0.0/features/replay-archived-sqs-messages/index.html +++ b/docs/v11.0.0/features/replay-archived-sqs-messages/index.html @@ -5,14 +5,14 @@ How to replay SQS messages archived in S3 | Cumulus Documentation - +
    Version: v11.0.0

    How to replay SQS messages archived in S3

    Context

    Cumulus archives all incoming SQS messages to S3 and removes messages once they have been processed. Unprocessed messages are archived at the path: ${stackName}/archived-incoming-messages/${queueName}/${messageId}

    Replay SQS messages endpoint

    The Cumulus API has added a new endpoint, /replays/sqs. This endpoint will allow you to start a replay operation to requeue all archived SQS messages by queueName and returns an AsyncOperationId for operation status tracking.

    Start replaying archived SQS messages

    In order to start a replay, you must perform a POST request to the replays/sqs endpoint.

    The required and optional fields that should be part of the body of this request are documented below.

    • queueName (string): Any valid SQS queue name (not ARN)
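    A sketch of such a request (the API root, token, and queue name are hypothetical):

      $ curl --request POST https://example.com/replays/sqs \
      --header 'Authorization: Bearer ReplaceWithTheToken' \
      --header 'Content-Type: application/json' \
      --data '{ "queueName": "my-stack-startSF" }'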

    Status tracking

    A successful response from the /replays/sqs endpoint will contain an asyncOperationId field. Use this ID with the /asyncOperations endpoint to track the status.
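    For example (the operation ID shown is a placeholder standing in for the asyncOperationId returned above):

      $ curl --request GET https://example.com/asyncOperations/0eb8e809-8790-5409-1239-bcd9e8d28b8e \
      --header 'Authorization: Bearer ReplaceWithTheToken'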

    - + \ No newline at end of file diff --git a/docs/v11.0.0/features/replay-kinesis-messages/index.html b/docs/v11.0.0/features/replay-kinesis-messages/index.html index 1f5dcb3322f..dfb65aa755d 100644 --- a/docs/v11.0.0/features/replay-kinesis-messages/index.html +++ b/docs/v11.0.0/features/replay-kinesis-messages/index.html @@ -5,7 +5,7 @@ How to replay Kinesis messages after an outage | Cumulus Documentation - + @@ -13,7 +13,7 @@
    Version: v11.0.0

    How to replay Kinesis messages after an outage

    After a period of outage, it may be necessary for a Cumulus operator to reprocess or 'replay' messages that arrived on an AWS Kinesis Data Stream but did not trigger an ingest. This document serves as an outline on how to start a replay operation, and how to perform status tracking. Cumulus supports replay of all Kinesis messages on a stream (subject to the normal RetentionPeriod constraints), or all messages within a given time slice delimited by start and end timestamps.

    As Kinesis has no comparable field to e.g. the SQS ReceiveCount on its records, Cumulus cannot tell which messages within a given time slice have never been processed, and cannot guarantee only missed messages will be processed. Users will have to rely on duplicate handling or some other method of identifying messages that should not be processed within the time slice.

    NOTE: This operation flow effectively changes only the trigger mechanism for Kinesis ingest notifications. The existence of valid Kinesis-type rules and all other normal requirements for the triggering of ingest via Kinesis still apply.

    Replays endpoint

    Cumulus has added a new endpoint to its API, /replays. This endpoint will allow you to start replay operations and returns an AsyncOperationId for operation status tracking.

    Start a replay

    In order to start a replay, you must perform a POST request to the replays endpoint.

    The required and optional fields that should be part of the body of this request are documented below.

    NOTE: As the endTimestamp relies on a comparison with the Kinesis server-side ApproximateArrivalTimestamp, and given that there is no documented level of accuracy for the approximation, it is recommended that the endTimestamp include some amount of buffer to allow for slight discrepancies. If tolerable, the same is recommended for the startTimestamp although it is used differently and less vulnerable to discrepancies since a server-side arrival timestamp should never be earlier than the client-side request timestamp.

    • type (string, required): Currently only accepts kinesis.
    • kinesisStream (string, required for type kinesis): Any valid kinesis stream name (not ARN)
    • kinesisStreamCreationTimestamp (optional): Any input valid for a JS Date constructor. For reasons to use this field, see AWS documentation on StreamCreationTimestamp.
    • endTimestamp (optional): Any input valid for a JS Date constructor. Messages newer than this timestamp will be skipped.
    • startTimestamp (optional): Any input valid for a JS Date constructor. Messages will be fetched from the Kinesis stream starting at this timestamp. Ignored if it is further in the past than the stream's retention period.
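    A sketch of a replay request over a bounded time slice (stream name and timestamps are hypothetical, with the timestamps padded as recommended above):

      $ curl --request POST https://example.com/replays \
      --header 'Authorization: Bearer ReplaceWithTheToken' \
      --header 'Content-Type: application/json' \
      --data '{
        "type": "kinesis",
        "kinesisStream": "my-cnm-notification-stream",
        "startTimestamp": "2018-10-18T23:00:00.000Z",
        "endTimestamp": "2018-10-20T01:00:00.000Z"
      }'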

    Status tracking

    A successful response from the /replays endpoint will contain an asyncOperationId field. Use this ID with the /asyncOperations endpoint to track the status.

    - + \ No newline at end of file diff --git a/docs/v11.0.0/features/reports/index.html b/docs/v11.0.0/features/reports/index.html index b620f10f67b..a9635768530 100644 --- a/docs/v11.0.0/features/reports/index.html +++ b/docs/v11.0.0/features/reports/index.html @@ -5,7 +5,7 @@ Reconciliation Reports | Cumulus Documentation - + @@ -19,7 +19,7 @@ report generation. The data buckets will include any buckets in your Cumulus buckets configuration that have type public, protected or private.
    - + \ No newline at end of file diff --git a/docs/v11.0.0/getting-started/index.html b/docs/v11.0.0/getting-started/index.html index bf772064337..8bd26f763e4 100644 --- a/docs/v11.0.0/getting-started/index.html +++ b/docs/v11.0.0/getting-started/index.html @@ -5,13 +5,13 @@ Getting Started | Cumulus Documentation - +
    Version: v11.0.0

    Getting Started

    Overview | Quick Tutorials | Helpful Tips

    Overview

    This serves as a guide for new Cumulus users to deploy and learn how to use Cumulus. Here you will learn what you need in order to complete any prerequisites, what Cumulus is and how it works, and how to successfully navigate and deploy a Cumulus environment.

    What is Cumulus

    Cumulus is an open source set of components for creating cloud-based data ingest, archive, distribution and management designed for NASA's future Earth Science data streams.

    Who uses Cumulus

    Data integrators/developers and operators across projects not limited to NASA use Cumulus for their daily work functions.

    Cumulus Roles

    Integrator/Developer

    Cumulus integrators/developers are those who work within Cumulus and AWS for deployments and to manage workflows.

    Operator

    Cumulus operators are those who work within Cumulus to ingest/archive data and manage collections.

    Role Guides

    As a developer, integrator, or operator, you will need to set up your environments to work in Cumulus. The following docs can get you started in your role specific activities.

    What is a Cumulus Data Type

    In Cumulus, we have the following types of data that you can create and manage:

    • Collections
    • Granules
    • Providers
    • Rules
    • Workflows
    • Executions
    • Reports

    For details on how to create or manage data types go to Data Management Types.


    Quick Tutorials

    Deployment & Configuration

    Cumulus is deployed to an AWS account, so you must have access to deploy resources to an AWS account to get started.

    1. Deploy Cumulus and Cumulus Dashboard to AWS

    Follow the deployment instructions to deploy Cumulus to your AWS account.

    2. Configure and Run the HelloWorld Workflow

    If you have deployed using the cumulus-template-deploy repository, you have a HelloWorld workflow deployed to your Cumulus backend.

    You can see your deployed workflows on the Workflows page of your Cumulus dashboard.

    Configure a collection and provider using the setup guidance on the Cumulus dashboard.

    Then create a rule to trigger your HelloWorld workflow. You can select a rule type of one time.

    Navigate to the Executions page of the dashboard to check the status of your workflow execution.

    3. Configure a Custom Workflow

    See Developing a custom workflow documentation for adding a new workflow to your deployment.

    There are plenty of workflow examples using Cumulus tasks here. The Data Cookbooks provide a more in-depth look at some of these more advanced workflows and their configurations.

    There is a list of Cumulus tasks already included in your deployment here.

    After configuring your workflow and redeploying, you can configure and run your workflow using the same steps as in step 2.


    Helpful Tips

    Here are some useful tips to keep in mind when deploying or working in Cumulus.

    Integrator/Developer

    • Versioning and Releases: This documentation gives information on our global versioning approach. We suggest upgrading to the supported version for Cumulus, Cumulus dashboard, and Thin Egress App (TEA).
    • Cumulus Developer Documentation: We suggest that you read through and reference this resource for development best practices in Cumulus.
    • Cumulus Deployment: We will guide you on how to manually deploy a new instance of Cumulus. In this reference, you will learn how to install Terraform, create an AWS S3 bucket, configure a compatible database, and create a Lambda layer.
    • Terraform Best Practices: This will help guide you through your Terraform configuration and Cumulus deployment. For an introduction about Terraform go here.
    • Integrator Common Use Cases: Scenarios to help integrators along in the Cumulus environment.

    Operator

    Troubleshooting

    Troubleshooting: Some suggestions to help you troubleshoot and solve issues you may encounter.

    Resources

    - + \ No newline at end of file diff --git a/docs/v11.0.0/glossary/index.html b/docs/v11.0.0/glossary/index.html index 52552613adb..ffac86d4b17 100644 --- a/docs/v11.0.0/glossary/index.html +++ b/docs/v11.0.0/glossary/index.html @@ -5,13 +5,13 @@ Glossary | Cumulus Documentation - +
    Version: v11.0.0

    Glossary

    AWS Glossary

    For terms/items from Amazon/AWS not mentioned in this glossary, please refer to the AWS Glossary.

    Cumulus Glossary of Terms

    API Gateway

    Refers to AWS's API Gateway. Used by the Cumulus API.

    ARN

    Refers to an AWS "Amazon Resource Name".

    For more info, see the AWS documentation.

    AWS

    See: aws.amazon.com

    AWS Lambda/Lambda Function

    AWS's 'serverless' option. Allows the running of code without provisioning a service or managing server/ECS instances/etc.

    For more information, see the AWS Lambda documentation.

    AWS Access Keys

    Access credentials that give you access to AWS to act as an IAM user programmatically or from the command line.

    For more information, see the AWS IAM Documentation.

    Bucket

    An Amazon S3 cloud storage resource.

    For more information, see the AWS Bucket Documentation.

    CloudFormation

    An AWS service that allows you to define and manage cloud resources as a preconfigured block.

    For more information, see the AWS CloudFormation User Guide.

    Cloudformation Template

    A template that defines an AWS CloudFormation stack.

    For more information, see the AWS intro page.

    Cloudwatch

    AWS service that allows logging and metrics collections on various cloud resources you have in AWS.

    For more information, see the AWS User Guide.

    Cloud Notification Mechanism (CNM)

    An interface mechanism to support cloud-based ingest messaging. For more information, see PO.DAAC's CNM Schema.

    Common Metadata Repository (CMR)

    "A high-performance, high-quality, continuously evolving metadata system that catalogs Earth Science data and associated service metadata records". For more information, see NASA's CMR page.

    Collection (Cumulus)

    Cumulus Collections are logical sets of data objects of the same data type and version.

    For more information, see cookbook reference page.

    Cumulus Message Adapter (CMA)

    A library designed to help task developers integrate step function tasks into a Cumulus workflow by adapting task input/output into the Cumulus Message format.

    For more information, see CMA workflow reference page.

    Distributed Active Archive Center (DAAC)

    Refers to a specific organization that's part of NASA's distributed system of archive centers. For more information see EOSDIS's DAAC page

    Dead Letter Queue (DLQ)

    This refers to Amazon SQS Dead-Letter Queues - these SQS queues are specifically configured to capture failed messages from other services/SQS queues/etc to allow for processing of failed messages.

    For more on DLQs, see the Amazon Documentation and the Cumulus DLQ feature page.

    Developer

    Those who set up deployment and workflow management for Cumulus. Sometimes referred to as an integrator. See integrator.

    ECS

    Amazon's Elastic Container Service. Used in Cumulus by workflow steps that require more flexibility than Lambda can provide.

    For more information, see AWS's developer guide.

    ECS Activity

    An ECS instance run via a Step Function.

    Execution (Cumulus)

    A Cumulus execution refers to a single execution of a (Cumulus) Workflow.

    GIBS

    Global Imagery Browse Services

    Granule

    A granule is the smallest aggregation of data that can be independently managed (described, inventoried, and retrieved). Granules are always associated with a collection, which is a grouping of granules. A granule is a grouping of data files.

    IAM

    AWS Identity and Access Management.

    For more information, see AWS IAMs.

    Integrator/Developer

    Those who work within Cumulus and AWS for deployments and to manage workflows.

    Kinesis

    Amazon's platform for streaming data on AWS.

    See AWS Kinesis for more information.

    Lambda

    AWS's cloud service that lets you run code without provisioning or managing servers.

    For more information, see AWS's lambda page.

    Module (Terraform)

    Refers to a terraform module.

    Node

    See node.js.

    Npm

    Node package manager.

    For more information, see npmjs.com.

    Operator

    Those who work within Cumulus to ingest/archive data and manage collections.

    PDR

    "Polling Delivery Mechanism" used in "DAAC Ingest" workflows.

    For more information, see nasa.gov.

    Packages (NPM)

    NPM hosted node.js packages. Cumulus packages can be found on NPM's site here

    Provider

    Data source that generates and/or distributes data for Cumulus workflows to act upon.

    For more information, see the Cumulus documentation.

    Rule

    Rules are configurable scheduled events that trigger workflows based on various criteria.

    For more information, see the Cumulus Rules documentation.

    S3

    Amazon's Simple Storage Service provides data object storage in the cloud. Used in Cumulus to store configuration, data and more.

    For more information, see AWS's s3 page.

    SIPS

    Science Investigator-led Processing Systems. In the context of DAAC ingest, this refers to data producers/providers.

    For more information, see nasa.gov.

    SNS

    Amazon's Simple Notification Service provides a messaging service that allows publication of and subscription to events. Used in Cumulus to trigger workflow events, track event failures, and others.

    For more information, see AWS's SNS page.

    SQS

    Amazon's Simple Queue Service.

    For more information, see AWS's SQS page.

    Stack

    A collection of AWS resources you can manage as a single unit.

    In the context of Cumulus, this refers to a deployment of the cumulus and data-persistence modules that is managed by Terraform

    Step Function

    AWS's web service that allows you to compose complex workflows as a state machine comprised of tasks (Lambdas, activities hosted on EC2/ECS, some AWS service APIs, etc). See AWS's Step Function Documentation for more information. In the context of Cumulus these are the underlying AWS service used to create Workflows.

    Terraform

    Terraform is the tool that you will use for deployment and configuration of your Cumulus environment.

    Workflows

    Workflows are composed of one or more AWS Lambda Functions and ECS Activities to discover, ingest, process, manage and archive data.

    - + \ No newline at end of file diff --git a/docs/v11.0.0/index.html b/docs/v11.0.0/index.html index 7a8ad491ed4..89f47eb088a 100644 --- a/docs/v11.0.0/index.html +++ b/docs/v11.0.0/index.html @@ -5,13 +5,13 @@ Introduction | Cumulus Documentation - +
    Version: v11.0.0

    Introduction

    This Cumulus project seeks to address the existing need for a “native” cloud-based data ingest, archive, distribution, and management system that can be used for all future Earth Observing System Data and Information System (EOSDIS) data streams via the development and implementation of Cumulus. The term “native” implies that the system will leverage all components of a cloud infrastructure provided by the vendor for efficiency (in terms of both processing time and cost). Additionally, Cumulus will operate on future data streams involving satellite missions, aircraft missions, and field campaigns.

    This documentation includes guidelines, examples, and source code docs. It is accessible at https://nasa.github.io/cumulus.


    Get To Know Cumulus

    • Getting Started - here - If you are new to Cumulus we suggest that you begin with this section to help you understand and work in the environment.
    • General Cumulus Documentation - here <- you're here

    Cumulus Reference Docs

    • Cumulus API Documentation - here
    • Cumulus Developer Documentation - here - READMEs throughout the main repository.
    • Data Cookbooks - here

    Auxiliary Guides

    • Integrator Guide - here
    • Operator Docs - here

    Contributing

    Please refer to: https://github.com/nasa/cumulus/blob/master/CONTRIBUTING.md for information. We thank you in advance.

    - + \ No newline at end of file diff --git a/docs/v11.0.0/integrator-guide/about-int-guide/index.html b/docs/v11.0.0/integrator-guide/about-int-guide/index.html index e759464559b..58243631691 100644 --- a/docs/v11.0.0/integrator-guide/about-int-guide/index.html +++ b/docs/v11.0.0/integrator-guide/about-int-guide/index.html @@ -5,13 +5,13 @@ About Integrator Guide | Cumulus Documentation - +
    Version: v11.0.0

    About Integrator Guide

    Purpose

    The Integrator Guide supplements the Cumulus documentation and Data Cookbooks. This content is for Cumulus integrators who are either new to the project or need a step-by-step resource to help them along.

    What Is A Cumulus Integrator

    Cumulus integrators are those who work within Cumulus and AWS for deployments and to manage workflows. They may perform the following functions:

    • Configure and deploy Cumulus to the AWS environment
    • Configure Cumulus workflows
    • Write custom workflow tasks
    - + \ No newline at end of file diff --git a/docs/v11.0.0/integrator-guide/int-common-use-cases/index.html b/docs/v11.0.0/integrator-guide/int-common-use-cases/index.html index f8110f36f17..0e887954148 100644 --- a/docs/v11.0.0/integrator-guide/int-common-use-cases/index.html +++ b/docs/v11.0.0/integrator-guide/int-common-use-cases/index.html @@ -5,13 +5,13 @@ Integrator Common Use Cases | Cumulus Documentation - + - + \ No newline at end of file diff --git a/docs/v11.0.0/integrator-guide/workflow-add-new-lambda/index.html b/docs/v11.0.0/integrator-guide/workflow-add-new-lambda/index.html index cbc647534e1..027aa30a0b5 100644 --- a/docs/v11.0.0/integrator-guide/workflow-add-new-lambda/index.html +++ b/docs/v11.0.0/integrator-guide/workflow-add-new-lambda/index.html @@ -5,13 +5,13 @@ Workflow - Add New Lambda | Cumulus Documentation - +
    Version: v11.0.0

    Workflow - Add New Lambda

    You can develop a workflow task in AWS Lambda or Elastic Container Service (ECS). AWS ECS requires Docker. For a list of tasks to use go to our Cumulus Tasks page.

    The following steps are to help you along as you write a new Lambda that integrates with a Cumulus workflow. This will aid your understanding of the Cumulus Message Adapter (CMA) process.

    Steps

    1. Define New Lambda in Terraform

    2. Add Task in JSON Object

      For details on how to set up a workflow via CMA go to the CMA Tasks: Message Flow.

      You will need to assign input and output for the new task and follow the CMA contract here. This contract defines how libraries should call the cumulus-message-adapter to integrate a task into an existing Cumulus Workflow.

    3. Verify New Task

      Check the updated workflow in AWS and in Cumulus.

    - + \ No newline at end of file diff --git a/docs/v11.0.0/integrator-guide/workflow-ts-failed-step/index.html b/docs/v11.0.0/integrator-guide/workflow-ts-failed-step/index.html index 746fb3e772a..1b6cee1e473 100644 --- a/docs/v11.0.0/integrator-guide/workflow-ts-failed-step/index.html +++ b/docs/v11.0.0/integrator-guide/workflow-ts-failed-step/index.html @@ -5,13 +5,13 @@ Workflow - Troubleshoot Failed Step(s) | Cumulus Documentation - +
    Version: v11.0.0

    Workflow - Troubleshoot Failed Step(s)

    Steps

    1. Locate Step
    • Go to Cumulus dashboard
    • Find the granule
    • Go to Executions to determine the failed step
    2. Investigate in Cloudwatch
    • Go to Cloudwatch
    • Locate lambda
    • Search Cloudwatch logs
    3. Recreate Error

      In your sandbox environment, try to recreate the error.

    4. Resolution

    - + \ No newline at end of file diff --git a/docs/v11.0.0/interfaces/index.html b/docs/v11.0.0/interfaces/index.html index cc79b060af4..abd7e7b1c06 100644 --- a/docs/v11.0.0/interfaces/index.html +++ b/docs/v11.0.0/interfaces/index.html @@ -5,13 +5,13 @@ Interfaces | Cumulus Documentation - +
    Version: v11.0.0

    Interfaces

    Cumulus has multiple interfaces that allow interaction with discrete components of the system, such as starting workflows via SNS/Kinesis/SQS, manually queueing workflow start messages, submitting SNS notifications for completed workflows, and the many operations allowed by the Cumulus API.

    The diagram below illustrates the workflow process in detail and the various interfaces that allow starting of workflows, reporting of workflow information, and database create operations that occur when a workflow reporting message is processed. For interfaces with expected input or output schemas, details are provided below.

    Architecture diagram showing the interfaces for triggering and reporting of Cumulus workflow executions

    Workflow triggers and queuing

    Kinesis stream

    As a Kinesis stream is consumed by the messageConsumer Lambda to queue workflow executions, the incoming event is validated against this consumer schema by the ajv package.

    SQS queue for executions

    The messages put into the SQS queue for executions should conform to the Cumulus message format.

    Workflow executions

    See the documentation on Cumulus workflows.

    Workflow reporting

    SNS reporting topics

    For granule and PDR reporting, the topics will only receive data if the Cumulus workflow execution message meets the following criteria:

    • Granules - workflow message contains granule data in payload.granules
    • PDRs - workflow message contains PDR data in payload.pdr

    The messages published to the SNS reporting topics for executions and PDRs and the record property in the messages published to the granules SNS topic should conform to the model schema for each data type.

    Further detail on workflow reporting and how to interact with these interfaces can be found in the workflow notifications data cookbook.

    Cumulus API

    See the Cumulus API documentation.

    - + \ No newline at end of file diff --git a/docs/v11.0.0/operator-docs/about-operator-docs/index.html b/docs/v11.0.0/operator-docs/about-operator-docs/index.html index bd95561754f..9ede4d7c536 100644 --- a/docs/v11.0.0/operator-docs/about-operator-docs/index.html +++ b/docs/v11.0.0/operator-docs/about-operator-docs/index.html @@ -5,13 +5,13 @@ About Operator Docs | Cumulus Documentation - +
    Version: v11.0.0

    About Operator Docs

    Purpose

    Operator Docs are an augmentation to Cumulus documentation and Data Cookbooks. These documents will walk step-by-step through common Cumulus activities (that aren't necessarily as use-case directed as what you'd see in Data Cookbooks).

    What Is A Cumulus Operator

    Cumulus operators are those who work within Cumulus to ingest/archive data and manage collections. They may perform the following functions via the operator dashboard or API:

    • Configure providers and collections
    • Configure rules and monitor workflow executions
    • Monitor granule ingestion
    • Monitor system metrics
    - + \ No newline at end of file diff --git a/docs/v11.0.0/operator-docs/bulk-operations/index.html b/docs/v11.0.0/operator-docs/bulk-operations/index.html index 3d23dde5b34..e0a47e630fe 100644 --- a/docs/v11.0.0/operator-docs/bulk-operations/index.html +++ b/docs/v11.0.0/operator-docs/bulk-operations/index.html @@ -5,14 +5,14 @@ Bulk Operations | Cumulus Documentation - +
    Version: v11.0.0

    Bulk Operations

    Cumulus implements bulk operations through the use of AsyncOperations, which are long-running processes executed on an AWS ECS cluster.

    Submitting a bulk API request

    Bulk operations are generally submitted via the endpoint for the relevant data type, e.g. granules. For a list of supported API requests, refer to the Cumulus API documentation. Bulk operations are denoted with the keyword 'bulk'.

    Starting bulk operations from the Cumulus dashboard

    Using a Kibana query

    Note: You must have configured your dashboard build with a KIBANAROOT environment variable in order for the Kibana link to render in the bulk granules modal

    1. From the Granules dashboard page, click on the "Run Bulk Granules" button, then select what type of action you would like to perform

      • Note: the rest of the process is the same regardless of what type of bulk action you perform
    2. From the bulk granules modal, click the "Open Kibana" link:

      Screenshot of Cumulus dashboard showing modal window for triggering bulk granule operations

    3. Once you have accessed Kibana, navigate to the "Discover" page. If this is your first time using Kibana, you may see a message like this at the top of the page:

      In order to visualize and explore data in Kibana, you'll need to create an index pattern to retrieve data from Elasticsearch.

      In that case, see the docs for creating an index pattern for Kibana

      Screenshot of Kibana user interface showing the &quot;Discover&quot; page for running queries

    4. Enter a query that returns the granule records that you want to use for bulk operations:

      Screenshot of Kibana user interface showing an example Kibana query and results

    5. Once the Kibana query is returning the results you want, click the "Inspect" link near the top of the page. A slide out tab with request details will appear on the right side of the page:

      Screenshot of Kibana user interface showing details of an example request

    6. In the slide out tab that appears on the right side of the page, click the "Request" link near the top and scroll down until you see the query property:

      Screenshot of Kibana user interface showing the Elasticsearch data request made for a given Kibana query

    7. Highlight and copy the query contents from Kibana. Go back to the Cumulus dashboard and paste the query contents from Kibana inside of the query property in the bulk granules request payload. It is expected that you should have a property of query nested inside of the existing query property:

      Screenshot of Cumulus dashboard showing modal window for triggering bulk granule operations with query information populated

    8. Add values for the index and workflowName to the bulk granules request payload. The value for index will vary based on your Elasticsearch setup, but it is good to target an index specifically for granule data if possible:

      Screenshot of Cumulus dashboard showing modal window for triggering bulk granule operations with query, index, and workflow information populated

    9. Click the "Run Bulk Operations" button. You should see a confirmation message, including an ID for the async operation that was started to handle your bulk action. You can track the status of this async operation on the Operations dashboard page, which can be visited by clicking the "Go To Operations" button:

      Screenshot of Cumulus dashboard showing confirmation message with async operation ID for bulk granules request

    Creating an index pattern for Kibana

    1. Define the index pattern for the indices that your Kibana queries should use. A wildcard character, *, will match across multiple indices. Once you are satisfied with your index pattern, click the "Next step" button:

      Screenshot of Kibana user interface for defining an index pattern

    2. Choose whether to use a Time Filter for your data, which is not required. Then click the "Create index pattern" button:

      Screenshot of Kibana user interface for configuring the settings of an index pattern

    Status Tracking

    All bulk operations return an AsyncOperationId which can be submitted to the /asyncOperations endpoint.

    The /asyncOperations endpoint allows listing of AsyncOperation records as well as record retrieval for individual records, which will contain the status. The Cumulus API documentation shows sample requests for these actions.

    The Cumulus Dashboard also includes an Operations monitoring page, where operations and their status are visible:

    Screenshot of Cumulus Dashboard Operations Page showing 5 operations and their status, ID, description, type and creation timestamp

    - + \ No newline at end of file diff --git a/docs/v11.0.0/operator-docs/cmr-operations/index.html b/docs/v11.0.0/operator-docs/cmr-operations/index.html index 70778215795..4e377e8a5ec 100644 --- a/docs/v11.0.0/operator-docs/cmr-operations/index.html +++ b/docs/v11.0.0/operator-docs/cmr-operations/index.html @@ -5,7 +5,7 @@ CMR Operations | Cumulus Documentation - + @@ -16,7 +16,7 @@ UpdateCmrAccessConstraints will update CMR metadata file contents on S3, and PostToCmr will push the updates to CMR. The rest of this section will assume you have created this workflow under the name UpdateCmrAccessConstraints.

    Once created and deployed, the workflow is available in the Cumulus dashboard's Execute workflow selector. However, note that additional configuration is required for this request, to supply an access constraint integer value and optional description to the UpdateCmrAccessConstraints workflow, by clicking the Add Custom Workflow Meta option in the Execute popup, as shown below:

    Screenshot showing granule execute popup with &#39;updateCmrAccessConstraints&#39; selected and configuration values shown in a collapsible JSON field

    An example invocation of the API to perform this action is:

    $ curl --request PUT https://example.com/granules/MOD11A1.A2017137.h19v16.006.2017138085750 \
    --header 'Authorization: Bearer ReplaceWithTheToken' \
    --header 'Content-Type: application/json' \
    --data '{
      "action": "applyWorkflow",
      "workflow": "updateCmrAccessConstraints",
      "meta": {
        "accessConstraints": {
          "value": 5,
          "description": "sample access constraint"
        }
      }
    }'

    Supported CMR metadata formats for the above operation are Echo10XML and UMMG-JSON, which will populate the RestrictionFlag and RestrictionComment fields in Echo10XML, or the AccessConstraints values in UMMG-JSON.

    Additional Operations

    At this time Cumulus does not, out of the box, support additional operations on CMR metadata. However, given the examples shown above, we recommend working with your integrators to develop additional workflows that perform any required operations.

    Bulk CMR operations

    In order to perform the above operations in bulk, Cumulus supports the use of ApplyWorkflow in an AsyncOperation. These are accessed via the Bulk Operation button on the dashboard, or the /granules/bulk endpoint on the Cumulus API.

    More information on bulk operations is in the bulk operations operator doc.
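    A sketch of such a bulk request, following the single-granule example above (the granule ID, workflow name, and meta passthrough are assumptions for illustration; confirm the exact payload fields against the Cumulus API documentation for your version):

      $ curl --request POST https://example.com/granules/bulk \
      --header 'Authorization: Bearer ReplaceWithTheToken' \
      --header 'Content-Type: application/json' \
      --data '{
        "workflowName": "UpdateCmrAccessConstraints",
        "ids": ["MOD11A1.A2017137.h19v16.006.2017138085750"],
        "meta": {
          "accessConstraints": { "value": 5, "description": "sample access constraint" }
        }
      }'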

    - + \ No newline at end of file diff --git a/docs/v11.0.0/operator-docs/create-rule-in-cumulus/index.html b/docs/v11.0.0/operator-docs/create-rule-in-cumulus/index.html index b25bee4f9c9..a9f0ec568fd 100644 --- a/docs/v11.0.0/operator-docs/create-rule-in-cumulus/index.html +++ b/docs/v11.0.0/operator-docs/create-rule-in-cumulus/index.html @@ -5,13 +5,13 @@ Create Rule In Cumulus | Cumulus Documentation - +
    Version: v11.0.0

    Create Rule In Cumulus

    Once the above files are in place and the entries created in CMR and Cumulus, we are ready to begin ingesting data. Depending on the type of ingestion (FTP/Kinesis, etc) the values below will change, but for the most part they are all similar. Rules tell Cumulus how to associate providers and collections, and when/how to start processing a workflow.

    Steps

    1. Go To Rules Page
    • Go to the Cumulus dashboard, click on Rules in the navigation.
    • Click Add Rule.

    Screenshot of Rules page

2. Complete Form
    • Fill out the template form.

    Screenshot of a Rules template for adding a new rule

For more details regarding the field definitions and required information, go to Data Cookbooks.

    Note: If the state field is left blank, it defaults to false.

    Examples

    • A rule form with completed required fields:

    Screenshot of a completed rule form

    • A successfully added Rule:

    Screenshot of created rule

Discovery Filtering

…directly list the provider_path. If the path contains regular expression components, this may fail.

    It is recommended that operators diagnose any failures by checking error logs and ensuring that permissions on the remote file system allow reading of the default directory and any subdirectories that match the filter.

    Supported protocols

    Currently support for this feature is limited to the following protocols:

    • ftp
    • sftp
    Version: v11.0.0

    Granule Workflows

    Failed Granule

    Delete and Ingest

    1. Delete Granule

    Note: Granules published to CMR will need to be removed from CMR via the dashboard prior to deletion

2. Ingest Granule via Ingest Rule
• Re-triggering a one-time, Kinesis, SQS, or SNS rule or a scheduled rule will re-discover and reingest the deleted granule.

    Reingest

    1. Select Failed Granule
    • In the Cumulus dashboard, go to the Collections page.
• Use the search field to find the granule.
2. Re-ingest Granule
    • Go to the Collections page.
    • Click on Reingest and a modal will pop up for your confirmation.

    Screenshot of the Reingest modal workflow

    Delete and Ingest

    1. Bulk Delete Granules
    • Go to the Granules page.
• Use the Bulk Delete button to bulk delete selected granules, or select granules via a Kibana query

    Note: You can optionally force deletion from CMR

2. Ingest Granules via Ingest Rule
• Re-triggering one-time, Kinesis, SQS, or SNS rules or scheduled rules will re-discover and reingest the deleted granules.

    Multiple Failed Granules

    1. Select Failed Granules
    • In the Cumulus dashboard, go to the Collections page.
    • Click on Failed Granules.
    • Select multiple granules.

    Screenshot of selected multiple granules

2. Bulk Re-ingest Granules
    • Click on Reingest and a modal will pop up for your confirmation.

    Screenshot of Bulk Reingest modal workflow

    Version: v11.0.0

    Setup Kinesis Stream & CNM Message

Note: Keep in mind that you should only have to set this up once per ingest stream. Kinesis pricing is based on the shard value and not on the amount of Kinesis usage.

    1. Create a Kinesis Stream

      • In your AWS console, go to the Kinesis service and click Create Data Stream.
      • Assign a name to the stream.
      • Apply a shard value of 1.
      • Click on Create Kinesis Stream.
• A status page with stream details will display. Once the status is active, the stream is ready to use. Be sure to record the streamName and StreamARN for later use (a CLI sketch for retrieving these is shown below the screenshot).

      Screenshot of AWS console page for creating a Kinesis stream
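As an alternative to the console, a rough CLI sketch for creating the stream and recording its name and ARN (the stream name is a placeholder):

# Create a stream with a single shard (stream name is a placeholder)
aws kinesis create-stream --stream-name my-cnm-ingest-stream --shard-count 1

# Once StreamStatus is ACTIVE, note the StreamName and StreamARN from this output
aws kinesis describe-stream-summary --stream-name my-cnm-ingest-stream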

    2. Create a Rule

    3. Send a message

• Send a message that matches your schema, using Python or the command line (see the sketch below).
      • The streamName and Collection must match the kinesisArn+collection defined in the rule that you have created in Step 2.
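For illustration only, the following sketch publishes a minimal CNM-style notification with the AWS CLI. The message fields are assumptions meant to resemble the CNM schema; replace the stream name, collection, and file details with values that match your actual schema and the rule from Step 2.

# --cli-binary-format is needed for AWS CLI v2 so --data is treated as raw JSON
aws kinesis put-record \
  --cli-binary-format raw-in-base64-out \
  --stream-name my-cnm-ingest-stream \
  --partition-key example-partition-key \
  --data '{
    "version": "1.0",
    "provider": "EXAMPLE_PROVIDER",
    "collection": "EXAMPLE_COLLECTION",
    "identifier": "example-granule-id",
    "product": {
      "name": "example-granule-id",
      "dataVersion": "001",
      "files": [
        {
          "type": "data",
          "name": "example-granule-id.hdf",
          "uri": "s3://example-bucket/example-granule-id.hdf",
          "size": 12345,
          "checksumType": "md5",
          "checksum": "replace-with-checksum"
        }
      ]
    }
  }'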
    Version: v11.0.0

    Locating S3 Access Logs

When enabling S3 Access Logs for EMS Reporting, you configured a TargetBucket and TargetPrefix. The raw S3 access logs can be found in the TargetBucket under the TargetPrefix.

    In a standard deployment, this will be your stack's <internal bucket name> and a key prefix of <stack>/ems-distribution/s3-server-access-logs/
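For example, the logs could be listed with the AWS CLI, substituting your internal bucket and stack name for the placeholders:

aws s3 ls s3://<internal bucket name>/<stack>/ems-distribution/s3-server-access-logs/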

Naming Executions

    In the following excerpt, the QueueGranules config.executionNamePrefix property is set using the value configured in the workflow's meta.executionNamePrefix.

    Please note: This meta.executionNamePrefix property should not be confused with the optional rule executionNamePrefix property from the previous section. Setting executionNamePrefix as a root property of the rule will set a prefix for the names of any workflows triggered by the rule. Setting meta.executionNamePrefix on the rule will set meta.executionNamePrefix in the workflow messages generated for this rule, allowing workflow steps like QueueGranules to read from the message meta.executionNamePrefix for their config. Then, workflows scheduled by QueueGranules would use the configured execution name prefix.

    Setting executionNamePrefix config for QueueGranules using rule.meta

    If you wanted to use a prefix of "my-prefix", you would create a rule with a meta property similar to the following Rule snippet:

{
  ...other rule keys here...
  "meta": {
    "executionNamePrefix": "my-prefix"
  }
}

    The value of meta.executionNamePrefix from the rule will be set as meta.executionNamePrefix in the workflow message.

    Then, the workflow could contain a "QueueGranules" step with the following state, which uses meta.executionNamePrefix from the message as the value for the executionNamePrefix config to the "QueueGranules" step:

{
  "QueueGranules": {
    "Parameters": {
      "cma": {
        "event.$": "$",
        "ReplaceConfig": {
          "FullMessage": true
        },
        "task_config": {
          "queueUrl": "${start_sf_queue_url}",
          "provider": "{$.meta.provider}",
          "internalBucket": "{$.meta.buckets.internal.name}",
          "stackName": "{$.meta.stack}",
          "granuleIngestWorkflow": "${ingest_granule_workflow_name}",
          "executionNamePrefix": "{$.meta.executionNamePrefix}"
        }
      }
    },
    "Type": "Task",
    "Resource": "${queue_granules_task_arn}",
    "Retry": [
      {
        "ErrorEquals": [
          "Lambda.ServiceException",
          "Lambda.AWSLambdaException",
          "Lambda.SdkClientException"
        ],
        "IntervalSeconds": 2,
        "MaxAttempts": 6,
        "BackoffRate": 2
      }
    ],
    "Catch": [
      {
        "ErrorEquals": [
          "States.ALL"
        ],
        "ResultPath": "$.exception",
        "Next": "WorkflowFailed"
      }
    ],
    "End": true
  },
}
    Version: v11.0.0

    Trigger a Workflow Execution

    To trigger a workflow, you need to create a rule. To trigger an ingest workflow, one that requires discovering and ingesting data, you will also need to configure the collection and provider and associate those to a rule.

    Trigger a HelloWorld Workflow

    To trigger a HelloWorld workflow that does not need to discover or archive data, you just need to create a rule.

    You can leave the provider and collection blank and do not need any additional metadata. If you create a onetime rule, the workflow execution will start momentarily and you can view its status on the Executions page.

    Trigger an Ingest Workflow

    To ingest data, you will need a provider and collection configured to tell your workflow where to discover data and where to archive the data respectively.

    Follow the instructions to create a provider and create a collection and configure their fields for your data ingest.

In the rule's additional metadata, you can specify a provider_path that tells the workflow where on the provider to discover the data.

    Example: Ingest data from S3

    Setup

    Assume there are 2 files to be ingested in an S3 bucket called discovery-bucket, located in the test-data folder:

    • GRANULE.A2017025.jpg
    • GRANULE.A2017025.hdf

    Archive buckets should already be created and mapped to public / private / protected in the Cumulus deployment.

    For example:

buckets = {
  private = {
    name = "discovery-bucket"
    type = "private"
  },
  protected = {
    name = "archive-protected"
    type = "protected"
  }
  public = {
    name = "archive-public"
    type = "public"
  }
}

    Create a provider

    Create a new provider. Set protocol to S3 and Host to discovery-bucket.

    Screenshot of adding a sample S3 provider

    Create a collection

    Create a new collection. Configure the collection to extract the granule id from the filenames and configure where to store the granule files.

The configuration below will store hdf files in the protected bucket and jpg files in the public bucket. The bucket types correspond to the bucket definitions in the deployment's buckets configuration shown above.

{
  "name": "test-collection",
  "version": "001",
  "granuleId": "^GRANULE\\.A[\\d]{7}$",
  "granuleIdExtraction": "(GRANULE\\..*)(\\.hdf|\\.jpg)",
  "reportToEms": false,
  "sampleFileName": "GRANULE.A2017025.hdf",
  "files": [
    {
      "bucket": "protected",
      "regex": "^GRANULE\\.A[\\d]{7}\\.hdf$",
      "sampleFileName": "GRANULE.A2017025.hdf"
    },
    {
      "bucket": "public",
      "regex": "^GRANULE\\.A[\\d]{7}\\.jpg$",
      "sampleFileName": "GRANULE.A2017025.jpg"
    }
  ]
}

    Create a rule

    Create a rule to trigger the workflow to discover your granule data and ingest your granule.

    Select the previously created provider and collection. See the Cumulus Discover Granules workflow for a workflow example of using Cumulus tasks to discover and queue data for ingest.

    In the rule meta, set the provider_path to test-data, so the test-data folder will be used to discover new granules.

    Screenshot of adding a Discover Granules rule
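As a rough, equivalent sketch using the Cumulus API instead of the dashboard form: the request below assumes a rules endpoint shaped like the other API examples in these docs, and the workflow name, provider id, and rule name are placeholders you would replace with the values from your deployment.

$ curl --request POST https://example.com/rules \
--header 'Authorization: Bearer ReplaceWithTheToken' \
--header 'Content-Type: application/json' \
--data '{
  "name": "s3_discovery_rule",
  "workflow": "DiscoverGranules",
  "provider": "s3_provider",
  "collection": {
    "name": "test-collection",
    "version": "001"
  },
  "meta": {
    "provider_path": "test-data"
  },
  "rule": {
    "type": "onetime"
  },
  "state": "ENABLED"
}'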

    A onetime rule will run your workflow on-demand and you can view it on the dashboard Executions page. The Cumulus Discover Granules workflow will trigger an ingest workflow and your ingested granules will be visible on the dashboard Granules page.

    Version: v11.0.0

    Cumulus Tasks

    A list of reusable Cumulus tasks. Add your own.

    Tasks

    @cumulus/add-missing-file-checksums

    Add checksums to files in S3 which don't have one


    @cumulus/discover-granules

    Discover Granules in FTP/HTTP/HTTPS/SFTP/S3 endpoints


    @cumulus/discover-pdrs

    Discover PDRs in FTP and HTTP endpoints


    @cumulus/files-to-granules

    Converts array-of-files input into a granules object by extracting granuleId from filename


    @cumulus/hello-world

    Example task


    @cumulus/hyrax-metadata-updates

    Update granule metadata with hooks to OPeNDAP URL


    @cumulus/lzards-backup

    Run LZARDS backup


    @cumulus/move-granules

    Move granule files from staging to final location


    @cumulus/parse-pdr

    Download and Parse a given PDR


    @cumulus/pdr-status-check

    Checks execution status of granules in a PDR


    @cumulus/post-to-cmr

    Post a given granule to CMR


    @cumulus/queue-granules

    Add discovered granules to the queue


    @cumulus/queue-pdrs

    Add discovered PDRs to a queue


    @cumulus/queue-workflow

    Add workflow to the queue


    @cumulus/sf-sqs-report

    Sends an incoming Cumulus message to SQS


    @cumulus/sync-granule

    Download a given granule


    @cumulus/test-processing

    Fake processing task used for integration tests


    @cumulus/update-cmr-access-constraints

    Updates CMR metadata to set access constraints


@cumulus/update-granules-cmr-metadata-file-links

Update CMR metadata files with correct online access urls and etags and transfer etag info to granules' CMR files

    Version: v11.0.0

    How to Troubleshoot and Fix Issues

    While Cumulus is a complex system, there is a focus on maintaining the integrity and availability of the system and data. Should you encounter errors or issues while using this system, this section will help troubleshoot and solve those issues.

    Backup and Restore

    Cumulus has backup and restore functionality built-in to protect Cumulus data and allow recovery of a Cumulus stack. This is currently limited to Cumulus data and not full S3 archive data. Backup and restore is not enabled by default and must be enabled and configured to take advantage of this feature.

    For more information, read the Backup and Restore documentation.

    Elasticsearch reindexing

    If you run into issues with your Elasticsearch index, a reindex operation is available via the Cumulus API. See the Reindexing Guide.

    Information on how to reindex Elasticsearch is in the Cumulus API documentation.

    Troubleshooting Workflows

Workflows are state machines composed of tasks and services, and each component logs to CloudWatch. The CloudWatch logs for all steps in the execution are displayed in the Cumulus dashboard, or you can find them by going to CloudWatch and navigating to the logs for that particular task.

    Workflow Errors

    Visual representations of executed workflows can be found in the Cumulus dashboard or the AWS Step Functions console for that particular execution.

    If a workflow errors, the error will be handled according to the error handling configuration. The task that fails will have the exception field populated in the output, giving information about the error. Further information can be found in the CloudWatch logs for the task.

    Graph of AWS Step Function execution showing a failing workflow

    Workflow Did Not Start

    Generally, first check your rule configuration. If that is satisfactory, the answer will likely be in the CloudWatch logs for the schedule SF or SF starter lambda functions. See the workflow triggers page for more information on how workflows start.

    For Kinesis and SNS rules specifically, if an error occurs during the message consumer process, the fallback consumer lambda will be called and if the message continues to error, a message will be placed on the dead letter queue. Check the dead letter queue for a failure message. Errors can be traced back to the CloudWatch logs for the message consumer and the fallback consumer. Additionally, check that the name and version match those configured in your rule, as rules are filtered by the notification's collection name and version before scheduling executions.
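As a sketch, assuming the queues in your deployment follow the usual <prefix>- naming (the exact dead letter queue name here is an assumption; check your deployment for the real name), you could locate the queue and peek at its messages with the AWS CLI:

# List queues for your deployment prefix (prefix is a placeholder)
aws sqs list-queues --queue-name-prefix my-cumulus-prefix

# Peek at a few messages on the suspected dead letter queue (queue URL is a placeholder)
aws sqs receive-message \
  --queue-url https://sqs.us-east-1.amazonaws.com/123456789012/my-cumulus-prefix-kinesisFailure \
  --max-number-of-messages 5 \
  --visibility-timeout 0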

    More information on kinesis error handling is here.

    Operator API Errors

    All operator API calls are funneled through the ApiEndpoints lambda. Each API call is logged to the ApiEndpoints CloudWatch log for your deployment.
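For example, assuming the API Lambda follows the typical <prefix>-ApiEndpoints naming (confirm the actual log group name in CloudWatch), recent API logs can be tailed with AWS CLI v2:

aws logs tail /aws/lambda/my-cumulus-prefix-ApiEndpoints --since 1h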

    Lambda Errors

    KMS Exception: AccessDeniedException

    KMS Exception: AccessDeniedExceptionKMS Message: The ciphertext refers to a customer master key that does not exist, does not exist in this region, or you are not allowed to access.

The above error was thrown by a Cumulus Lambda function invocation. The KMS key is the encryption key used to encrypt the Lambda's environment variables. The root cause of this error is unknown, but it is speculated to be caused by deleting and recreating, with the same name, the IAM role the Lambda uses.

    This error can be resolved by switching the lambda's execution role to a different one and then back through the Lambda management console. Unfortunately, this approach doesn't scale well.

The other known resolution (which scales better but takes some time) is as follows:

    1. Comment out all lambda definitions (and dependent resources) in your Terraform configuration.
    2. terraform apply to delete the lambdas.
    3. Un-comment the definitions.
    4. terraform apply to recreate the lambdas.

If this problem occurs with Core lambdas and you are using the terraform-aws-cumulus.zip file source distributed in our release, we recommend using the non-scaling approach: the number of lambdas we distribute is in the low teens, and they are likely to be easier and faster to reconfigure one by one than it would be to edit our configs.

    Error: Unable to import module 'index': Error

    This error is shown in the CloudWatch logs for a Lambda function.

    One possible cause is that the Lambda definition in the .tf file defining the lambda is not pointing to the correct packaged lambda source file. In order to resolve this issue, update the lambda definition to point directly to the packaged (e.g. .zip) lambda source file.

    resource "aws_lambda_function" "discover_granules_task" {
    function_name = "${var.prefix}-DiscoverGranules"
    filename = "${path.module}/../../tasks/discover-granules/dist/lambda.zip"
    handler = "index.handler"
    }

    If you are seeing this error when using the Lambda as a step in a Cumulus workflow, then inspect the output for this Lambda step in the AWS Step Function console. If you see the error Cannot find module 'node_modules/@cumulus/cumulus-message-adapter-js', then you need to ensure the lambda's packaged dependencies include cumulus-message-adapter-js.
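As a rough sketch of one way to fix that, assuming a Node.js task packaged from a local source directory (the packaging step depends on your task's build setup and is shown here only as an example):

# From the task's source directory, add the adapter as a runtime dependency
npm install --save @cumulus/cumulus-message-adapter-js

# Re-create the deployment package so node_modules is included
zip -r dist/lambda.zip index.js node_modules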

Reindexing Elasticsearch Guide

…current index, or the mappings for an index have been updated (they do not update automatically). Any reindexing that will be required when upgrading Cumulus will be in the Migration Steps section of the changelog.

    Switch to a new index and Reindex

There are two operations needed: a reindex to copy the data, and a change-index to switch over to the new index. They can be done in either order, but each order has its trade-offs.

If you decide to point Cumulus to a new (empty) index first (with a change index operation), and then Reindex the data to the new index, data ingested while reindexing will automatically be sent to the new index. As reindexing operations can take a while, not all the data will show up on the Cumulus Dashboard right away. The advantage is you do not have to turn off any ingest operations. This approach is recommended.

    If you decide to Reindex data to a new index first, and then point Cumulus to that new index, it is not guaranteed that data that is sent to the old index while reindexing will show up in the new index. If you prefer this way, it is recommended to turn off any ingest operations. This order will keep your dashboard data from seeing any interruption.

    Change Index

    This will point Cumulus to the index in Elasticsearch that will be used when retrieving data. Performing a change index operation to an index that does not exist yet will create the index for you. The change index operation can be found here.

    Reindex from the old index to the new index

The reindex operation will take the data from one index and copy it into another index. The reindex operation can be found here.

    Reindex status

    Reindexing is a long-running operation. The reindex-status endpoint can be used to monitor the progress of the operation.

    Index from database

    If you want to just grab the data straight from the database you can perform an Index from Database Operation. After the data is indexed from the database, a Change Index operation will need to be performed to ensure Cumulus is pointing to the right index. It is strongly recommended to turn off workflow rules when performing this operation so any data ingested to the database is not lost.
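As a sketch only, these operations are invoked through the Cumulus API. The endpoint paths and payload fields below are assumptions based on the API's Elasticsearch endpoints; confirm them against the Cumulus API documentation for your version.

# Point Cumulus at the new index (creates it if it does not exist yet)
$ curl --request POST https://example.com/elasticsearch/change-index \
--header 'Authorization: Bearer ReplaceWithTheToken' \
--header 'Content-Type: application/json' \
--data '{"currentIndex": "cumulus-2020-11-3", "newIndex": "cumulus-2021-3-4"}'

# Start a reindex from the old index to the new one
$ curl --request POST https://example.com/elasticsearch/reindex \
--header 'Authorization: Bearer ReplaceWithTheToken' \
--header 'Content-Type: application/json' \
--data '{"sourceIndex": "cumulus-2020-11-3", "destIndex": "cumulus-2021-3-4"}'

# Monitor progress
$ curl --request GET https://example.com/elasticsearch/reindex-status \
--header 'Authorization: Bearer ReplaceWithTheToken'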

    Validate reindex

To validate the reindex, use the reindex-status endpoint. The doc count can be used to verify that the reindex was successful. In the example below, the reindex from cumulus-2020-11-3 to cumulus-2021-3-4 was not fully successful, as the two indices show different doc counts.

    "indices": {
    "cumulus-2020-11-3": {
    "primaries": {
    "docs": {
    "count": 21096512,
    "deleted": 176895
    }
    },
    "total": {
    "docs": {
    "count": 21096512,
    "deleted": 176895
    }
    }
    },
    "cumulus-2021-3-4": {
    "primaries": {
    "docs": {
    "count": 715949,
    "deleted": 140191
    }
    },
    "total": {
    "docs": {
    "count": 715949,
    "deleted": 140191
    }
    }
    }
    }

    To further drill down into what is missing, log in to the Kibana instance (found in the Elasticsearch section of the AWS console) and run the following command replacing <index> with your index name.

GET <index>/_search
{
  "aggs": {
    "count_by_type": {
      "terms": {
        "field": "_type"
      }
    }
  },
  "size": 0
}

    which will produce a result like

    "aggregations": {
    "count_by_type": {
    "doc_count_error_upper_bound": 0,
    "sum_other_doc_count": 0,
    "buckets": [
    {
    "key": "logs",
    "doc_count": 483955
    },
    {
    "key": "execution",
    "doc_count": 4966
    },
    {
    "key": "deletedgranule",
    "doc_count": 4715
    },
    {
    "key": "pdr",
    "doc_count": 1822
    },
    {
    "key": "granule",
    "doc_count": 740
    },
    {
    "key": "asyncOperation",
    "doc_count": 616
    },
    {
    "key": "provider",
    "doc_count": 108
    },
    {
    "key": "collection",
    "doc_count": 87
    },
    {
    "key": "reconciliationReport",
    "doc_count": 48
    },
    {
    "key": "rule",
    "doc_count": 7
    }
    ]
    }
    }

    Resuming a reindex

    If a reindex operation did not fully complete it can be resumed using the following command run from the Kibana instance.

POST _reindex?wait_for_completion=false
{
  "conflicts": "proceed",
  "source": {
    "index": "cumulus-2020-11-3"
  },
  "dest": {
    "index": "cumulus-2021-3-4",
    "op_type": "create"
  }
}

    The Cumulus API reindex-status endpoint can be used to monitor completion of this operation.

    Version: v11.0.0

    Re-running workflow executions

    To re-run a Cumulus workflow execution from the AWS console:

    1. Visit the page for an individual workflow execution

    2. Click the "New execution" button at the top right of the screen

Screenshot of the AWS console for a Step Function execution highlighting the "New execution" button at the top right of the screen

    3. In the "New execution" modal that appears, replace the cumulus_meta.execution_name value in the default input with the value of the new execution ID as seen in the screenshot below

      Screenshot of the AWS console showing the modal window for entering input when running a new Step Function execution

    4. Click the "Start execution" button

Troubleshooting Deployment

…data-persistence modules, but your config is only creating one Elasticsearch instance. To fix the issue, update the elasticsearch_config variable for your data-persistence module to increase the number of instances:

{
  domain_name    = "es"
  instance_count = 2
  instance_type  = "t2.small.elasticsearch"
  version        = "5.3"
  volume_size    = 10
}

    Install dashboard

    Dashboard configuration

    Issues:

• Problem clearing the cache: EACCES: permission denied, rmdir '/tmp/gulp-cache/default'. This probably means the files at that location, and/or the folder, are owned by someone else (or some other factor prevents you from writing there).

It's possible to work around this by editing the file cumulus-dashboard/node_modules/gulp-cache/index.js and altering the value of the line var fileCache = new Cache({cacheDirName: 'gulp-cache'}); to something like var fileCache = new Cache({cacheDirName: '<prefix>-cache'});. Now gulp-cache will be able to write to /tmp/<prefix>-cache/default, and the error should resolve.

    Dashboard deployment

    Issues:

• If the dashboard sends you to an Earthdata Login page that shows an error reading "Invalid request, please verify the client status or redirect_uri before resubmitting", this means you've either forgotten to update one or more of your EARTHDATA_CLIENT_ID and EARTHDATA_CLIENT_PASSWORD environment variables (from your app/.env file) and re-deploy Cumulus, you haven't placed the correct values in them, or you've forgotten to add both the "redirect" and "token" URLs to the Earthdata Application (a sample app/.env sketch is shown after this list).
    • There is odd caching behavior associated with the dashboard and Earthdata Login at this point in time that can cause the above error to reappear on the Earthdata Login page loaded by the dashboard even after fixing the cause of the error. If you experience this, attempt to access the dashboard in a new browser window, and it should work.
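For reference, a minimal sketch of the relevant app/.env entries (the variable names are taken from the text above; the values are placeholders):

# app/.env
EARTHDATA_CLIENT_ID=replace-with-your-earthdata-app-client-id
EARTHDATA_CLIENT_PASSWORD=replace-with-your-earthdata-app-password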
    Version: v11.0.0

    Migrate from TEA deployment to Cumulus Distribution

    Background

    The Cumulus Distribution API is configured to use the AWS Cognito OAuth client. This API can be used instead of the Thin Egress App, which is the default distribution API if using the Deployment Template.

    Configuring a Cumulus Distribution deployment

    See these instructions for deploying the Cumulus Distribution API.

    Important note if migrating from TEA to Cumulus Distribution

    If you already have a deployment using the TEA distribution and want to switch to Cumulus Distribution, there will be an API Gateway change. This means that there will be downtime while you update your CloudFront endpoint to use the new API gateway.

    Version: v11.0.0

    Migrate TEA deployment to standalone module

    Background

    This document is only relevant for upgrades of Cumulus from versions < 3.x.x to versions > 3.x.x

Previous versions of Cumulus included deployment of the Thin Egress App (TEA) by default in the distribution module. As a result, Cumulus users who wanted to deploy a new version of TEA had to wait on a new release of Cumulus that incorporated that release.

In order to give Cumulus users the flexibility to deploy newer versions of TEA whenever they want, deployment of TEA has been removed from the distribution module and Cumulus users must now add the TEA module to their deployment. Guidance on integrating the TEA module into your deployment is provided, or you can refer to the Cumulus core example deployment code for the thin_egress_app module.

    By default, when upgrading Cumulus and moving from TEA deployed via the distribution module to deployed as a separate module, your API gateway for TEA would be destroyed and re-created, which could cause outages for any Cloudfront endpoints pointing at that API gateway.

    These instructions outline how to modify your state to preserve your existing Thin Egress App (TEA) API gateway when upgrading Cumulus and moving deployment of TEA to a standalone module. If you do not care about preserving your API gateway for TEA when upgrading your Cumulus deployment, you can skip these instructions.

    Prerequisites

    Notes about state management

    These instructions will involve manipulating your Terraform state via terraform state mv commands. These operations are extremely dangerous, since a mistake in editing your Terraform state can leave your stack in a corrupted state where deployment may be impossible or may result in unanticipated resource deletion.

    Since bucket versioning preserves a separate version of your state file each time it is written, and the Terraform state modification commands overwrite the state file, we can mitigate the risk of these operations by downloading the most recent state file before starting the upgrade process. Then, if anything goes wrong during the upgrade, we can restore that previous state version. Guidance on how to perform both operations is provided below.

    Download your most recent state version

    Run this command to download the most recent cumulus deployment state file, replacing BUCKET and KEY with the correct values from cumulus-tf/terraform.tf:

     aws s3 cp s3://BUCKET/KEY /path/to/terraform.tfstate

    Restore a previous state version

    Upload the state file that was previously downloaded to the bucket/key for your state file, replacing BUCKET and KEY with the correct values from cumulus-tf/terraform.tf:

     aws s3 cp /path/to/terraform.tfstate s3://BUCKET/KEY

    Then run terraform plan, which will give an error because we manually overwrote the state file and it is now out of sync with the lock table Terraform uses to track your state file:

    Error: Error loading state: state data in S3 does not have the expected content.

    This may be caused by unusually long delays in S3 processing a previous state
    update. Please wait for a minute or two and try again. If this problem
    persists, and neither S3 nor DynamoDB are experiencing an outage, you may need
    to manually verify the remote state and update the Digest value stored in the
    DynamoDB table to the following value: <some-digest-value>

    To resolve this error, run this command and replace DYNAMO_LOCK_TABLE, BUCKET and KEY with the correct values from cumulus-tf/terraform.tf, and use the digest value from the previous error output:

aws dynamodb put-item \
  --table-name DYNAMO_LOCK_TABLE \
  --item '{
    "LockID": {"S": "BUCKET/KEY-md5"},
    "Digest": {"S": "some-digest-value"}
  }'

    Now, if you re-run terraform plan, it should work as expected.

    Migration instructions

    Please note: These instructions assume that you are deploying the thin_egress_app module as shown in the Cumulus core example deployment code

    1. Ensure that you have downloaded the latest version of your state file for your cumulus deployment

    2. Find the URL for your <prefix>-thin-egress-app-EgressGateway API gateway. Confirm that you can access it in the browser and that it is functional.

    3. Run terraform plan. You should see output like (edited for readability):

      # module.thin_egress_app.aws_cloudformation_stack.thin_egress_app will be created
      + resource "aws_cloudformation_stack" "thin_egress_app" {

      # module.thin_egress_app.aws_s3_bucket.lambda_source will be created
      + resource "aws_s3_bucket" "lambda_source" {

      # module.thin_egress_app.aws_s3_bucket_object.cloudformation_template will be created
      + resource "aws_s3_bucket_object" "cloudformation_template" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive will be created
      + resource "aws_s3_bucket_object" "lambda_code_dependency_archive" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_source will be created
      + resource "aws_s3_bucket_object" "lambda_source" {

      # module.thin_egress_app.aws_security_group.egress_lambda[0] will be created
      + resource "aws_security_group" "egress_lambda" {

      ...

      # module.cumulus.module.distribution.module.thin_egress_app.aws_cloudformation_stack.thin_egress_app will be destroyed
      - resource "aws_cloudformation_stack" "thin_egress_app" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket.lambda_source will be destroyed
      - resource "aws_s3_bucket" "lambda_source" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.cloudformation_template will be destroyed
      - resource "aws_s3_bucket_object" "cloudformation_template" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive will be destroyed
      - resource "aws_s3_bucket_object" "lambda_code_dependency_archive" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_source will be destroyed
      - resource "aws_s3_bucket_object" "lambda_source" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_security_group.egress_lambda[0] will be destroyed
      - resource "aws_security_group" "egress_lambda" {
    4. Run the state modification commands. The commands must be run in exactly this order:

       # Move security group
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_security_group.egress_lambda module.thin_egress_app.aws_security_group.egress_lambda

      # Move TEA storage bucket
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket.lambda_source module.thin_egress_app.aws_s3_bucket.lambda_source

      # Move TEA lambda source code
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_source module.thin_egress_app.aws_s3_bucket_object.lambda_source

      # Move TEA lambda dependency code
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive

      # Move TEA Cloudformation template
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.cloudformation_template module.thin_egress_app.aws_s3_bucket_object.cloudformation_template

      # Move URS creds secret version
      terraform state mv module.cumulus.module.distribution.aws_secretsmanager_secret_version.thin_egress_urs_creds aws_secretsmanager_secret_version.thin_egress_urs_creds

      # Move URS creds secret
      terraform state mv module.cumulus.module.distribution.aws_secretsmanager_secret.thin_egress_urs_creds aws_secretsmanager_secret.thin_egress_urs_creds

      # Move TEA Cloudformation stack
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_cloudformation_stack.thin_egress_app module.thin_egress_app.aws_cloudformation_stack.thin_egress_app

      Depending on how you were supplying a bucket map to TEA, there may be an additional step. If you were specifying the bucket_map_key variable to the cumulus module to use a custom bucket map, then you can ignore this step and just ensure that the bucket_map_file variable to the TEA module uses that same S3 key. Otherwise, if you were letting Cumulus generate a bucket map for you, then you need to take this step to migrate that bucket map:

      # Move bucket map
      terraform state mv module.cumulus.module.distribution.aws_s3_bucket_object.bucket_map_yaml[0] aws_s3_bucket_object.bucket_map_yaml
    5. Run terraform plan again. You may still see a few additions/modifications pending like below, but you should not see any deletion of Thin Egress App resources pending:

      # module.thin_egress_app.aws_cloudformation_stack.thin_egress_app will be updated in-place
      ~ resource "aws_cloudformation_stack" "thin_egress_app" {

      # module.thin_egress_app.aws_s3_bucket_object.cloudformation_template will be updated in-place
      ~ resource "aws_s3_bucket_object" "cloudformation_template" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive will be updated in-place
      ~ resource "aws_s3_bucket_object" "lambda_code_dependency_archive" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_source will be updated in-place
      ~ resource "aws_s3_bucket_object" "lambda_source" {

      If you still see deletion of module.cumulus.module.distribution.module.thin_egress_app.aws_cloudformation_stack.thin_egress_app pending, then something went wrong and you should restore the previously downloaded state file version and start over from step 1. Otherwise, proceed to step 6.

    6. Once you have confirmed that everything looks as expected, run terraform apply.

    7. Visit the same API gateway from step 1 and confirm that it still works.

    Your TEA deployment has now been migrated to a standalone module, which gives you the ability to upgrade the deployed version of TEA independently of Cumulus releases.

    Version: v11.0.0

    Upgrade to CMA 2.0.2

    Updating a Cumulus Deployment to CMA 2.0.2

    Background

    The Cumulus Message Adapter has been updated in release 2.0.2 to no longer utilize the AWS step function API to look up the defined name of a step function task for population in meta.workflow_tasks, but instead use an incrementing integer field.

Additionally, a bugfix was released in the form of v2.0.1/v2.0.2 following the initial 2.0.0 release, so all users should update to release 2.0.2.

The update is not tied to a particular version of Core; however, it should be done across all task components in order to ensure consistent execution records.

    Changes

    Execution Record Update

This update functionally means that Cumulus tasks/activities using the CMA will now write a record that looks like the following in meta.workflow_tasks, and more importantly in the tasks column for an execution record:

    Original

          "DiscoverGranules": {
    "name": "jk-tf-DiscoverGranules",
    "version": "$LATEST",
    "arn": "arn:aws:lambda:us-east-1:xxxxx:function:jk-tf-DiscoverGranules"
    },
    "QueueGranules": {
    "name": "jk-tf-QueueGranules",
    "version": "$LATEST",
    "arn": "arn:aws:lambda:us-east-1:xxxx:function:jk-tf-QueueGranules"
    }

    New

          "0": {
    "name": "jk-tf-DiscoverGranules",
    "version": "$LATEST",
    "arn": "arn:aws:lambda:us-east-1:xxxxx:function:jk-tf-DiscoverGranules"
    },
    "1": {
    "name": "jk-tf-QueueGranules",
    "version": "$LATEST",
    "arn": "arn:aws:lambda:us-east-1:xxxx:function:jk-tf-QueueGranules"
    }

    Actions Required

    The following should be done as part of a Cumulus stack update to utilize cumulus message adapter > 2.0.2:

    • Python tasks that utilize cumulus-message-adapter-python should be updated to use > 2.0.0, their lambdas rebuilt and Cumulus workflows reconfigured to use the updated version.

    • Python activities that utilize cumulus-process-py should be rebuilt using > 1.0.0 with updated dependencies, and have their images deployed/Cumulus configured to use the new version.

    • The cumulus-message-adapter v2.0.2 lambda layer should be made available in the deployment account, and the Cumulus deployment should be reconfigured to use it (via the cumulus_message_adapter_lambda_layer_version_arn variable in the cumulus module). This should address all Core node.js tasks that utilize the CMA, and many contributed node.js/JAVA components.

    Once the above have been done, redeploy Cumulus to apply the configuration and the updates should be live.

    Version: v11.0.0

    Updates to task granule file schemas

    Background

    Most Cumulus workflow tasks expect as input a payload of granule(s) which contain the files for each granule. Most tasks also return this same granule structure as output.

    However, up to this point, there was inconsistency in the schemas for the granule files objects expected by each task. Furthermore, there was no guarantee of consistency between granule files objects as stored in the database and the expectations of any given workflow task.

    Thus, when performing bulk granule operations which pass granules from the database into a Cumulus workflow, it was possible for there to be schema validation failures depending on which task was used to start the workflow and its particular schema.

    In order to rectify this situation, CUMULUS-2388 was filed and addressed to create a common granule files schema between nearly all of the Cumulus tasks (exceptions discussed below) and the Cumulus database. The following documentation explains the manual changes you need to make to your deployment in order to be compatible with the updated files schema.

    Updated files schema

    The updated granule files schema can be found here.

    These former properties were deprecated (with notes about how to derive the same information from the updated schema, if possible):

    • filename - concatenate the bucket and key values with a directory separator (/)
    • name - use fileName property
    • etag - ETags are no longer provided as an individual file property. Instead, a separate etags object mapping S3 URIs to ETag values is provided as output from the following workflow tasks (guidance on how to integrate this output with your workflows is provided in the Upgrading your workflows section below):
      • update-granules-cmr-metadata-file-links
      • hyrax-metadata-updates
    • fileStagingDir - no longer supported
    • url_path - no longer supported
    • duplicate_found - This property is no longer supported, however sync-granule and move-granules now produce a separate granuleDuplicates object as part of their output. The granuleDuplicates object is a map of granules by granule ID which includes the files that encountered duplicates during processing. Guidance on how to integrate granuleDuplicates information into your workflow configuration is provided below.

    Exceptions

    These workflow tasks did not have their schema for granule files updated:

    • discover-granules - no updates
    • queue-granules - no updates
    • parse-pdr - no updates
    • sync-granule - input schema not updated, output schema was updated

    The reason that these task schemas were not updated is that all of these tasks start before the files have been ingested to S3, thus much of the information that is required in the updated files schema like bucket, key, or checksum is not yet known.

    Bulk granule operations

    Since the input schema for the above tasks was not updated, that means you cannot run bulk granule operations against workflows if they start with any of those tasks. Bulk granule operations work by loading the specified granules from the database and sending them as input to a specified workflow, so if the specified workflow begins with a task whose input schema does not conform to what is coming out of the database, there will be schema errors.

    Upgrading your deployment

    Upgrading your workflows

    For any workflows using the update-granules-cmr-metadata-file-links task before the hyrax-metadata-updates and/or post-to-cmr tasks, update the step definition for update-granules-cmr-metadata-file-links as follows:

        "UpdateGranulesCmrMetadataFileLinksStep": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "buckets": "{$.meta.buckets}",
    "distribution_endpoint": "{$.meta.distribution_endpoint}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.etags}",
    "destination": "{$.meta.file_etags}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    ...more configuration...

    hyrax-metadata-updates

    For any workflows using the hyrax-metadata-updates task before a post-to-cmr task, update the definition of the hyrax-metadata-updates step as follows:

        "HyraxMetadataUpdatesTask": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "bucket": "{$.meta.buckets.internal.name}",
    "stack": "{$.meta.stack}",
    "cmr": "{$.meta.cmr}",
    "launchpad": "{$.meta.launchpad}",
    "etags": "{$.meta.file_etags}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.etags}",
    "destination": "{$.meta.file_etags}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    ...more configuration...

    post-to-cmr

    For any workflows using post-to-cmr task after the update-granules-cmr-metadata-file-links or hyrax-metadata-updates tasks, update the post-to-cmr step definition as follows:

        "CmrStep": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "bucket": "{$.meta.buckets.internal.name}",
    "stack": "{$.meta.stack}",
    "cmr": "{$.meta.cmr}",
    "launchpad": "{$.meta.launchpad}",
    "etags": "{$.meta.file_etags}"
    }
    }
    },
    ...more configuration...

    Example workflow

    For an example workflow integrating all of these changes, please see our example ingest and publish workflow.

    Optional - Integrate granuleDuplicates information

    Please note that the granuleDuplicates output is purely informational and does not have any bearing on the separate configuration for how duplicates should be handled.

    You can include granuleDuplicates output from the sync-granule or move-granules tasks in your workflow messages like so:

        "SyncGranule": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    ...other config...
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.granuleDuplicates}",
    "destination": "{$.meta.sync_granule.granule_duplicates}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    }
    ...more configuration...

The result of this configuration is that the granuleDuplicates output from sync-granule would be placed in meta.sync_granule.granule_duplicates on the workflow message and remain there throughout the rest of the workflow. The same configuration could be replicated for the move-granules task, but be sure to use a different destination in the workflow message for the granuleDuplicates output.

    Updating collection URL path templates

    Collections can specify url_path templates to dynamically generate the final location of files. As part of url_path templates, file object properties can be interpolated to generate the file path. Thus, these url_path templates need to be updated to ensure that they are compatible with the updated files schema and the properties that will actually be available on file objects.

    See the notes on the updated files schema to know which properties are available and which previously existing properties were deprecated.

    As an example, you will want to update any url_path properties in your collections to remove references to file.name and replace them with references to file.fileName like so:

    - "url_path": "{cmrMetadata.CollectionReference.ShortName}___{cmrMetadata.CollectionReference.Version}/{substring(file.name, 0, 3)}",
    + "url_path": "{cmrMetadata.CollectionReference.ShortName}___{cmrMetadata.CollectionReference.Version}/{substring(file.fileName, 0, 3)}",
Upgrade to RDS release

| cutoffSeconds | number | Number of seconds prior to this execution to 'cutoff' reconciliation queries. This allows in-progress/other in-flight operations time to complete and propagate to Elasticsearch/Dynamo/postgres. | 3600 |
| dbConcurrency | number | Sets max number of parallel collections reports the script will run at a time. | 20 |
| dbMaxPool | number | Sets the maximum number of connections the database pool has available. Modifying this may result in unexpected failures. | 20 |

    Version: v11.0.0

    Upgrade to TF version 0.13.6

    Background

Cumulus pins its support to a specific version of Terraform; see the deployment documentation. The reason for only supporting one specific Terraform version at a time is to avoid deployment errors that can be caused by deploying to the same target with different Terraform versions.

    Cumulus is upgrading its supported version of Terraform from 0.12.12 to 0.13.6. This document contains instructions on how to perform the upgrade for your deployments.

    Prerequisites

    • Follow the Terraform guidance for what to do before upgrading, notably ensuring that you have no pending changes to your Cumulus deployments before proceeding.
      • You should do a terraform plan to see if you have any pending changes for your deployment (for both the data-persistence-tf and cumulus-tf modules), and if so, run a terraform apply before doing the upgrade to Terraform 0.13.6
    • Review the Terraform v0.13 release notes to prepare for any breaking changes that may affect your custom deployment code. Cumulus' deployment code has already been updated for compatibility with version 0.13.
• Install Terraform version 0.13.6. We recommend using Terraform Version Manager tfenv to manage your installed versions of Terraform, but this is not required.

    Upgrade your deployment code

    Terraform 0.13 does not support some of the syntax from previous Terraform versions, so you need to upgrade your deployment code for compatibility.

    Terraform provides a 0.13upgrade command as part of version 0.13 to handle automatically upgrading your code. Make sure to check out the documentation on batch usage of 0.13upgrade, which will allow you to upgrade all of your Terraform code with one command.

    Run the 0.13upgrade command until you have no more necessary updates to your deployment code.
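As a rough sketch of the batch approach, assuming a Unix-like shell run from the root of your deployment repository (confirm the exact invocation against the 0.13upgrade documentation):

# Run 0.13upgrade in every directory containing .tf files, skipping .terraform caches
find . -name '*.tf' -not -path '*/.terraform/*' -exec dirname {} \; | sort -u | \
  while read -r dir; do terraform 0.13upgrade -yes "$dir"; done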

    Upgrade your deployment

    1. Ensure that you are running Terraform 0.13.6 by running terraform --version. If you are using tfenv, you can switch versions by running tfenv use 0.13.6.

    2. For the data-persistence-tf and cumulus-tf directories, take the following steps:

      1. Run terraform init --reconfigure. The --reconfigure flag is required, otherwise you might see an error like:

        Error: Failed to decode current backend config

        The backend configuration created by the most recent run of "terraform init"
        could not be decoded: unsupported attribute "lock_table". The configuration
        may have been initialized by an earlier version that used an incompatible
        configuration structure. Run "terraform init -reconfigure" to force
        re-initialization of the backend.
      2. Run terraform apply to perform a deployment.

        WARNING: Even if Terraform says that no resource changes are pending, running the apply using Terraform version 0.13.6 will modify your backend state from version 0.12.12 to version 0.13.6 without requiring approval. Updating the backend state is a necessary part of the version 0.13.6 upgrade, but it is not completely transparent.

Discover Granules

…included in a granule's file list. That is, no such filtering based on filename occurs as described above.

    When set on the task configuration, the value applies to all collections during discovery. Otherwise, this property may be set on individual collections.

    Concurrency

    A number property that determines the level of concurrency with which granule duplicate checks are performed when duplicateGranuleHandling is skip or error.

    Limiting concurrency helps to avoid throttling by the AWS Lambda API and helps to avoid encountering account Lambda concurrency limitations.

    We do not recommend increasing this value unless you are seeing Lambda.Timeout errors when discover-granules discovers a large number of granules with skip or error duplicate handling. However, as increasing the concurrency may lead to Lambda API or Lambda concurrency throttling errors, you may wish to consider converting the discover-granules task to an ECS activity, which does not face similar runtime constraints.

    The default value is 3.

    Task Outputs

    This task outputs an assembled array of Cumulus granule objects as the payload for the next task, and returns only the expected payload for the next task.

    Version: v11.0.0

    Files To Granules

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions and more can be found on the auto-generated Cumulus Tasks page.

    Summary

    This task utilizes the incoming config.inputGranules and the task input list of s3 URIs along with the rest of the configuration objects to take the list of incoming files and sort them into a list of granule objects.

Please note: files passed in without metadata previously defined in config.inputGranules will have the following keys added:

    • size
    • bucket
    • key
    • fileName

    It is primarily intended to support compatibility with the standard output of a processing task, and convert that output into a granule object accepted as input by the majority of other Cumulus tasks.

    Task Inputs

    Input

    This task expects an incoming input that contains an array of 'staged' S3 URIs to move to their final archive location.

    For the specifics, see the Cumulus Tasks page entry for the schema.

    Configuration

    This task does expect values to be set in the workflow_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Below are expanded descriptions of selected config keys:

    inputGranules

    An array of Cumulus granule objects.

    This object will be used to define metadata values for the move granules task, and is the basis for the updated object that will be added to the output.

    Task Outputs

    This task outputs an assembled array of Cumulus granule objects as the payload for the next task, and returns only the expected payload for the next task.

    Version: v11.0.0

    LZARDS Backup

    The LZARDS backup task takes an array of granules and initiates backup requests to the LZARDS API, which will be handled asynchronously by LZARDS.

    Deployment

    The LZARDS backup task is not automatically deployed with Cumulus. To deploy the task through the Cumulus module, first you must specify a lzards_launchpad_passphrase in your terraform variables (e.g. variables.tf) like so:

    variable "lzards_launchpad_passphrase" {
    type = string
    default = ""
    }

    Then you can specify a value for your lzards_launchpad_passphrase in terraform.tfvars like so:

lzards_launchpad_passphrase = "your-passphrase"

    Lastly, you need to make sure that the lzards_launchpad_passphrase is passed into the Cumulus module (in main.tf) like so:

    lzards_launchpad_passphrase  = var.lzards_launchpad_passphrase

    In short, deploying the LZARDS task requires configuring a passphrase variable and ensuring that your TF configuration passes that variable into the Cumulus module.

Additional Terraform configuration for the LZARDS task can be found in the cumulus module's variables.tf file, where the relevant variables are prefixed with lzards_. You can add these variables to your deployment using the same process outlined above for lzards_launchpad_passphrase.
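
For example, one quick way to list these variables, assuming you have the cumulus module source (and its variables.tf) available locally, is:

grep -n 'variable "lzards_' variables.tf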

    Task Inputs

    Input

    This task expects an array of granules as input.

    For the specifics, see the Cumulus Tasks page entry for the schema.

    Configuration

    This task does expect values to be set in the workflow_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Task Outputs

    Output

    The LZARDS task outputs a composite object containing:

    • the input granules array, and
    • a backupResults object that describes the results of LZARDS backup attempts.

    For the specifics, see the Cumulus Tasks page entry for the schema.

    Version: v11.0.0

    Move Granules

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions and more can be found on the auto-generated Cumulus Tasks page.

    Summary

    This task utilizes the incoming event.input array of Cumulus granule objects to do the following:

    • Move granules from their 'staging' location to the final location (as configured in the Sync Granules task)

    • Update the event.input object with the new file locations.

• If the granule has an ECHO10/UMM CMR file (.cmr.xml or .cmr.json) included in the event.input:

      • Update that file's access locations

      • Add it to the appropriate access URL category for the CMR filetype as defined by granule CNM filetype.

      • Set the CMR file to 'metadata' in the output granules object and add it to the granule files if it's not already present.

        Please note: Granules without a valid CNM type set in the granule file type field in event.input will be treated as "data" in the updated CMR metadata file

• The task then outputs an updated list of granule objects.

    Task Inputs

    Input

    This task expects an incoming input that contains a list of 'staged' S3 URIs to move to their final archive location. If CMR metadata is to be updated for a granule, it must also be included in the input.

    For the specifics, see the Cumulus Tasks page entry for the schema.

    Configuration

    This task does expect values to be set in the workflow_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Input

    This task expects event.input to provide an array of Cumulus granule objects. The files listed for each granule represent the files to be acted upon as described in summary.

    Task Outputs

    This task outputs an assembled array of Cumulus granule objects with post-move file locations as the payload for the next task, and returns only the expected payload for the next task. If a CMR file has been specified for a granule object, the CMR resources related to the granule files will be updated according to the updated granule file metadata.

    Examples

    See the SIPS workflow cookbook for an example of this task in a workflow

    Version: v11.0.0

    Parse PDR

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions and more can be found on the auto-generated Cumulus Tasks page.

    Summary

    The purpose of this task is to do the following with the incoming PDR object:

    • Stage it to an internal S3 bucket

    • Parse the PDR

    • Archive the PDR and remove the staged file if successful

• Output a payload object containing metadata about the parsed PDR (e.g. total size of all files, file counts, etc.) and a granules object

The constructed granules object is created using PDR metadata to determine values like data type and version, and collection definitions to determine the file storage location based on the extracted data type and version number.

    Granule file types are converted from the PDR spec types to CNM types according to the following translation table:

      HDF: 'data',
    HDF-EOS: 'data',
    SCIENCE: 'data',
    BROWSE: 'browse',
    METADATA: 'metadata',
    BROWSE_METADATA: 'metadata',
    QA_METADATA: 'metadata',
    PRODHIST: 'qa',
    QA: 'metadata',
    TGZ: 'data',
    LINKAGE: 'data'

Files missing file types will have none assigned, while files with invalid types will result in a PDR parse failure.

    Task Inputs

    Input

    This task expects an incoming input that contains name and path information about the PDR to be parsed. For the specifics, see the Cumulus Tasks page entry for the schema.

    Configuration

    This task does expect values to be set in the workflow_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Below are expanded descriptions of selected config keys:

    Provider

    A Cumulus provider object. Used to define connection information for retrieving the PDR.

    Bucket

    Defines the bucket where the 'pdrs' folder for parsed PDRs will be stored.

    Collection

    A Cumulus collection object. Used to define granule file groupings and granule metadata for discovered files.

    Task Outputs

This task outputs a single payload output object containing metadata about the parsed PDR (e.g. filesCount, totalSize, etc.), a pdr object with information for later steps, and the generated array of granule objects.

    Examples

    See the SIPS workflow cookbook for an example of this task in a workflow

    Version: v11.0.0

    Queue Granules

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions, and more can be found on the auto-generated Cumulus Tasks page.

    Summary

    The purpose of this task is to schedule ingest of granules that were discovered on a remote host, whether via the DiscoverGranules task or the ParsePDR task.

The task utilizes a defined collection in concert with a defined provider (either set on each granule or passed in via config) to queue up ingest executions for each granule, or for batches of granules.

The constructed granules object is defined by the collection passed in the configuration, and has impacts on other provided core Cumulus Tasks.

    Users of this task in a workflow are encouraged to carefully consider their configuration in context of downstream tasks and workflows.

    Task Inputs

Each of the following sections is a high-level discussion of the intent of the various input/output/config values.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Input

    This task expects an incoming input that contains granules and information about them and their files. For the specifics, see the Cumulus Tasks page entry for the schema.

    This input is most commonly the output from a preceding DiscoverGranules or ParsePDR task.

    Cumulus Configuration

    This task does expect values to be set in the task_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Below are expanded descriptions of selected config keys:

    provider

    A Cumulus provider object for the originating provider. Will be passed along to the ingest workflow. This will be overruled by more specific provider information that may exist on a granule.

    internalBucket

    The Cumulus internal system bucket.

    granuleIngestWorkflow

    A string property that denotes the name of the ingest workflow into which granules should be queued.

    queueUrl

    A string property that denotes the URL of the queue to which scheduled execution messages are sent.

    preferredQueueBatchSize

    A number property that sets an upper bound on the size of each batch of granules queued into the payload of an ingest execution. Setting this property to a value higher than 1 allows queueing of multiple granules per ingest workflow.

    As ingest executions typically expect granules in the payload to have a common collection and common provider, this property only sets an upper bound within which batches will be created based on common collection and provider information.

    This means batches may be smaller than the preferred size if collection or provider information diverge, but never larger.

    The default value if none is specified is 1, which will queue one ingest execution per granule.

    concurrency

    A number property that determines the level of concurrency with which ingest executions are scheduled. Granules or batches of granules will be queued up into executions at this level of concurrency.

    This property is also used to limit concurrency when updating granule status to queued.

    Limiting concurrency helps to avoid throttling by the AWS Lambda API and helps to avoid encountering account Lambda concurrency limitations.

    We do not recommend increasing this value unless you are seeing Lambda.Timeout errors when queue-granules receives a large number of granules as input. However, as increasing the concurrency may lead to Lambda API or Lambda concurrency throttling errors, you may wish to consider converting the queue-granules task to an ECS activity, which does not face similar runtime constraints.

    The default value is 3.

    executionNamePrefix

    A string property that will prefix the names of scheduled executions.

    childWorkflowMeta

    An object property that will be merged into the scheduled execution input's meta field.

    Task Outputs

    This task outputs an assembled array of workflow execution ARNs for all scheduled workflow executions within the payload's running object.

    Version: v11.0.0

    Cumulus Tasks: Message Flow

Cumulus Workflows are composed of Cumulus Tasks, which are either AWS Lambda tasks or AWS Elastic Container Service (ECS) activities. Cumulus Tasks permit a payload as input to the main task application code. The task payload is additionally wrapped by the Cumulus Message Adapter. The Cumulus Message Adapter supplies additional information supporting message templating and metadata management of these workflows.

    Diagram showing how incoming and outgoing Cumulus messages for workflow steps are handled by the Cumulus Message Adapter

    The steps in this flow are detailed in sections below.

    Cumulus Message Format

    A full Cumulus Message has the following keys:

    • cumulus_meta: System runtime information that should generally not be touched outside of Cumulus library code or the Cumulus Message Adapter. Stores meta information about the workflow such as the state machine name and the current workflow execution's name. This information is used to look up the current active task. The name of the current active task is used to look up the corresponding task's config in task_config.
    • meta: Runtime information captured by the workflow operators. Stores execution-agnostic variables.
    • payload: Payload is runtime information for the tasks.

    In addition to the above keys, it may contain the following keys:

    • replace: A key generated in conjunction with the Cumulus Message adapter. It contains the location on S3 for a message payload and a Target JSON path in the message to extract it to.
• exception: A key used to track workflow exceptions; it should not be modified outside of Cumulus library code.

    Here's a simple example of a Cumulus Message:

    {
    "task_config": {
    "inlinestr": "prefix{meta.foo}suffix",
    "array": "{[$.meta.foo]}",
    "object": "{$.meta}"
    },
    "cumulus_meta": {
    "message_source": "sfn",
    "state_machine": "arn:aws:states:us-east-1:1234:stateMachine:MySfn",
    "execution_name": "MyExecution__id-1234",
    "id": "id-1234"
    },
    "meta": {
    "foo": "bar"
    },
    "payload": {
    "anykey": "anyvalue"
    }
    }

    A message utilizing the Cumulus Remote message functionality must have at least the keys replace and cumulus_meta. Depending on configuration other portions of the message may be present, however the cumulus_meta, meta, and payload keys must be present once extraction is complete.

    {
    "replace": {
    "Bucket": "cumulus-bucket",
    "Key": "my-large-event.json",
    "TargetPath": "$"
    },
    "cumulus_meta": {}
    }

    Cumulus Message Preparation

    The event coming into a Cumulus Task is assumed to be a Cumulus Message and should first be handled by the functions described below before being passed to the task application code.

    Preparation Step 1: Fetch remote event

    Fetch remote event will fetch the full event from S3 if the cumulus message includes a replace key.

    Once "my-large-event.json" is fetched from S3, it's returned from the fetch remote event function. If no "replace" key is present, the event passed to the fetch remote event function is assumed to be a complete Cumulus Message and returned as-is.

    Preparation Step 2: Parse step function config from CMA configuration parameters

This step determines which task is currently being executed. Note that this is different from which Lambda or activity is being executed, because the same Lambda or activity can be used for different tasks. The current task name is used to load the appropriate configuration from the Cumulus Message's 'task_config' configuration parameter.

    Preparation Step 3: Load nested event

    Using the config returned from the previous step, load nested event resolves templates for the final config and input to send to the task's application code.

    Task Application Code

    After message prep, the message passed to the task application code is of the form:

    {
    "input": {},
    "config": {}
    }

    Create Next Message functions

    Whatever comes out of the task application code is used to construct an outgoing Cumulus Message.

    Create Next Message Step 1: Assign outputs

    The config loaded from the Fetch step function config step may have a cumulus_message key. This can be used to "dispatch" fields from the task's application output to a destination in the final event output (via URL templating). Here's an example where the value of input.anykey would be dispatched as the value of payload.out in the final cumulus message:

    {
    "task_config": {
    "bar": "baz",
    "cumulus_message": {
    "input": "{$.payload.input}",
    "outputs": [
    {
    "source": "{$.input.anykey}",
    "destination": "{$.payload.out}"
    }
    ]
    }
    },
    "cumulus_meta": {
    "task": "Example",
    "message_source": "local",
    "id": "id-1234"
    },
    "meta": {
    "foo": "bar"
    },
    "payload": {
    "input": {
    "anykey": "anyvalue"
    }
    }
    }

    Create Next Message Step 2: Store remote event

    If the ReplaceConfiguration parameter is set, the configured key's value will be stored in S3 and the final output of the task will include a replace key that contains configuration for a future step to extract the payload on S3 back into the Cumulus Message. The replace key identifies where the large event node has been stored in S3.

    Version: v11.0.0

    Creating a Cumulus Workflow

    The Cumulus workflow module

To facilitate adding workflows to your deployment, Cumulus provides a workflow module.

    In combination with the Cumulus message, the workflow module provides a way to easily turn a Step Function definition into a Cumulus workflow, complete with:

    Using the module also ensures that your workflows will continue to be compatible with future versions of Cumulus.

    For more on the full set of current available options for the module, please consult the module README.

    Adding a new Cumulus workflow to your deployment

    To add a new Cumulus workflow to your deployment that is using the cumulus module, add a new workflow resource to your deployment directory, either in a new .tf file, or to an existing file.

    The workflow should follow a syntax similar to:

    module "my_workflow" {
    source = "https://github.com/nasa/cumulus/releases/download/vx.x.x/terraform-aws-cumulus-workflow.zip"

    prefix = "my-prefix"
    name = "MyWorkflowName"
    system_bucket = "my-internal-bucket"

    workflow_config = module.cumulus.workflow_config

    tags = { Deployment = var.prefix }

    state_machine_definition = <<JSON
    {}
    JSON
    }

In the above example, you would add your state_machine_definition using the Amazon States Language, referencing tasks you've developed as well as Cumulus core tasks that are made available as part of the cumulus terraform module.

    Please note: Cumulus follows the convention of tagging resources with the prefix variable { Deployment = var.prefix } that you pass to the cumulus module. For resources defined outside of Core, it's recommended that you adopt this convention as it makes resources and/or deployment recovery scenarios much easier to manage.
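
Once the workflow resource has been added, deploying it is a normal Terraform operation from your deployment directory (a sketch):

terraform init     # fetches the newly referenced workflow module source
terraform plan     # review the state machine and related resources to be created
terraform apply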

    Examples

    For a functional example of a basic workflow, please take a look at the hello_world_workflow.

    For more complete/advanced examples, please read the following cookbook entries/topics:

    Version: v11.0.0

    Developing Workflow Tasks

    Workflow tasks can be either AWS Lambda Functions or ECS Activities.

    Lambda functions

    The full set of available core Lambda functions can be found in the deployed cumulus module zipfile at /tasks, as well as reference documentation here. These Lambdas can be referenced in workflows via the outputs from that module (see the cumulus-template-deploy repo for an example).

The source for these tasks is located in the Cumulus repository at cumulus/tasks.

    You can also develop your own Lambda function. See the Lambda Functions page to learn more.

    ECS Activities

    ECS activities are supported via the cumulus_ecs_module available from the Cumulus release page.

    Please read the module README for configuration details.

    For assistance in creating a task definition within the module read the AWS Task Definition Docs.

    For a step-by-step example of using the cumulus_ecs_module, please see the related cookbook entry.

    Cumulus Docker Image

ECS activities require a Docker image. Cumulus provides a Docker image (source) for Node 12.x+ Lambdas on Docker Hub: cumuluss/cumulus-ecs-task.
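
If you want to inspect or pin the image locally, it can be pulled like any other Docker Hub image. The tag below is a placeholder; choose one from the repository's tag list:

docker pull cumuluss/cumulus-ecs-task:<version-tag>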

    Alternate Docker Images

    Custom docker images/runtimes are supported as are private registries. For details on configuring a private registry/image see the AWS documentation on Private Registry Authentication for Tasks.

Dockerizing Data Processing

...
2) validate the output (in this case just check for existence)
3) use 'ncatted' to update the resulting file to be CF-compliant
4) write out metadata generated for this file

    Process Testing

It is important to have tests for data processing; however, in many cases data files can be large, so it is not practical to store the test data in the repository. Instead, test data is currently stored on AWS S3 and can be retrieved using the AWS CLI.

    aws s3 sync s3://cumulus-ghrc-logs/sample-data/collection-name data

    Where collection-name is the name of the data collection, such as 'avaps', or 'cpl'. For example, an abridged version of the data for CPL includes:

    ├── cpl
    │   ├── input
    │   │   ├── HS3_CPL_ATB_12203a_20120906.hdf5
    │   │   ├── HS3_CPL_OP_12203a_20120906.hdf5
    │   └── output
    │   ├── HS3_CPL_ATB_12203a_20120906.nc
    │   ├── HS3_CPL_ATB_12203a_20120906.nc.meta.xml
    │   ├── HS3_CPL_OP_12203a_20120906.nc
    │   ├── HS3_CPL_OP_12203a_20120906.nc.meta.xml

Contained in the input directory are all possible sets of data files, while the output directory contains the expected results of processing. In this case the hdf5 files are converted to NetCDF files and XML metadata files are generated.

    The docker image for a process can be used on the retrieved test data. First create a test-output directory in the newly created data directory.

    mkdir data/test-output

    Then run the docker image using docker-compose.

    docker-compose run test

This will process the data in the data/input directory and put the output into data/test-output. Repositories also include Python-based tests which validate this newly created output against the contents of data/output. Use Python's Nose tool to run the included tests.

    nosetests

If the data/test-output directory validates against the contents of data/output, the tests will pass; otherwise an error will be reported.

    Version: v11.0.0

    Workflows

Workflows are composed of one or more AWS Lambda Functions and ECS Activities to discover, ingest, process, manage and archive data.

    Provider data ingest and GIBS have a set of common needs in getting data from a source system and into the cloud where they can be distributed to end users. These common needs are:

    • Data Discovery - Crawling, polling, or detecting changes from a variety of sources.
    • Data Transformation - Taking data files in their original format and extracting and transforming them into another desired format such as visible browse images.
    • Archival - Storage of the files in a location that's accessible to end users.

    The high level view of the architecture and many of the individual steps are the same but the details of ingesting each type of collection differs. Different collection types and different providers have different needs. The individual boxes of a workflow are not only different. The branching, error handling, and multiplicity of the arrows connecting the boxes are also different. Some need visible images rendered from component data files from multiple collections. Some need to contact the CMR with updated metadata. Some will have different retry strategies to handle availability issues with source data systems.

    AWS and other cloud vendors provide an ideal solution for parts of these problems but there needs to be a higher level solution to allow the composition of AWS components into a full featured solution. The Ingest Workflow Architecture is designed to meet the needs for Earth Science data ingest and transformation.

    Goals

    Flexibility and Composability

The steps to ingest and process data are different for each collection within a provider. Ingest should be as flexible as possible in the rearranging of steps and configuration.

    We want to use lego-like individual steps that can be composed by an operator.

    Individual steps should ...

    • Be as ignorant as possible of the overall flow. They should not be aware of previous steps.
    • Be runnable on their own.
    • Define their input and output in simple data structures.
    • Be domain agnostic.
• Not make assumptions about specifics of what goes into a granule, for example.

    Scalable

The ingest architecture needs to be scalable, both to handle ingesting hundreds of millions of granules and to interpret dozens of different workflows.

    Data Provenance

    • We should have traceability for how data was produced and where it comes from.
    • Use immutable representations of data. Data once received is not overwritten. Data can be removed for cleanup.
    • All software is versioned. We can trace transformation of data by tracking the immutable source data and the versioned software applied to it.

    Operator Visibility and Control

    • Operators should be able to see and understand everything that is happening in the system.
    • It should be obvious why things are happening and straightforward to diagnose problems.
• We generally assume that the operators know best in terms of the limits on a provider's infrastructure, how often things need to be done, and the details of a collection. The architecture should defer to their decisions and knowledge while providing safety nets to prevent problems.

    A Reconfigurable Workflow Architecture

    The Ingest Workflow Architecture is defined by two entity types, Workflows and Tasks. A Workflow is a set of composed Tasks to complete an objective such as ingesting a granule. Tasks are the individual steps of a Workflow that perform one job. The workflow is responsible for executing the right task based on the current state and response from the last task executed. Tasks are completely decoupled in that they don't call each other or even need to know about the presence of other tasks.

    Workflows and tasks are configured as Terraform resources, which are triggered via configured rules within Cumulus.

    Diagram showing the Step Function execution path through workflow tasks for a collection ingest

    See the Example GIBS Ingest Architecture showing how workflows and tasks are used to define the GIBS Ingest Architecture.

    Workflows

    A workflow is a provider-configured set of steps that describe the process to ingest data. Workflows are defined using AWS Step Functions.

    Benefits of AWS Step Functions

AWS Step Functions are described in detail in the AWS documentation, but in brief they provide several benefits which are applicable here.

    • Prebuilt solution
    • Operations Visibility
      • Visual diagram
      • Every execution is recorded with both inputs and output for every step.
    • Composability
      • Allow composing AWS Lambdas and code running in other steps. Code can be run in EC2 to interface with it or even on premise if desired.
      • Step functions allow specifying when steps run in parallel or choices between steps based on data from the previous step.
    • Flexibility
• Step Functions are designed to make it easy to build new applications and to reconfigure them. We're exposing that flexibility directly to the provider.
    • Reliability and Error Handling
      • Step functions allow configuration of retries and adding handling of error conditions.
    • Described via data
      • This makes it easy to save the step function in configuration management solutions.
      • We can build simple interfaces on top of the flexibility provided.

    Workflow Scheduler

    The scheduler is responsible for initiating a step function and passing in the relevant data for a collection. This is currently configured as an interval for each collection. The scheduler service creates the initial event by combining the collection configuration with the AWS execution context defined via the cumulus terraform module.

    Tasks

    A workflow is composed of tasks. Each task is responsible for performing a discrete step of the ingest process. These can be activities like:

    • Crawling a provider website for new data.
    • Uploading data from a provider to S3.
    • Executing a process to transform data.

    AWS Step Functions permit tasks to be code running anywhere, even on premise. We expect most tasks will be written as Lambda functions in order to take advantage of the easy deployment, scalability, and cost benefits provided by AWS Lambda.

    • Leverages Existing Work
      • The design leverages the existing work of Amazon by defining workflows using the AWS Step Function State Language. This is the language that was created for describing the state machines used in AWS Step Functions.
    • Open for Extension
      • Both meta and task_config which are used for configuring at the collection and task levels do not dictate the fields and structure of the configuration. Additional task specific JSON schemas can be used for extending the validation of individual steps.
    • Data-centric Configuration
      • The use of a single JSON configuration file allows this to be added to a workflow. We build additional support on top of the configuration file for simpler domain specific configuration or interactive GUIs.

    For more details on Task Messages and Configuration, visit Cumulus configuration and message protocol documentation.

    Ingest Deploy

    To view deployment documentation, please see the Cumulus deployment documentation.

    Tradeoffs, and Benefits

    This section documents various tradeoffs and benefits of the Ingest Workflow Architecture.

    Tradeoffs

    Workflow execution is handled completely by AWS

This means we can't add our own code into the orchestration of the workflow. We can't add new features not supported by Step Functions. We can't do things like enforce that the responses from tasks always conform to a schema, or extract the configuration for a task ahead of its execution.

If we implemented our own orchestration we'd be able to add all of these. We save significant amounts of development effort and gain all the features of Step Functions for this trade off. One workaround is to provide a library of common task capabilities. These would optionally be available to tasks that can be implemented with Node.js and are able to include the library.

    Workflow Configuration is specified in AWS Step Function States Language

The current design combines the states language defined by AWS with Ingest-specific configuration. This means our representation has a tight coupling with their standard. If they make backwards-incompatible changes in the future, we will have to deal with existing projects written against that.

We avoid having to develop our own standard and code to process it. The design can support new features in AWS Step Functions without needing changes to the Ingest library code. It is unlikely they will make a backwards-incompatible change at this point. One mitigation, if that were to happen, is writing data transformations to a new format.

    Collection Configuration Flexibility vs Complexity

The Collections Configuration File is very flexible but requires more knowledge of AWS Step Functions to configure. A person modifying this file directly would need to be comfortable editing a JSON file and configuring AWS Step Functions state transitions which address AWS resources.

The configuration file itself is not necessarily meant to be edited by a human directly. Since we are developing a reconfigurable, composable architecture that is specified entirely in data, additional tools can be developed on top of it. The existing recipes.json files can be mapped to this format. Operational tools like a GUI can be built that provide a usable interface for customizing workflows, but it will take time to develop these tools.

    Benefits

    This section describes benefits of the Ingest Workflow Architecture.

    Simplicity

    The concepts of Workflows and Tasks are simple ones that should make sense to providers. Additionally, the implementation will only consist of a few components because the design leverages existing services and capabilities of AWS. The Ingest implementation will only consist of some reusable task code to make task implementation easier, Ingest deployment, and the Workflow Scheduler.

    Composability

    The design aims to satisfy the needs for ingest integrating different workflows for providers. It's flexible in terms of the ability to arrange tasks to meet the needs of a collection. Providers have developed and incorporated open source tools over the years. All of these are easily integrable into the workflows as tasks.

    There is low coupling between task steps. Failures of one component don't bring the whole system down. Individual tasks can be deployed separately.

    Scalability

AWS Step Functions scale up as needed and aren't limited by a set number of servers. They also easily allow you to leverage the inherent scalability of serverless functions.

    Monitoring and Auditing

    • Every execution is captured.
    • Every task run has captured input and outputs.
• CloudWatch Metrics can be used for monitoring many of the events with Step Functions. It can also generate alarms for the whole process.
    • Visual report of the entire configuration.
      • Errors and success states are highlighted visually in the flow.

    Data Provenance

    • Monitoring and auditing ensures we know the data that was given to a task.
    • Workflows are versioned and the state machines stored in AWS Step Functions are immutable. Once created they cannot change.
    • Versioning of data in S3 or using immutable records in S3 will mean we always know what data was created as the result of a step or fed into a step.

    Appendix

    Example GIBS Ingest Architecture

    This shows the GIBS Ingest Architecture as an example of the use of the Ingest Workflow Architecture.

    • The GIBS Ingest Architecture consists of two workflows per collection type. There is one for discovery and one for ingest. The final stage of discovery triggers multiple ingest workflows for each MRF granule that needs to be generated.
    • It demonstrates both lambdas as tasks and a container used for MRF generation.

    GIBS Ingest Workflows

    Diagram showing the AWS Step Function execution path for a GIBS ingest workflow

    GIBS Ingest Granules Workflow

This shows a visualization of an execution of the ingest granules workflow in Step Functions. The steps highlighted in green are the ones that executed and completed successfully.

    Diagram showing the AWS Step Function execution path for a GIBS ingest granules workflow

    Version: v11.0.0

    Workflow Inputs & Outputs

    General Structure

    Cumulus uses a common format for all inputs and outputs to workflows. The same format is used for input and output from workflow steps. The common format consists of a JSON object which holds all necessary information about the task execution and AWS environment. Tasks return objects identical in format to their input with the exception of a task-specific payload field. Tasks may also augment their execution metadata.

    Cumulus Message Adapter

    The Cumulus Message Adapter and Cumulus Message Adapter libraries help task developers integrate their tasks into a Cumulus workflow. These libraries adapt input and outputs from tasks into the Cumulus Message format. The Scheduler service creates the initial event message by combining the collection configuration, external resource configuration, workflow configuration, and deployment environment settings. The subsequent workflow messages between tasks must conform to the message schema. By using the Cumulus Message Adapter, individual task Lambda functions only receive the input and output specifically configured for the task, and not non-task-related message fields.

    The Cumulus Message Adapter libraries are called by the tasks with a callback function containing the business logic of the task as a parameter. They first adapt the incoming message to a format more easily consumable by Cumulus tasks, then invoke the task, and then adapt the task response back to the Cumulus message protocol to be sent to the next task.

    A task's Lambda function can be configured to include a Cumulus Message Adapter library which constructs input/output messages and resolves task configurations. The CMA can then be included in one of several ways:

    Lambda Layer

In order to make use of this configuration, a Lambda layer must be uploaded to your account. Due to platform restrictions, Core cannot currently support sharable public layers; however, you can deploy the appropriate version from the release page in two ways:
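
For illustration, one option is to publish the downloaded zip as a layer yourself with the AWS CLI. This is only a sketch: it assumes you have already downloaded the CMA layer zip from the release page (shown here as cumulus-message-adapter.zip), and the layer name is arbitrary:

aws lambda publish-layer-version \
  --layer-name my-cumulus-message-adapter \
  --description "Cumulus Message Adapter" \
  --zip-file fileb://cumulus-message-adapter.zip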

    Once you've deployed the layer, integrate the CMA layer with your Lambdas:

    • If using the cumulus module, set the cumulus_message_adapter_lambda_layer_version_arn in your .tfvars file to integrate the CMA layer with all core Cumulus lambdas.
    • If including your own Lambda or ECS task Terraform modules, specify the CMA layer ARN in the Terraform resource definitions. Also, make sure to set the CUMULUS_MESSAGE_ADAPTER_DIR environment variable for the task to /opt for the CMA integration to work properly.

    In the future if you wish to update/change the CMA version you will need to update the deployed CMA, and update the layer configuration for the impacted Lambdas as needed.

    Please Note: Updating/removing a layer does not change a deployed Lambda, so to update the CMA you should deploy a new version of the CMA layer, update the associated Lambda configuration to reference the new CMA version, and re-deploy your Lambdas.

    Manual Addition

You can include the CMA package in the Lambda code in the cumulus-message-adapter sub-directory of your Lambda .zip, for any Lambda runtime that includes a Python runtime. Python 2 is included in Lambda runtimes that use Amazon Linux; however, Amazon Linux 2 will not support this directly.

    Please note: It is expected that upcoming Cumulus releases will update the CMA layer to include a python runtime.

    If you are manually adding the message adapter to your source and utilizing the CMA, you should set the Lambda's CUMULUS_MESSAGE_ADAPTER_DIR environment variable to target the installation path for the CMA.
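
A rough sketch of this manual packaging approach is shown below; the my-lambda directory and my-lambda.zip names are placeholders, and it assumes the CMA zip has already been downloaded from its release page:

# Unpack the CMA into a cumulus-message-adapter sub-directory of your Lambda package
# (adjust the target path if the zip already contains a cumulus-message-adapter directory)
mkdir -p my-lambda/cumulus-message-adapter
unzip cumulus-message-adapter.zip -d my-lambda/cumulus-message-adapter

# Build the deployment package; the Lambda's CUMULUS_MESSAGE_ADAPTER_DIR environment
# variable should then target the cumulus-message-adapter directory inside the deployed package
(cd my-lambda && zip -r ../my-lambda.zip .)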

    CMA Input/Output

Input to the task application code is a JSON object with the following keys:

    • input: By default, the incoming payload is the payload output from the previous task, or it can be a portion of the payload as configured for the task in the corresponding .tf workflow definition file.
    • config: Task-specific configuration object with URL templates resolved.

Output from the task application code is placed in the payload key of the outgoing message by default, but the task's cumulus_message configuration can also be used to return just a portion of the task output.

    CMA configuration

    As of Cumulus > 1.15 and CMA > v1.1.1, configuration of the CMA is expected to be driven by AWS Step Function Parameters.

    Using the CMA package with the Lambda by any of the above mentioned methods (Lambda Layers, manual) requires configuration for its various features via a specific Step Function Parameters configuration format (see sample workflows in the examples cumulus-tf source for more examples):

    {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": "{some config}",
    "task_config": "{some config}"
    }
    }

    The "event.$": "$" parameter is required as it passes the entire incoming message to the CMA client library for parsing, and the CMA itself to convert the incoming message into a Cumulus message for use in the function.

    The following are the CMA's current configuration settings:

    ReplaceConfig (Cumulus Remote Message)

Because of the potential size of a Cumulus message, mainly the payload field, a task can be set via configuration to store a portion of its output on S3, with a Remote Message key that defines how to retrieve it and an empty JSON object {} left in its place. If the portion of the message targeted exceeds the configured MaxSize (defaults to 0 bytes) it will be written to S3.

    The CMA remote message functionality can be configured using parameters in several ways:

    Partial Message

    Setting the Path/Target path in the ReplaceConfig parameter (and optionally a non-default MaxSize)

    {
    "DiscoverGranules": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "MaxSize": 1,
    "Path": "$.payload",
    "TargetPath": "$.payload"
    }
    }
    }
    }
    }

will result in any payload output larger than the MaxSize (in bytes) being written to S3. The CMA will then mark that the key has been replaced via a replace key on the event. When the CMA picks up the replace key in future steps, it will attempt to retrieve the output from S3 and write it back to payload.

    Note that you can optionally use a different TargetPath than Path, however as the target is a JSON path there must be a key to target for replacement in the output of that step. Also note that the JSON path specified must target one node, otherwise the CMA will error, as it does not support multiple replacement targets.

    If TargetPath is omitted, it will default to the value for Path.

    Full Message

    Setting the following parameters for a lambda:

    DiscoverGranules:
    Parameters:
    cma:
    event.$: '$'
    ReplaceConfig:
    FullMessage: true

    will result in the CMA assuming the entire inbound message should be stored to S3 if it exceeds the default max size.

    This is effectively the same as doing:

    {
    "DiscoverGranules": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "MaxSize": 0,
    "Path": "$",
    "TargetPath": "$"
    }
    }
    }
    }
    }

    Cumulus Message example

    {
    "task_config": {
    "inlinestr": "prefix{meta.foo}suffix",
    "array": "{[$.meta.foo]}",
    "object": "{$.meta}"
    },
    "cumulus_meta": {
    "message_source": "sfn",
    "state_machine": "arn:aws:states:us-east-1:1234:stateMachine:MySfn",
    "execution_name": "MyExecution__id-1234",
    "id": "id-1234"
    },
    "meta": {
    "foo": "bar"
    },
    "payload": {
    "anykey": "anyvalue"
    }
    }

    Cumulus Remote Message example

    The message may contain a reference to an S3 Bucket, Key and TargetPath as follows:

    {
    "replace": {
    "Bucket": "cumulus-bucket",
    "Key": "my-large-event.json",
    "TargetPath": "$"
    },
    "cumulus_meta": {}
    }

    task_config

This configuration key contains the input/output configuration values for definition of inputs/outputs via URL paths. Important: these values are all relative to the JSON object configured for event.$.

    This configuration's behavior is outlined in the CMA step description below.

    The configuration should follow the format:

    {
    "FunctionName": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "other_cma_configuration": "<config object>",
    "task_config": "<task config>"
    }
    }
    }
    }

    Example:

    {
    "StepFunction": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "sfnEnd": true,
    "stack": "{$.meta.stack}",
    "bucket": "{$.meta.buckets.internal.name}",
    "stateMachine": "{$.cumulus_meta.state_machine}",
    "executionName": "{$.cumulus_meta.execution_name}",
    "cumulus_message": {
    "input": "{$}"
    }
    }
    }
    }
    }
    }

    Cumulus Message Adapter Steps

    1. Reformat AWS Step Function message into Cumulus Message

    Due to the way AWS handles Parameterized messages, when Parameters are used the CMA takes an inbound message:

    {
    "resource": "arn:aws:lambda:us-east-1:<lambda arn values>",
    "input": {
    "Other Parameter": {},
    "cma": {
    "ConfigKey": {
    "config values": "some config values"
    },
    "event": {
    "cumulus_meta": {},
    "payload": {},
    "meta": {},
    "exception": {}
    }
    }
    }
    }

    and takes the following actions:

    • Takes the object at input.cma.event and makes it the full input
    • Merges all of the keys except event under input.cma into the parent input object

    This results in the incoming message (presumably a Cumulus message) with any cma configuration parameters merged in being passed to the CMA. All other parameterized values defined outside of the cma key are ignored

    2. Resolve Remote Messages

If the incoming Cumulus message has a replace key value, the CMA will attempt to pull the payload from S3.

    For example, if the incoming contains the following:

      "meta": {
    "foo": {}
    },
    "replace": {
    "TargetPath": "$.meta.foo",
    "Bucket": "some_bucket",
    "Key": "events/some-event-id"
    }

    The CMA will attempt to pull the file stored at Bucket/Key and replace the value at TargetPath, then remove the replace object entirely and continue.

    3. Resolve URL templates in the task configuration

In the workflow configuration (defined under the task_config key), each task has its own configuration, and it can use URL templates as values to achieve simplicity or for values only available at execution time. The Cumulus Message Adapter resolves the URL templates (relative to the event configuration key) and then passes the message to the next task. For example, given a task which has the following configuration:

    {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "provider": "{$.meta.provider}",
    "inlinestr": "prefix{meta.foo}suffix",
    "array": "{[$.meta.foo]}",
    "object": "{$.meta}"
    }
    }
    }
    }

and an incoming message that contains:

    {
    "meta": {
    "foo": "bar",
    "provider": {
    "id": "FOO_DAAC",
    "anykey": "anyvalue"
    }
    }
    }

    The corresponding Cumulus Message would contain:

    "meta": {
    "foo": "bar",
    "provider": {
    "id": "FOO_DAAC",
    "anykey": "anyvalue"
    }
    },
    "task_config": {
    "provider": "{$.meta.provider}",
    "inlinestr": "prefix{meta.foo}suffix",
    "array": "{[$.meta.foo]}",
    "object": "{$.meta}"
    }

    The message sent to the task would be:

    "config" : {
    "provider": {
    "id": "FOO_DAAC",
    "anykey": "anyvalue"
    },
    "inlinestr": "prefixbarsuffix",
    "array": ["bar"],
    "object": {
    "foo": "bar",
    "provider": {
    "id": "FOO_DAAC",
    "anykey": "anyvalue"
    }
    },
    },
    "input": "{...}"

    URL template variables replace dotted paths inside curly brackets with their corresponding value. If the Cumulus Message Adapter cannot resolve a value, it will ignore the template, leaving it verbatim in the string. While seemingly complex, this allows significant decoupling of Tasks from one another and the data that drives them. Tasks are able to easily receive runtime configuration produced by previously run tasks and domain data.

    4. Resolve task input

By default, the incoming payload is the payload from the previous task. The task can also be configured to use a portion of the payload as its input message. For example, given a task that specifies cma.task_config.cumulus_message.input:

        ExampleTask:
    Parameters:
    cma:
    event.$: '$'
    task_config:
    cumulus_message:
    input: '{$.payload.foo}'

    The task configuration in the message would be:

        {
    "task_config": {
    "cumulus_message": {
    "input": "{$.payload.foo}"
    }
    },
    "payload": {
    "foo": {
    "anykey": "anyvalue"
    }
    }
    }

The Cumulus Message Adapter will resolve the task input; instead of sending the whole payload as task input, the task input would be:

        {
    "input" : {
    "anykey": "anyvalue"
    },
    "config": {...}
    }

    5. Resolve task output

By default, the task's return value is the next payload. However, the workflow task configuration can specify a portion of the return value as the next payload, and can also augment values to other fields. Based on the task configuration under cma.task_config.cumulus_message.outputs, the Message Adapter uses a task's return value to output a message as configured by the task-specific config defined under cma.task_config. The Message Adapter dispatches a "source" to a "destination" as defined by URL templates stored in the task-specific cumulus_message.outputs. The value of the task's return value at the "source" URL is used to create or replace the value of the task's return value at the "destination" URL. For example, given a task that specifies cumulus_message.outputs in its workflow configuration as follows:

    {
    "ExampleTask": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$}",
    "destination": "{$.payload}"
    },
    {
    "source": "{$.output.anykey}",
    "destination": "{$.meta.baz}"
    }
    ]
    }
    }
    }
    }
    }
    }

    The corresponding Cumulus Message would be:

        {
    "task_config": {
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$}",
    "destination": "{$.payload}"
    },
    {
    "source": "{$.output.anykey}",
    "destination": "{$.meta.baz}"
    }
    ]
    }
    },
    "meta": {
    "foo": "bar"
    },
    "payload": {
    "anykey": "anyvalue"
    }
    }

    Given the response from the task is:

        {
          "output": {
            "anykey": "boo"
          }
        }

    The Cumulus Message Adapter would output the following Cumulus Message:

        {
          "task_config": {
            "cumulus_message": {
              "outputs": [
                {
                  "source": "{$}",
                  "destination": "{$.payload}"
                },
                {
                  "source": "{$.output.anykey}",
                  "destination": "{$.meta.baz}"
                }
              ]
            }
          },
          "meta": {
            "foo": "bar",
            "baz": "boo"
          },
          "payload": {
            "output": {
              "anykey": "boo"
            }
          }
        }

    6. Apply Remote Message Configuration

    If the ReplaceConfig configuration parameter is defined, the CMA will evaluate the configuration options provided and, if required, write a portion of the Cumulus Message to S3 and add a replace key to the message for future steps to utilize.

    Please note: the non-user-modifiable field cumulus_meta will always be retained in the message, regardless of the configuration.
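    As a hedged sketch (not a verbatim Cumulus workflow definition), a step that wants the CMA to offload the full message to S3 when necessary might pass the same FullMessage flag shown in the example below alongside event.$ and task_config in its cma Parameters; the task_config contents here are just the provider template reused from earlier examples:

    {
      "ExampleTask": {
        "Parameters": {
          "cma": {
            "event.$": "$",
            "ReplaceConfig": {
              "FullMessage": true
            },
            "task_config": {
              "provider": "{$.meta.provider}"
            }
          }
        }
      }
    }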

    For example, if the output Cumulus Message (after the output configuration has been applied) looks like:

        {
          "cumulus_meta": {
            "some_key": "some_value"
          },
          "ReplaceConfig": {
            "FullMessage": true
          },
          "task_config": {
            "cumulus_message": {
              "outputs": [
                {
                  "source": "{$}",
                  "destination": "{$.payload}"
                },
                {
                  "source": "{$.output.anykey}",
                  "destination": "{$.meta.baz}"
                }
              ]
            }
          },
          "meta": {
            "foo": "bar",
            "baz": "boo"
          },
          "payload": {
            "output": {
              "anykey": "boo"
            }
          }
        }

    the resultant output would look like:

    {
      "cumulus_meta": {
        "some_key": "some_value"
      },
      "replace": {
        "TargetPath": "$",
        "Bucket": "some-internal-bucket",
        "Key": "events/some-event-id"
      }
    }

    Additional features

    Validate task input, output and configuration messages against the schemas provided

    The Cumulus Message Adapter has the capability to validate task input, output and configuration messages against their schemas. The default location of the schemas is the schemas folder in the top level of the task and the default filenames are input.json, output.json, and config.json. The task can also configure a different schema location. If no schema can be found, the Cumulus Message Adapter will not validate the messages.
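    As a hedged sketch (the exact contents depend entirely on the task; the property names below are illustrative assumptions, not part of any shipped Cumulus task), a minimal schemas/config.json for a task that expects a provider object and an optional bucket string might look like:

    {
      "title": "ExampleTaskConfig",
      "type": "object",
      "properties": {
        "provider": { "type": "object" },
        "bucket": { "type": "string" }
      },
      "required": ["provider"]
    }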

    - + \ No newline at end of file diff --git a/docs/v11.0.0/workflows/lambda/index.html b/docs/v11.0.0/workflows/lambda/index.html index 923755a8b51..4032463d71d 100644 --- a/docs/v11.0.0/workflows/lambda/index.html +++ b/docs/v11.0.0/workflows/lambda/index.html @@ -5,13 +5,13 @@ Develop Lambda Functions | Cumulus Documentation - +
    Version: v11.0.0

    Develop Lambda Functions

    Develop a new Cumulus Lambda

    AWS provides a great getting started guide for building Lambdas in the developer guide.

    Cumulus currently supports the following environments for Cumulus Message Adapter enabled functions: Node.js, Java, and Python (see the sections below).

    Additionally, you may choose to include any of the other languages AWS supports as a resource, with reduced feature support.

    Deploy a Lambda

    Node.js Lambda

    For a new Node.js Lambda, create a new function and add an aws_lambda_function resource to your Cumulus deployment (for examples, see example/lambdas.tf and ingest/lambda-functions.tf in the Cumulus source), either as a new .tf file or added to an existing .tf file:

    resource "aws_lambda_function" "myfunction" {
    function_name = "${var.prefix}-function"
    filename = "/path/to/zip/lambda.zip"
    source_code_hash = filebase64sha256("/path/to/zip/lambda.zip")
    handler = "index.handler"
    role = module.cumulus.lambda_processing_role_arn
    runtime = "nodejs10.x"

    vpc_config {
    subnet_ids = var.subnet_ids
    security_group_ids = var.security_group_ids
    }
    }

    Please note: This example contains the minimum set of required configuration.

    Make sure to include a vpc_config that matches the information you've provided the cumulus module if intending to integrate the lambda with a Cumulus deployment.

    Java Lambda

    Java Lambdas are created in much the same way as the Node.js example above.

    The source points to a folder with the compiled .class files and dependency libraries in the Lambda Java zip folder structure (details here), not an uber-jar.

    The deploy folder referenced here would contain a folder 'test_task/task/' which contains Task.class and TaskLogic.class as well as a lib folder containing dependency jars.

    Python Lambda

    Python Lambdas are created the same way as the Node.js example above.

    Cumulus Message Adapter

    For Lambdas wishing to utilize the Cumulus Message Adapter (CMA), you should define a layers key on your Lambda resource with the CMA you wish to include. See the input_output docs for more on how to create/use the CMA.

    Other Lambda Options

    Cumulus supports all of the options available to you via the aws_lambda_function Terraform resource. For more information on what's available, check out the Terraform resource docs.

    Cloudwatch log groups

    If you want to enable Cloudwatch logging for your Lambda resource, you'll need to add an aws_cloudwatch_log_group resource to your Lambda definition:

    resource "aws_cloudwatch_log_group" "myfunction_log_group" {
    name = "/aws/lambda/${aws_lambda_function.myfunction.function_name}"
    retention_in_days = 30
    tags = { Deployment = var.prefix }
    }
    - + \ No newline at end of file diff --git a/docs/v11.0.0/workflows/protocol/index.html b/docs/v11.0.0/workflows/protocol/index.html index 116f6be944a..2e543ff069f 100644 --- a/docs/v11.0.0/workflows/protocol/index.html +++ b/docs/v11.0.0/workflows/protocol/index.html @@ -5,13 +5,13 @@ Workflow Protocol | Cumulus Documentation - +
    Version: v11.0.0

    Workflow Protocol

    Configuration and Message Use Diagram

    A diagram showing at which point in a workflow the Cumulus message is checked for conformity with the message schema and where the configuration is checked for conformity with the configuration schema

    • Configuration - The Cumulus workflow configuration defines everything needed to describe an instance of Cumulus.
    • Scheduler - This starts ingest of a collection on configured intervals.
    • Input to Step Functions - The Scheduler uses the Configuration as source data to construct the input to the Workflow.
    • AWS Step Functions - Run the workflows as kicked off by the scheduler or other processes.
    • Input to Task - The input for each task is a JSON document that conforms to the message schema.
    • Output from Task - The output of each task must conform to the message schemas as well and is used as the input for the subsequent task (a minimal sketch of this shared message shape follows this list).
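    As a hedged sketch of that shared message shape (the keys under each section are placeholder values borrowed from examples elsewhere in these docs, not a complete schema), a message passed between tasks carries top-level sections such as cumulus_meta, meta, and payload:

    {
      "cumulus_meta": {
        "some_key": "some_value"
      },
      "meta": {
        "foo": "bar"
      },
      "payload": {
        "anykey": "anyvalue"
      }
    }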
    - + \ No newline at end of file diff --git a/docs/v11.0.0/workflows/workflow-configuration-how-to/index.html b/docs/v11.0.0/workflows/workflow-configuration-how-to/index.html index 8015a5f4e06..d1c4b0f4e7a 100644 --- a/docs/v11.0.0/workflows/workflow-configuration-how-to/index.html +++ b/docs/v11.0.0/workflows/workflow-configuration-how-to/index.html @@ -5,7 +5,7 @@ Workflow Configuration How To's | Cumulus Documentation - + @@ -24,7 +24,7 @@ To take a subset of any given metadata, use the option substring.

    "url_path": "{cmrMetadata.Granule.Collection.ShortName}/{substring(file.fileName, 0, 3)}"

    This example will populate the url_path as "MOD09GQ/MOD".

    In addition to substring, several datetime-specific functions are available, which can parse a datetime string in the metadata and extract a certain part of it:

    "url_path": "{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}"

    or

     "url_path": "{dateFormat(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime, YYYY-MM-DD[T]HH[:]mm[:]ss)}"

    The following functions are implemented:

    • extractYear - returns the year, formatted as YYYY
    • extractMonth - returns the month, formatted as MM
    • extractDate - returns the day of the month, formatted as DD
    • extractHour - returns the hour in 24-hour format, with no leading zero
    • dateFormat - takes a second argument describing how to format the date, and passes the metadata date string and the format argument to moment().format()

    Note: the move-granules step needs to be in the workflow for this template to be populated and the file moved. The cmrMetadata, or CMR granule XML, needs to have been generated and stored on S3. From there any field can be retrieved and used for a url_path.

    Adding Metadata dates and times to the URL Path

    There are a number of options to pull dates from the CMR file metadata. With this metadata:

    <Granule>
    <Temporal>
    <RangeDateTime>
    <BeginningDateTime>2003-02-19T00:00:00Z</BeginningDateTime>
    <EndingDateTime>2003-02-19T23:59:59Z</EndingDateTime>
    </RangeDateTime>
    </Temporal>
    </Granule>

    The following examples of url_path could be used.

    {extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the year from the full date: 2003.

    {extractMonth(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the month: 2.

    {extractDate(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the day: 19.

    {extractHour(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the hour: 0.

    Different values can be combined to create the url_path. For example

    {
      "bucket": "sample-protected-bucket",
      "name": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
      "url_path": "{cmrMetadata.Granule.Collection.ShortName}/{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}/{extractDate(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}"
    }

    The final file location for the above would be s3://sample-protected-bucket/MOD09GQ/2003/19/MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.

    - + \ No newline at end of file diff --git a/docs/v11.0.0/workflows/workflow-triggers/index.html b/docs/v11.0.0/workflows/workflow-triggers/index.html index 22e2fd70c3e..7538a5ca5bc 100644 --- a/docs/v11.0.0/workflows/workflow-triggers/index.html +++ b/docs/v11.0.0/workflows/workflow-triggers/index.html @@ -5,13 +5,13 @@ Workflow Triggers | Cumulus Documentation - +
    Version: v11.0.0

    Workflow Triggers

    For a workflow to run, it needs to be associated with a rule (see rule configuration). The rule configuration determines how and when a workflow execution is triggered. Rules can be triggered one time, on a schedule, or by new data written to a kinesis stream.

    There are three lambda functions in the API package responsible for scheduling and starting workflows: SF scheduler, message consumer, and SF starter. Each Cumulus instance comes with a Start SF SQS queue.

    The SF scheduler lambda puts a message onto the Start SF queue. This message is picked up by the Start SF lambda and an execution is started with the body of the message as the input.

    When a one-time rule is created, the schedule SF lambda is triggered. Rules that are not one-time are associated with a CloudWatch event, which manages triggering the lambdas that start the workflows.

    For a scheduled rule, the Cloudwatch event is triggered on the given schedule, which calls the schedule SF lambda directly.

    For a kinesis rule, when data is added to the kinesis stream, the Cloudwatch event is triggered, which calls the message consumer lambda. The message consumer lambda parses the kinesis message and finds all of the rules associated with that message. For each rule (which corresponds to one workflow), the schedule SF lambda is triggered to queue a message to start the workflow.

    For an sns rule, when a message is published to the SNS topic, the message consumer receives the SNS message (JSON expected), parses it into an object, starts a new execution of the workflow associated with the rule and passes the object in the payload field of the Cumulus message.
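    As a hedged illustration (the message body below is an arbitrary example, not a required format), a JSON message published to the rule's SNS topic would end up under the payload field of the Cumulus message that starts the workflow:

    Published SNS message:

    {
      "anykey": "anyvalue"
    }

    Resulting fragment of the workflow's Cumulus message input:

    {
      "payload": {
        "anykey": "anyvalue"
      }
    }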

    Diagram showing how workflows are scheduled via rules

    - + \ No newline at end of file diff --git a/docs/v11.1.0/adding-a-task/index.html b/docs/v11.1.0/adding-a-task/index.html index 12078bdf29b..ffe122bcfc1 100644 --- a/docs/v11.1.0/adding-a-task/index.html +++ b/docs/v11.1.0/adding-a-task/index.html @@ -5,13 +5,13 @@ Contributing a Task | Cumulus Documentation - +
    Version: v11.1.0

    Contributing a Task

    We're tracking reusable Cumulus tasks in this list and, if you've got one you'd like to share with others, you can add it!

    Right now we're focused on tasks distributed via npm, but are open to including others. For now the script that pulls all the data for each package only supports npm.

    The tasks.md file is generated in the build process

    The tasks list in docs/tasks.md is generated from the list of task package names from the tasks folder.

    Do not edit the docs/tasks.md file directly.

    - + \ No newline at end of file diff --git a/docs/v11.1.0/api/index.html b/docs/v11.1.0/api/index.html index 30b623534e4..a8b1208a68a 100644 --- a/docs/v11.1.0/api/index.html +++ b/docs/v11.1.0/api/index.html @@ -5,13 +5,13 @@ Cumulus API | Cumulus Documentation - + - + \ No newline at end of file diff --git a/docs/v11.1.0/architecture/index.html b/docs/v11.1.0/architecture/index.html index 837fe10bf78..a7f3fd764c6 100644 --- a/docs/v11.1.0/architecture/index.html +++ b/docs/v11.1.0/architecture/index.html @@ -5,14 +5,14 @@ Architecture | Cumulus Documentation - +
    Version: v11.1.0

    Architecture

    Architecture

    Below, find a diagram with the components that comprise an instance of Cumulus.

    Architecture diagram of a Cumulus deployment

    This diagram details all of the major architectural components of a Cumulus deployment.

    While the diagram can feel complex, it can easily be broken down into several major components:

    Data Distribution

    End Users can access data via Cumulus's distribution submodule, which includes ASF's thin egress application; this provides authenticated data egress, temporary S3 links, and other statistics features.

    End user exposure of Cumulus's holdings is expected to be provided by an external service.

    For NASA use, this is assumed to be CMR in this diagram.

    Data ingest

    Workflows

    The core of the ingest and processing capabilities in Cumulus is built into the deployed AWS Step Function workflows. Cumulus rules trigger workflows via CloudWatch rules, Kinesis streams, SNS topics, or SQS queues. The workflows then run with a configured Cumulus message, utilizing built-in processes to report the status of granules, PDRs, executions, etc. to the Data Persistence components.

    Workflows can optionally report granule metadata to CMR, and workflow steps can report metrics information to a shared SNS topic, which could be subscribed to for near real time granule, execution, and PDR status. This could be used for metrics reporting using an external ELK stack, for example.

    Data persistence

    Cumulus entity state data is stored in a set of PostgreSQL-compatible databases and is exported to an Elasticsearch instance for non-authoritative querying/state data for the API and other applications that require more complex queries. Currently the entity state data is replicated in DynamoDB; this will be removed in a future release.

    Data discovery

    Discovering data for ingest is handled via workflow step components using Cumulus provider and collection configurations and various triggers. Data can be ingested from AWS S3, FTP, HTTPS and more.

    Database

    Cumulus utilizes a user-provided PostgreSQL database backend. For improved API search query efficiency Cumulus provides data replication to an Elasticsearch instance. For legacy reasons, Cumulus is currently also deploying a DynamoDB datastore, and writes are replicated in parallel with the PostgreSQL database writes. The DynamoDB replicated tables and parallel writes will be removed in future releases.

    PostgreSQL Database Schema Diagram

    ERD of the Cumulus Database

    Maintenance

    System maintenance personnel have access to manage ingest and various portions of Cumulus via an AWS API gateway, as well as the operator dashboard.

    Deployment Structure

    Cumulus is deployed via Terraform and is organized internally into two separate top-level modules, as well as several external modules.

    Cumulus

    The Cumulus module, which contains multiple internal submodules, deploys all of the Cumulus components that are not part of the Data Persistence portion of this diagram.

    Data persistence

    The data persistence module provides the Data Persistence portion of the diagram.

    Other modules

    Other modules are provided as artifacts on the release page for users configuring their own deployments; they contain extracted subcomponents of the cumulus module. For more on these components see the components documentation.

    For more on the specific structure, examples of use, and how to deploy, please see the deployment docs as well as the cumulus-template-deploy repo.

    - + \ No newline at end of file diff --git a/docs/v11.1.0/configuration/cloudwatch-retention/index.html b/docs/v11.1.0/configuration/cloudwatch-retention/index.html index 3bbfc78f508..7a1064f6edd 100644 --- a/docs/v11.1.0/configuration/cloudwatch-retention/index.html +++ b/docs/v11.1.0/configuration/cloudwatch-retention/index.html @@ -5,13 +5,13 @@ Cloudwatch Retention | Cumulus Documentation - +
    Version: v11.1.0

    Cloudwatch Retention

    Our lambdas dump logs to AWS CloudWatch. By default, these logs exist indefinitely. However, there are ways to specify a duration for log retention.

    aws-cli

    In addition to getting your aws-cli set-up, there are two values you'll need to acquire.

    1. log-group-name: the name of the log group whose retention policy (retention time) you'd like to change. We'll use /aws/lambda/KinesisInboundLogger in our examples.
    2. retention-in-days: the number of days you'd like to retain the logs in the specified log group for. There is a list of possible values available in the aws logs documentation.

    For example, if we wanted to set log retention to 30 days on our KinesisInboundLogger lambda, we would write:

    aws logs put-retention-policy --log-group-name "/aws/lambda/KinesisInboundLogger" --retention-in-days 30

    Note: The aws-cli log command that we're using is explained in detail here.

    AWS Management Console

    Changing the log retention policy in the AWS Management Console is a fairly simple process:

    1. Navigate to the CloudWatch service in the AWS Management Console.
    2. Click on the Logs entry on the sidebar.
    3. Find the Log Group whose retention policy you're interested in changing.
    4. Click on the value in the Expire Events After column.
    5. Enter/Select the number of days you'd like to retain logs in that log group for.

    Screenshot of AWS console showing how to configure the retention period for Cloudwatch logs

    - + \ No newline at end of file diff --git a/docs/v11.1.0/configuration/collection-storage-best-practices/index.html b/docs/v11.1.0/configuration/collection-storage-best-practices/index.html index 26a7afc018e..95d2e7aa067 100644 --- a/docs/v11.1.0/configuration/collection-storage-best-practices/index.html +++ b/docs/v11.1.0/configuration/collection-storage-best-practices/index.html @@ -5,13 +5,13 @@ Collection Cost Tracking and Storage Best Practices | Cumulus Documentation - +
    Version: v11.1.0

    Collection Cost Tracking and Storage Best Practices

    Organizing your data is important for metrics you may want to collect. AWS S3 storage and cost metrics are calculated at the bucket level, so it is easy to get metrics by bucket. You can get storage metrics at the key prefix level, but that is done through the CLI, which can be very slow for large buckets. It is very difficult to estimate costs at the prefix level.

    Calculating Storage By Collection

    By bucket

    Usage by bucket can be obtained in your AWS Billing Dashboard via an S3 Usage Report. You can download your usage report for a period of time and review your storage and requests at the bucket level.

    Bucket metrics can also be found in the AWS CloudWatch Metrics Console (also see Using Amazon CloudWatch Metrics).

    Navigate to Storage Metrics and select the BucketName for all buckets you are interested in. The available metrics are BucketSizeInBytes and NumberOfObjects.

    In the Graphed metrics tab, you can select the type of statistic (i.e. average, minimum, maximum) and the period for the stats. At the top, it's useful to select from the dropdown to view the metrics as a number. You can also select the time period for which you want to see stats.

    Alternatively you can query CloudWatch using the CLI.

    This command will return the average number of bytes in the bucket test-bucket for 7/31/2019:

    aws cloudwatch get-metric-statistics --namespace AWS/S3 --start-time 2019-07-31T00:00:00 --end-time 2019-08-01T00:00:00 --period 86400 --statistics Average --region us-east-1 --metric-name BucketSizeBytes --dimensions Name=BucketName,Value=test-bucket Name=StorageType,Value=StandardStorage

    The result looks like:

    {
      "Datapoints": [
        {
          "Timestamp": "2019-07-31T00:00:00Z",
          "Average": 150996467959.0,
          "Unit": "Bytes"
        }
      ],
      "Label": "BucketSizeBytes"
    }

    By key prefix

    AWS does not offer storage and usage statistics at a key prefix level. Via the AWS CLI, you can get the total storage for a bucket or folder. The following command would get the storage for folder example-folder in bucket sample-bucket:

    aws s3 ls --summarize --human-readable --recursive s3://sample-bucket/example-folder | grep 'Total'

    Note that this can be a long-running operation for large buckets.

    Calculating Cost By Collection

    NASA NGAP Environment

    If using an NGAP account, the cost per bucket can be found in your CloudTamer console, in the Financials section of your account information. This is calculated on a monthly basis.

    There is no easy way to get the cost by folder in the buckets. You could calculate an estimate using the storage per prefix vs. the storage of the bucket.

    Outside of NGAP

    You can enable S3 Cost Allocation Tags and tag your buckets. From there, you can view the cost breakdown in your AWS Billing Dashboard via the Cost Explorer. Cost Allocation Tagging is available at the bucket level.

    There is no easy way to get the cost by folder in the buckets. You could calculate an estimate using the storage per prefix vs. the storage of the bucket.

    Storage Configuration

    Cumulus allows for the configuration of many buckets for your files. Buckets are created and added to your deployment as part of the deployment process.

    In your Cumulus collection configuration, you specify where you want the files to be stored post-processing. This is done by matching a regular expression on the file with the configured bucket.

    Note that in the collection configuration, the bucket field is the key to the buckets variable in the deployment's .tfvars file.

    Organizing By Bucket

    You can specify separate groups of buckets for each collection, which could look like the example below.

    {
      "name": "MOD09GQ",
      "version": "006",
      "granuleId": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$",
      "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
      "files": [
        {
          "bucket": "MOD09GQ-006-protected",
          "regex": "^.*\\.hdf$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf"
        },
        {
          "bucket": "MOD09GQ-006-private",
          "regex": "^.*\\.hdf\\.met$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.met"
        },
        {
          "bucket": "MOD09GQ-006-protected",
          "regex": "^.*\\.cmr\\.xml$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.cmr.xml"
        },
        {
          "bucket": "MOD09GQ-006-public",
          "regex": "^.*\\.jpg$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_ndvi.jpg"
        }
      ]
    }

    Additional collections would go to different buckets.

    Organizing by Key Prefix

    Different collections can be organized into different folders in the same bucket, using the key prefix, which is specified as the url_path in the collection configuration. In this simplified collection configuration example, the url_path field is set at the top level so that all files go to a path prefixed with the collection name and version.

    {
      "name": "MOD09GQ",
      "version": "006",
      "granuleId": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$",
      "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}",
      "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
      "files": [
        {
          "bucket": "protected",
          "regex": "^.*\\.hdf$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf"
        },
        {
          "bucket": "private",
          "regex": "^.*\\.hdf\\.met$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.met"
        },
        {
          "bucket": "protected",
          "regex": "^.*\\.cmr\\.xml$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.cmr.xml"
        },
        {
          "bucket": "public",
          "regex": "^.*\\.jpg$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_ndvi.jpg"
        }
      ]
    }

    In this case, the path to all the files would be: MOD09GQ___006/<filename> in their respective buckets.

    The url_path can be overridden directly on the file configuration. The example below produces the same result.

    {
      "name": "MOD09GQ",
      "version": "006",
      "granuleId": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$",
      "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
      "files": [
        {
          "bucket": "protected",
          "regex": "^.*\\.hdf$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
          "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
        },
        {
          "bucket": "private",
          "regex": "^.*\\.hdf\\.met$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.met",
          "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
        },
        {
          "bucket": "protected-2",
          "regex": "^.*\\.cmr\\.xml$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.cmr.xml",
          "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
        },
        {
          "bucket": "public",
          "regex": "^.*\\.jpg$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_ndvi.jpg",
          "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
        }
      ]
    }
    - + \ No newline at end of file diff --git a/docs/v11.1.0/configuration/data-management-types/index.html b/docs/v11.1.0/configuration/data-management-types/index.html index 7e2a4bcc76f..ff9dcf2994a 100644 --- a/docs/v11.1.0/configuration/data-management-types/index.html +++ b/docs/v11.1.0/configuration/data-management-types/index.html @@ -5,13 +5,13 @@ Cumulus Data Management Types | Cumulus Documentation - +
    Version: v11.1.0

    Cumulus Data Management Types

    What Are The Cumulus Data Management Types

    • Collections: Collections are logical sets of data objects of the same data type and version. They provide contextual information used by Cumulus ingest.
    • Granules: Granules are the smallest aggregation of data that can be independently managed. They are always associated with a collection, which is a grouping of granules.
    • Providers: Providers generate and distribute input data that Cumulus obtains and sends to workflows.
    • Rules: Rules tell Cumulus how to associate providers and collections and when/how to start processing a workflow.
    • Workflows: Workflows are composed of one or more AWS Lambda Functions and ECS Activities to discover, ingest, process, manage, and archive data.
    • Executions: Executions are records of a workflow.
    • Reconciliation Reports: Reports are a comparison of data sets to check to see if they are in agreement and to help Cumulus users detect conflicts.

    Interaction

    • Providers tell Cumulus where to get new data - i.e. S3, HTTPS
    • Collections tell Cumulus where to store the data files
    • Rules tell Cumulus when to trigger a workflow execution and tie providers and collections together

    Managing Data Management Types

    The following are created via the dashboard or API:

    • Providers
    • Collections
    • Rules
    • Reconciliation reports

    Granules are created by workflow executions and then can be managed via the dashboard or API.

    An execution record is created for each workflow execution triggered and can be viewed in the dashboard or data can be retrieved via the API.

    Workflows are created and managed via the Cumulus deployment.

    Configuration Fields

    Schemas

    Looking at our API schema definitions can provide us with some insight into collections, providers, rules, and their attributes (and whether those are required or not). The schemas for these different concepts will be referenced throughout this document.

    The schemas are extremely useful for understanding which attributes are configurable and which of those are required. Cumulus uses these schemas for validation.

    Providers

    Please note:

    • While connection configuration is defined here, settings that are specific to a particular ingest setup (e.g. 'What target directory should we be pulling from?' or 'How is duplicate handling configured?') are generally defined in a Rule or Collection, not the Provider.
    • There is some provider behavior which is controlled by task-specific configuration and not the provider definition. This configuration has to be set on a per-workflow basis. For example, see the httpListTimeout configuration on the discover-granules task

    Provider Configuration

    The Provider configuration is defined by a JSON object that takes different configuration keys depending on the provider type. The following are definitions of typical configuration values relevant for the various providers:

    Configuration by provider type
    S3
    Key | Type | Required | Description
    id | string | Yes | Unique identifier for the provider
    globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited
    protocol | string | Yes | The protocol for this provider. Must be s3 for this provider type.
    host | string | Yes | S3 Bucket to pull data from
    http
    Key | Type | Required | Description
    id | string | Yes | Unique identifier for the provider
    globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited
    protocol | string | Yes | The protocol for this provider. Must be http for this provider type
    host | string | Yes | The host to pull data from (e.g. nasa.gov)
    username | string | No | Configured username for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication
    password | string | Only if username is specified | Configured password for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication
    port | integer | No | Port to connect to the provider on. Defaults to 80
    allowedRedirects | string[] | No | Only hosts in this list will have the provider username/password forwarded for authentication. Entries should be specified as host.com or host.com:7000 if redirect port is different than the provider port.
    certificateUri | string | No | SSL Certificate S3 URI for custom or self-signed SSL (TLS) certificate
    https
    Key | Type | Required | Description
    id | string | Yes | Unique identifier for the provider
    globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited
    protocol | string | Yes | The protocol for this provider. Must be https for this provider type
    host | string | Yes | The host to pull data from (e.g. nasa.gov)
    username | string | No | Configured username for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication
    password | string | Only if username is specified | Configured password for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication
    port | integer | No | Port to connect to the provider on. Defaults to 443
    allowedRedirects | string[] | No | Only hosts in this list will have the provider username/password forwarded for authentication. Entries should be specified as host.com or host.com:7000 if redirect port is different than the provider port.
    certificateUri | string | No | SSL Certificate S3 URI for custom or self-signed SSL (TLS) certificate
    ftp
    Key | Type | Required | Description
    id | string | Yes | Unique identifier for the provider
    globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited
    protocol | string | Yes | The protocol for this provider. Must be ftp for this provider type
    host | string | Yes | The ftp host to pull data from (e.g. nasa.gov)
    username | string | No | Username to use to connect to the ftp server. Cumulus encrypts this using KMS. Defaults to anonymous if not defined
    password | string | No | Password to use to connect to the ftp server. Cumulus encrypts this using KMS. Defaults to password if not defined
    port | integer | No | Port to connect to the provider on. Defaults to 21
    sftp
    Key | Type | Required | Description
    id | string | Yes | Unique identifier for the provider
    globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited
    protocol | string | Yes | The protocol for this provider. Must be sftp for this provider type
    host | string | Yes | The ftp host to pull data from (e.g. nasa.gov)
    username | string | No | Username to use to connect to the sftp server.
    password | string | No | Password to use to connect to the sftp server.
    port | integer | No | Port to connect to the provider on. Defaults to 22
    privateKey | string | No | filename assumed to be in s3://bucketInternal/stackName/crypto
    cmKeyId | string | No | AWS KMS Customer Master Key arn or alias
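    As a hedged sketch built only from the keys in the S3 table above (the id and host values are placeholders, not a real provider), an S3 provider definition submitted via the dashboard or API might look like:

    {
      "id": "MY_DAAC_S3_PROVIDER",
      "protocol": "s3",
      "host": "my-daac-staging-bucket",
      "globalConnectionLimit": 10
    }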

    Collections

    Break down of s3_MOD09GQ_006.json (https://github.com/nasa/cumulus/blob/master/example/data/collections/s3_MOD09GQ_006/s3_MOD09GQ_006.json)
    Key | Value | Required | Description
    name | "MOD09GQ" | Yes | The name attribute designates the name of the collection. This is the name under which the collection will be displayed on the dashboard
    version | "006" | Yes | A version tag for the collection
    granuleId | "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$" | Yes | The regular expression used to validate the granule ID extracted from filenames according to the granuleIdExtraction
    granuleIdExtraction | "(MOD09GQ\..*)(\.hdf|\.cmr|_ndvi\.jpg)" | Yes | The regular expression used to extract the granule ID from filenames. The first capturing group extracted from the filename by the regex will be used as the granule ID.
    sampleFileName | "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf" | Yes | An example filename belonging to this collection
    files | <JSON Object> of files defined here | Yes | Describe the individual files that will exist for each granule in this collection (size, browse, meta, etc.)
    dataType | "MOD09GQ" | No | Can be specified, but this value will default to the collection_name if not
    duplicateHandling | "replace" | No | ("replace"|"version"|"skip") determines granule duplicate handling scheme
    ignoreFilesConfigForDiscovery | false (default) | No | By default, during discovery only files that match one of the regular expressions in this collection's files attribute (see above) are ingested. Setting this to true will ignore the files attribute during discovery, meaning that all files for a granule (i.e., all files with filenames matching granuleIdExtraction) will be ingested even when they don't match a regular expression in the files attribute at discovery time. (NOTE: this attribute does not appear in the example file, but is listed here for completeness.)
    process | "modis" | No | Example options for this are found in the ChooseProcess step definition in the IngestAndPublish workflow definition
    meta | <JSON Object> of MetaData for the collection | No | MetaData for the collection. This metadata will be available to workflows for this collection via the Cumulus Message Adapter.
    url_path | "{cmrMetadata.Granule.Collection.ShortName}/{substring(file.fileName, 0, 3)}" | No | Filename without extension

    files-object

    Key | Value | Required | Description
    regex | "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\.hdf$" | Yes | Regular expression used to identify the file
    sampleFileName | "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf" | Yes | Filename used to validate the provided regex
    type | "data" | No | Value to be assigned to the Granule File Type. CNM types are used by Cumulus CMR steps, non-CNM values will be treated as 'data' type. Currently only utilized in DiscoverGranules task
    bucket | "internal" | Yes | Name of the bucket where the file will be stored
    url_path | "${collectionShortName}/{substring(file.fileName, 0, 3)}" | No | Folder used to save the granule in the bucket. Defaults to the collection url_path
    checksumFor | "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\.hdf$" | No | If this is a checksum file, set checksumFor to the regex of the target file.
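    As a hedged sketch using only the keys described in the files-object table above (the bucket and url_path values are illustrative), a single entry in a collection's files list might look like:

    {
      "regex": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\.hdf$",
      "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
      "type": "data",
      "bucket": "internal",
      "url_path": "{cmrMetadata.Granule.Collection.ShortName}/{substring(file.fileName, 0, 3)}"
    }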

    Rules

    Rules are used to start processing workflows and the transformation process. Rules can be invoked manually, based on a schedule, or can be configured to be triggered by either events in Kinesis, SNS messages, or SQS messages.

    Rule configuration
    Key | Value | Required | Description
    name | "L2_HR_PIXC_kinesisRule" | Yes | Name of the rule. This is the name under which the rule will be listed on the dashboard
    workflow | "CNMExampleWorkflow" | Yes | Name of the workflow to be run. A list of available workflows can be found on the Workflows page
    provider | "PODAAC_SWOT" | No | Configured provider's ID. This can be found on the Providers dashboard page
    collection | <JSON Object> collection object shown below | Yes | Name and version of the collection this rule will moderate. Relates to a collection configured and found in the Collections page
    payload | <JSON Object or Array> | No | The payload to be passed to the workflow
    meta | <JSON Object> of MetaData for the rule | No | MetaData for the rule. This metadata will be available to workflows for this rule via the Cumulus Message Adapter.
    rule | <JSON Object> rule type and associated values - discussed below | Yes | Object defining the type and subsequent attributes of the rule
    state | "ENABLED" | No | ("ENABLED"|"DISABLED") whether or not the rule will be active. Defaults to "ENABLED".
    queueUrl | https://sqs.us-east-1.amazonaws.com/1234567890/queue-name | No | URL for SQS queue that will be used to schedule workflows for this rule
    tags | ["kinesis", "podaac"] | No | An array of strings that can be used to simplify search

    collection-object

    Key | Value | Required | Description
    name | "L2_HR_PIXC" | Yes | Name of a collection defined/configured in the Collections dashboard page
    version | "000" | Yes | Version number of a collection defined/configured in the Collections dashboard page

    meta-object

    Key | Value | Required | Description
    retries | 3 | No | Number of retries on errors, for sqs-type rule only. Defaults to 3.
    visibilityTimeout | 900 | No | VisibilityTimeout in seconds for the inflight messages, for sqs-type rule only. Defaults to the visibility timeout of the SQS queue when the rule is created.

    rule-object

    Key | Value | Required | Description
    type | "kinesis" | Yes | ("onetime"|"scheduled"|"kinesis"|"sns"|"sqs") type of scheduling/workflow kick-off desired
    value | <String> Object | Depends | Discussion of valid values is below

    rule-value

    The rule's value entry depends on the rule type (a sketch of a complete rule follows the list below):

    • If this is a onetime rule this can be left blank. Example
    • If this is a scheduled rule this field must hold a valid cron-type expression or rate expression.
    • If this is a kinesis rule, this must be a configured ${Kinesis_stream_ARN}. Example
    • If this is an sns rule, this must be an existing ${SNS_Topic_Arn}. Example
    • If this is an sqs rule, this must be an existing ${SQS_QueueUrl} that your account has permissions to access, and also you must configure a dead-letter queue for this SQS queue. Example
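    As a hedged sketch assembled from the rule configuration tables above (the stream ARN is a placeholder, and the workflow, provider, and collection values are the same examples used in the tables), a kinesis rule might look like:

    {
      "name": "L2_HR_PIXC_kinesisRule",
      "workflow": "CNMExampleWorkflow",
      "provider": "PODAAC_SWOT",
      "collection": {
        "name": "L2_HR_PIXC",
        "version": "000"
      },
      "rule": {
        "type": "kinesis",
        "value": "arn:aws:kinesis:us-east-1:111122223333:stream/example-stream"
      },
      "state": "ENABLED"
    }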

    sqs-type rule features

    • When an SQS rule is triggered, the SQS message remains on the queue.
    • The SQS message is not processed multiple times in parallel when visibility timeout is properly set. You should set the visibility timeout to the maximum expected length of the workflow with padding. Longer is better to avoid parallel processing.
    • The SQS message visibility timeout can be overridden by the rule.
    • Upon successful workflow execution, the SQS message is removed from the queue.
    • Upon failed execution(s), the workflow is run 3 times by default, or the configured number of times (see the sketch after this list).
    • Upon failed execution(s), the visibility timeout will be set to 5s to allow retries.
    • After configured number of failed retries, the SQS message is moved to the dead-letter queue configured for the SQS queue.
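    As a hedged sketch combining the rule and meta-object tables above (the queue URL is the placeholder from the table, the other names are illustrative, and a dead-letter queue is assumed to be configured on the source queue), an sqs rule that overrides retries and visibility timeout might look like:

    {
      "name": "example_sqs_rule",
      "workflow": "CNMExampleWorkflow",
      "collection": {
        "name": "L2_HR_PIXC",
        "version": "000"
      },
      "meta": {
        "retries": 1,
        "visibilityTimeout": 1800
      },
      "rule": {
        "type": "sqs",
        "value": "https://sqs.us-east-1.amazonaws.com/1234567890/queue-name"
      },
      "state": "ENABLED"
    }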

    Configuration Via Cumulus Dashboard

    Create A Provider

    • In the Cumulus dashboard, go to the Provider page.

    Screenshot of Create Provider form

    • Click on Add Provider.
    • Fill in the form and then submit it.

    Screenshot of Create Provider form

    Create A Collection

    • Go to the Collections page.

    Screenshot of the Collections page

    • Click on Add Collection.
    • Copy and paste or fill in the collection JSON object form.

    Screenshot of Add Collection form

    • Once you submit the form, you should be able to verify that your new collection is in the list.

    Create A Rule

    1. Go To Rules Page
    • Go to the Cumulus dashboard, click on Rules in the navigation.
    • Click Add Rule.

    Screenshot of Rules page

    1. Complete Form
    • Fill out the template form.

    Screenshot of a Rules template for adding a new rule

    For more details regarding the field definitions and required information go to Data Cookbooks.

    Note: If the state field is left blank, it defaults to false.

    Rule Examples

    • A rule form with completed required fields:

    Screenshot of a completed rule form

    • A successfully added Rule:

    Screenshot of created rule

    - + \ No newline at end of file diff --git a/docs/v11.1.0/configuration/lifecycle-policies/index.html b/docs/v11.1.0/configuration/lifecycle-policies/index.html index e6f3cf9bf2e..dc62f463e9a 100644 --- a/docs/v11.1.0/configuration/lifecycle-policies/index.html +++ b/docs/v11.1.0/configuration/lifecycle-policies/index.html @@ -5,13 +5,13 @@ Setting S3 Lifecycle Policies | Cumulus Documentation - +
    Version: v11.1.0

    Setting S3 Lifecycle Policies

    This document will outline, in brief, how to set data lifecycle policies so that you are more easily able to control data storage costs while keeping your data accessible. For more information on why you might want to do this, see the 'Additional Information' section at the end of the document.

    Requirements

    • The AWS CLI installed and configured (if you wish to run the CLI example). See AWS's guide to setting up the AWS CLI for more on this. Please ensure the AWS CLI is in your shell path.
    • You will need an S3 bucket on AWS. You are strongly encouraged to use a bucket without voluminous amounts of data in it for experimenting/learning.
    • An AWS user with the appropriate roles to access the target bucket as well as modify bucket policies.

    Examples

    Walk-through on setting time-based S3 Infrequent Access (S3IA) bucket policy

    This example will give step-by-step instructions on updating a bucket's lifecycle policy to move all objects in the bucket from the default storage to S3 Infrequent Access (S3IA) after a period of 90 days. Below are instructions for walking through configuration via the command line and the management console.

    Command Line

    Please ensure you have the AWS CLI installed and configured for access prior to attempting this example.

    Create policy

    From any directory you choose, open an editor and add the following to a file named exampleRule.json:

    {
      "Rules": [
        {
          "Status": "Enabled",
          "Filter": {
            "Prefix": ""
          },
          "Transitions": [
            {
              "Days": 90,
              "StorageClass": "STANDARD_IA"
            }
          ],
          "NoncurrentVersionTransitions": [
            {
              "NoncurrentDays": 90,
              "StorageClass": "STANDARD_IA"
            }
          ],
          "ID": "90DayS3IAExample"
        }
      ]
    }

    Set policy

    On the command line run the following command (with the bucket you're working with substituted in place of yourBucketNameHere).

    aws s3api put-bucket-lifecycle-configuration --bucket yourBucketNameHere --lifecycle-configuration file://exampleRule.json

    Verify policy has been set

    To obtain all of the existing policies for a bucket, run the following command (again substituting the correct bucket name):

     $ aws s3api get-bucket-lifecycle-configuration --bucket yourBucketNameHere
    {
      "Rules": [
        {
          "Status": "Enabled",
          "Filter": {
            "Prefix": ""
          },
          "Transitions": [
            {
              "Days": 90,
              "StorageClass": "STANDARD_IA"
            }
          ],
          "NoncurrentVersionTransitions": [
            {
              "NoncurrentDays": 90,
              "StorageClass": "STANDARD_IA"
            }
          ],
          "ID": "90DayS3IAExample"
        }
      ]
    }

    You have set a policy that transitions any version of an object in the bucket to S3IA after each object version has not been modified for 90 days.

    Management Console

    Create Policy

    To create the example policy on a bucket via the management console, go to the following URL (replacing 'yourBucketHere' with the bucket you intend to update):

    https://s3.console.aws.amazon.com/s3/buckets/yourBucketHere/?tab=overview

    You should see a screen similar to:

    Screenshot of AWS console for an S3 bucket

    Click the "Management" Tab, then lifecycle button and press + Add lifecycle rule:

    Screenshot of &quot;Management&quot; tab of AWS console for an S3 bucket

    Give the rule a name (e.g. '90DayRule'), leaving the filter blank:

    Screenshot of window for configuring the name and scope of a lifecycle rule on an S3 bucket in the AWS console

    Click next, and mark Current Version and Previous Versions.

    Then for each, click + Add transition and select Transition to Standard-IA after for the Object creation field, and set 90 for the Days after creation/Days after objects become noncurrent field. Your screen should look similar to:

    Screenshot of window for configuring the storage class transitions of a lifecycle rule on an S3 bucket in the AWS console

    Click next, then next past the Configure expiration screen (we won't be setting this), and on the fourth page, click Save:

    Screenshot of window for reviewing the configuration of a lifecycle rule on an S3 bucket in the AWS console

    You should now see you have a rule configured for your bucket:

    Screenshot of lifecycle rule appearing in the &quot;Management&quot; tab of AWS console for an S3 bucket

    You have now set a policy that transitions any version of an object in the bucket to S3IA after each object has not been modified for 90 days.

    Additional Information

    This section lists information you may want prior to enacting lifecycle policies. It is not required content for working through the examples.

    Strategy Overview

    For a discussion of overall recommended strategy, please review the Methodology for Data Lifecycle Management on the EarthData wiki.

    AWS Documentation

    The examples shown in this document are obviously fairly basic cases. By using object tags, filters and other configuration options you can enact far more complicated policies for various scenarios. For more reading on the topics presented on this page see:

    - + \ No newline at end of file diff --git a/docs/v11.1.0/configuration/monitoring-readme/index.html b/docs/v11.1.0/configuration/monitoring-readme/index.html index 946d722fb07..7362d10c414 100644 --- a/docs/v11.1.0/configuration/monitoring-readme/index.html +++ b/docs/v11.1.0/configuration/monitoring-readme/index.html @@ -5,14 +5,14 @@ Monitoring Best Practices | Cumulus Documentation - +
    Version: v11.1.0

    Monitoring Best Practices

    This document intends to provide a set of recommendations and best practices for monitoring the state of a deployed Cumulus and diagnosing any issues.

    Cumulus-provided resources and integrations for monitoring

    Cumulus provides a number of resources that are useful for monitoring the system and its operation.

    Cumulus Dashboard

    The primary tool for monitoring the Cumulus system is the Cumulus Dashboard. The dashboard is hosted on Github and includes instructions on how to deploy and link it into your core Cumulus deployment.

    The dashboard displays workflow executions, their status, inputs, outputs, and some diagnostic information such as logs. For further information on the dashboard, its usage, and the information it provides, see the documentation.

    Cumulus-provided AWS resources

    Cumulus sets up CloudWatch log groups for all Core-provided tasks.

    Monitoring Lambda Functions

    Logging for each Lambda Function is available in Lambda-specific CloudWatch log groups.

    Monitoring ECS services

    Each deployed cumulus_ecs_service module also includes a CloudWatch log group for the processes running on ECS.

    Monitoring workflows

    For advanced debugging, we also configure dead letter queues on critical system functions. These will allow you to monitor and debug invalid inputs to the functions we use to start workflows, which can be helpful if you find that you are not seeing workflows being started as expected. More information on these can be found in the dead letter queue documentation

    AWS recommendations

    AWS has a number of recommendations on system monitoring. Rather than reproduce those here and risk providing outdated guidance, we've documented the following links which will take you to available AWS docs on monitoring recommendations and best practices for the services used in Cumulus:

    Example: Setting up email notifications for CloudWatch logs

    Cumulus does not provide out-of-the-box support for email notifications at this time. However, setting up email notifications on AWS is fairly straightforward in that the operative components are an AWS SNS topic and a subscribed email address.

    In terms of Cumulus integration, forwarding CloudWatch logs requires creating a mechanism, most likely a Lambda Function subscribed to the log group that will receive, filter and forward these messages to the SNS topic.

    As a very simple example, we could create a function that filters CloudWatch logs created by the @cumulus/logger package and sends email notifications for error and fatal log levels, adapting the example linked above:

    const zlib = require('zlib');
    const aws = require('aws-sdk');
    const { promisify } = require('util');

    const gunzip = promisify(zlib.gunzip);
    const sns = new aws.SNS();

    exports.handler = async (event) => {
      const payload = Buffer.from(event.awslogs.data, 'base64');
      const decompressedData = await gunzip(payload);
      const logData = JSON.parse(decompressedData.toString('ascii'));
      return await Promise.all(logData.logEvents.map(async (logEvent) => {
        const logMessage = JSON.parse(logEvent.message);
        if (['error', 'fatal'].includes(logMessage.level)) {
          return sns.publish({
            TopicArn: process.env.EmailReportingTopicArn,
            Message: logEvent.message
          }).promise();
        }
        return Promise.resolve();
      }));
    };

    After creating the SNS topic, we can deploy this code as a lambda function, following the setup steps from Amazon. Make sure to include your SNS topic ARN as an environment variable on the lambda function by using the --environment option on aws lambda create-function.

    You will need to create subscription filters for each log group you want to receive emails for. We recommend automating this as much as possible, and you could very well handle this via Terraform, such as using a module to deploy filters alongside log groups, or exporting the log group names to an all-in-one email notification module.

    - + \ No newline at end of file diff --git a/docs/v11.1.0/configuration/server_access_logging/index.html b/docs/v11.1.0/configuration/server_access_logging/index.html index 86de7f9a346..bef51a3a975 100644 --- a/docs/v11.1.0/configuration/server_access_logging/index.html +++ b/docs/v11.1.0/configuration/server_access_logging/index.html @@ -5,13 +5,13 @@ S3 Server Access Logging | Cumulus Documentation - +
    Version: v11.1.0

    S3 Server Access Logging

    Via AWS Console

    Enable server access logging for an S3 bucket

    Via AWS Command Line Interface

    1. Create a logging.json file with these contents, replacing <stack-internal-bucket> with your stack's internal bucket name, and <stack> with the name of your cumulus stack.

      {
        "LoggingEnabled": {
          "TargetBucket": "<stack-internal-bucket>",
          "TargetPrefix": "<stack>/ems-distribution/s3-server-access-logs/"
        }
      }
    2. Add the logging policy to each of your protected and public buckets by calling this command on each bucket.

      aws s3api put-bucket-logging --bucket <protected/public-bucket-name> --bucket-logging-status file://logging.json
    3. Verify the logging policy exists on your buckets.

      aws s3api get-bucket-logging --bucket <protected/public-bucket-name>
    - + \ No newline at end of file diff --git a/docs/v11.1.0/configuration/task-configuration/index.html b/docs/v11.1.0/configuration/task-configuration/index.html index 187f338ffde..3df62bea9cc 100644 --- a/docs/v11.1.0/configuration/task-configuration/index.html +++ b/docs/v11.1.0/configuration/task-configuration/index.html @@ -5,13 +5,13 @@ Configuration of Tasks | Cumulus Documentation - +
    Version: v11.1.0

    Configuration of Tasks

    The cumulus module exposes values for configuration for some of the provided archive and ingest tasks. Currently the following are available as configurable variables:

    cmr_search_client_config

    Configuration parameters for CMR search client for cumulus archive module tasks in the form:

    <lambda_identifier>_report_cmr_limit = <maximum number of records that can be returned from a cmr-client search; this should be greater than cmr_page_size>
    <lambda_identifier>_report_cmr_page_size = <number of records for each page returned from CMR>
    type = map(string)

    More information about cmr limit and cmr page_size can be found from @cumulus/cmr-client and CMR Search API document.

    Currently the following values are supported:

    • create_reconciliation_report_cmr_limit
    • create_reconciliation_report_cmr_page_size

    Example

    cmr_search_client_config = {
      create_reconciliation_report_cmr_limit     = 2500
      create_reconciliation_report_cmr_page_size = 250
    }

    elasticsearch_client_config

    Configuration parameters for Elasticsearch client for cumulus archive module tasks in the form:

    <lambda_identifier>_es_scroll_duration = <duration>
    <lambda_identifier>_es_scroll_size = <size>
    type = map(string)

    Currently the following values are supported:

    • create_reconciliation_report_es_scroll_duration
    • create_reconciliation_report_es_scroll_size

    Example

    elasticsearch_client_config = {
      create_reconciliation_report_es_scroll_duration = "15m"
      create_reconciliation_report_es_scroll_size     = 2000
    }

    lambda_timeouts

    A configurable map of timeouts (in seconds) for cumulus ingest module task lambdas in the form:

    <lambda_identifier>_timeout: <timeout>
    type = map(string)

    Currently the following values are supported:

    • discover_granules_task_timeout
    • discover_pdrs_task_timeout
    • hyrax_metadata_update_tasks_timeout
    • lzards_backup_task_timeout
    • move_granules_task_timeout
    • parse_pdr_task_timeout
    • pdr_status_check_task_timeout
    • post_to_cmr_task_timeout
    • queue_granules_task_timeout
    • queue_pdrs_task_timeout
    • queue_workflow_task_timeout
    • sync_granule_task_timeout
    • update_granules_cmr_metadata_file_links_task_timeout

    Example

    lambda_timeouts = {
    discover_granules_task_timeout = 300
    }
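These configuration maps are all ordinary input variables on the cumulus module, so they can be set together wherever the module is instantiated. A hedged sketch (the module source and the rest of your existing configuration are omitted here and would come from your deployment):

module "cumulus" {
  # source and your other existing cumulus module arguments go here

  cmr_search_client_config = {
    create_reconciliation_report_cmr_limit     = 2500
    create_reconciliation_report_cmr_page_size = 250
  }

  elasticsearch_client_config = {
    create_reconciliation_report_es_scroll_duration = "15m"
    create_reconciliation_report_es_scroll_size     = 2000
  }

  lambda_timeouts = {
    discover_granules_task_timeout = 300
  }
}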
    - + \ No newline at end of file diff --git a/docs/v11.1.0/data-cookbooks/about-cookbooks/index.html b/docs/v11.1.0/data-cookbooks/about-cookbooks/index.html index ec1baa9f428..e93be6a1152 100644 --- a/docs/v11.1.0/data-cookbooks/about-cookbooks/index.html +++ b/docs/v11.1.0/data-cookbooks/about-cookbooks/index.html @@ -5,13 +5,13 @@ About Cookbooks | Cumulus Documentation - +
    Version: v11.1.0

    About Cookbooks

    Introduction

The data cookbooks are documents containing examples and explanations of workflows in the Cumulus framework. They should also serve to help unify an institution/user group on a set of terms.

    Setup

    The data cookbooks assume you can configure providers, collections, and rules to run workflows. Visit Cumulus data management types for information on how to configure Cumulus data management types.

    Adding a page

    As shown in detail in the "Add a New Page and Sidebars" section in Cumulus Docs: How To's, you can add a new page to the data cookbook by creating a markdown (.md) file in the docs/data-cookbooks directory. The new page can then be linked to the sidebar by adding it to the Data-Cookbooks object in the website/sidebar.json file as data-cookbooks/${id}.

    More about workflows

    Workflow general information

    Input & Output

    Developing Workflow Tasks

    Workflow Configuration How-to's

    - + \ No newline at end of file diff --git a/docs/v11.1.0/data-cookbooks/browse-generation/index.html b/docs/v11.1.0/data-cookbooks/browse-generation/index.html index 02e16397732..87d3e303fc6 100644 --- a/docs/v11.1.0/data-cookbooks/browse-generation/index.html +++ b/docs/v11.1.0/data-cookbooks/browse-generation/index.html @@ -5,7 +5,7 @@ Ingest Browse Generation | Cumulus Documentation - + @@ -15,7 +15,7 @@ provider keys with the previously entered values) Note that you need to set the "provider_path" to the path on your bucket (e.g. "/data") that you've staged your mock/test data.:

    {
    "name": "TestBrowseGeneration",
    "workflow": "DiscoverGranulesBrowseExample",
    "provider": "{{provider_from_previous_step}}",
    "collection": {
    "name": "MOD09GQ",
    "version": "006"
    },
    "meta": {
    "provider_path": "{{path_to_data}}"
    },
    "rule": {
    "type": "onetime"
    },
    "state": "ENABLED",
    "updatedAt": 1553053438767
    }

    Run Workflows

    Once you've configured the Collection and Provider and added a onetime rule, you're ready to trigger your rule, and watch the ingest workflows process.

    Go to the Rules tab, click the rule you just created:

    Screenshot of the Rules overview page with a list of rules in the Cumulus dashboard

    Then click the gear in the upper right corner and click "Rerun":

    Screenshot of clicking the button to rerun a workflow rule from the rule edit page in the Cumulus dashboard

    Tab over to executions and you should see the DiscoverGranulesBrowseExample workflow run, succeed, and then moments later the CookbookBrowseExample should run and succeed.

    Screenshot of page listing executions in the Cumulus dashboard

    Results

    You can verify your data has ingested by clicking the successful workflow entry:

    Screenshot of individual entry from table listing executions in the Cumulus dashboard

    Select "Show Output" on the next page

    Screenshot of &quot;Show output&quot; button from individual execution page in the Cumulus dashboard

    and you should see in the payload from the workflow something similar to:

    "payload": {
    "process": "modis",
    "granules": [
    {
    "files": [
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "key": "MOD09GQ___006/2017/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "type": "data",
    "bucket": "cumulus-test-sandbox-protected",
    "path": "data",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}/{substring(file.fileName, 0, 3)}",
    "size": 1908635
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "key": "MOD09GQ___006/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "type": "metadata",
    "bucket": "cumulus-test-sandbox-private",
    "path": "data",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{substring(file.fileName, 0, 3)}",
    "size": 21708
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
    "key": "MOD09GQ___006/2017/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
    "type": "browse",
    "bucket": "cumulus-test-sandbox-protected",
    "path": "data",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}/{substring(file.fileName, 0, 3)}",
    "size": 1908635
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.cmr.xml",
    "key": "MOD09GQ___006/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.cmr.xml",
    "type": "metadata",
    "bucket": "cumulus-test-sandbox-protected-2",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{substring(file.fileName, 0, 3)}"
    }
    ],
    "cmrLink": "https://cmr.uat.earthdata.nasa.gov/search/granules.json?concept_id=G1222231611-CUMULUS",
    "cmrConceptId": "G1222231611-CUMULUS",
    "granuleId": "MOD09GQ.A2016358.h13v04.006.2016360104606",
    "cmrMetadataFormat": "echo10",
    "dataType": "MOD09GQ",
    "version": "006",
    "published": true
    }
    ]
    }

You can verify the granules exist within your Cumulus instance (search using the Granules interface, check the S3 buckets, etc.) and validate the above CMR entry via the cmrLink included in the output.


    Build Processing Lambda

    This section discusses the construction of a custom processing lambda to replace the contrived example from this entry for a real dataset processing task.

    To ingest your own data using this example, you will need to construct your own lambda to replace the source in ProcessingStep that will generate browse imagery and provide or update a CMR metadata export file.

You will then need to add the lambda to your Cumulus deployment as an aws_lambda_function Terraform resource.
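A hedged sketch of what such a resource might look like; the resource name, artifact path, handler, runtime, timeout, and CMA layer variable are all assumptions for illustration rather than part of the example deployment:

resource "aws_lambda_function" "browse_processing" {
  function_name    = "${var.prefix}-BrowseProcessing"      # assumed name
  filename         = "./browse-processing.zip"             # assumed build artifact
  source_code_hash = filebase64sha256("./browse-processing.zip")
  handler          = "index.handler"                       # assumed handler
  runtime          = "nodejs14.x"                          # assumed runtime
  role             = module.cumulus.lambda_processing_role_arn
  timeout          = 300
  layers           = [var.cumulus_message_adapter_lambda_layer_version_arn]  # assumed variable holding your CMA layer ARN
}

Your workflow's ProcessingStep would then reference aws_lambda_function.browse_processing.arn as its Resource.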

    The discussion below outlines requirements for this lambda.

    Inputs

    The incoming message to the task defined in the ProcessingStep as configured will have the following configuration values (accessible inside event.config courtesy of the message adapter):

    Configuration

    • event.config.bucket -- the name of the bucket configured in terraform.tfvars as your internal bucket.

    • event.config.collection -- The full collection object we will configure in the Configure Ingest section. You can view the expected collection schema in the docs here or in the source code on github. You need this as available input and output so you can update as needed.

event.config.additionalUrls, generateFakeBrowse, and event.config.cmrMetadataFormat from the example can be ignored, as they're configuration flags for the provided example script.

    Payload

    The 'payload' from the previous task is accessible via event.input. The expected payload output schema from SyncGranules can be viewed here.

    In our example, the payload would look like the following. Note: The types are set per-file based on what we configured in our collection, and were initially added as part of the DiscoverGranules step in the DiscoverGranulesBrowseExample workflow.

     "payload": {
    "process": "modis",
    "granules": [
    {
    "granuleId": "MOD09GQ.A2016358.h13v04.006.2016360104606",
    "dataType": "MOD09GQ",
    "version": "006",
    "files": [
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "bucket": "cumulus-test-sandbox-internal",
    "key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "size": 1908635
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "bucket": "cumulus-test-sandbox-internal",
    "key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "size": 21708
    }
    ]
    }
    ]
    }

    Generating Browse Imagery

The example script goes through all granules and adds a 'fake' .jpg browse file to the same staging location as the data staged by prior ingest tasks.

The processing lambda you construct will need to do the following (a sketch of the resulting return shape follows this list):

    • Create a browse image file based on the input data, and stage it to a location accessible to both this task and the FilesToGranules and MoveGranules tasks in a S3 bucket.
    • Add the browse file to the input granule files, making sure to set the granule file's type to browse.
    • Update meta.input_granules with the updated granules list, as well as provide the files to be integrated by FilesToGranules as output from the task.
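Assuming the same output mapping as the provided example (see Expected Outputs below), the object returned by the processing task could be sketched as follows; only the newly added browse file entry is shown under the granule, and the original data/metadata file entries carry through unchanged:

{
  "granules": [
    {
      "granuleId": "MOD09GQ.A2016358.h13v04.006.2016360104606",
      "dataType": "MOD09GQ",
      "version": "006",
      "files": [
        {
          "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
          "bucket": "cumulus-test-sandbox-internal",
          "key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
          "type": "browse"
        }
      ]
    }
  ],
  "files": [
    "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.jpg"
  ]
}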

    Generating/updating CMR metadata

If you do not already have a CMR file in the granules list, you will need to generate one for valid export. This example's processing script generates one and adds it to the FilesToGranules file list via the payload, but it can be present in the InputGranules from the DiscoverGranules task as well if you'd prefer to pre-generate it.

The downstream MoveGranules, UpdateGranulesCmrMetadataFileLinks, and PostToCmr tasks all expect a valid CMR file to be available if you want to export to CMR.

    Expected Outputs for processing task/tasks

    In the above example, the critical portion of the output to FilesToGranules is the payload and meta.input_granules.

In the example provided, the processing task is set up to return an object with the keys "files" and "granules". In the cumulus_message configuration, files is mapped to the payload and granules to meta.input_granules:

              "task_config": {
    "inputGranules": "{$.meta.input_granules}",
    "granuleIdExtraction": "{$.meta.collection.granuleIdExtraction}"
    }

    Their expected values from the example above may be useful in constructing a processing task:

    payload

The payload includes a full list of files to be 'moved' into the cumulus archive. The FilesToGranules task will take this list, merge it with the information from InputGranules, then pass that list to the MoveGranules task. The MoveGranules task will then move the files to their targets. The UpdateGranulesCmrMetadataFileLinks task will update the CMR metadata file, if one exists, with the updated granule locations and will update the CMR file etags.

    In the provided example, a payload being passed to the FilesToGranules task should be expected to look like:

      "payload": [
    "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
    "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.cmr.xml"
    ]

This is the list of files that FilesToGranules will act upon to add/merge with the input_granules object.

    The pathing is generated from sync-granules, but in principle the files can be staged wherever you like so long as the processing/MoveGranules task's roles have access and the filename matches the collection configuration.

    input_granules

The FilesToGranules task utilizes the incoming payload to choose which files to move, but pulls all other metadata from meta.input_granules. As such, the meta.input_granules output in the example would look like the following:

    "input_granules": [
    {
    "granuleId": "MOD09GQ.A2016358.h13v04.006.2016360104606",
    "dataType": "MOD09GQ",
    "version": "006",
    "files": [
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "bucket": "cumulus-test-sandbox-internal",
    "key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "size": 1908635
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "bucket": "cumulus-test-sandbox-internal",
    "key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "size": 21708
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
    "bucket": "cumulus-test-sandbox-internal",
    "key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.jpg"
    }
    ]
    }
    ],
    - + \ No newline at end of file diff --git a/docs/v11.1.0/data-cookbooks/choice-states/index.html b/docs/v11.1.0/data-cookbooks/choice-states/index.html index f8d49c6a2f5..0ff3288545d 100644 --- a/docs/v11.1.0/data-cookbooks/choice-states/index.html +++ b/docs/v11.1.0/data-cookbooks/choice-states/index.html @@ -5,13 +5,13 @@ Choice States | Cumulus Documentation - +
    Version: v11.1.0

    Choice States

    Cumulus supports AWS Step Function Choice states. A Choice state enables branching logic in Cumulus workflows.

    Choice state definitions include a list of Choice Rules. Each Choice Rule defines a logical operation which compares an input value against a value using a comparison operator. For available comparison operators, review the AWS docs.

    If the comparison evaluates to true, the Next state is followed.

    Example

    In examples/cumulus-tf/parse_pdr_workflow.tf the ParsePdr workflow uses a Choice state, CheckAgainChoice, to terminate the workflow once meta.isPdrFinished: true is returned by the CheckStatus state.

    The CheckAgainChoice state definition requires an input object of the following structure:

    {
    "meta": {
    "isPdrFinished": false
    }
    }

    Given the above input to the CheckAgainChoice state, the workflow would transition to the PdrStatusReport state.

    "CheckAgainChoice": {
    "Type": "Choice",
    "Choices": [
    {
    "Variable": "$.meta.isPdrFinished",
    "BooleanEquals": false,
    "Next": "PdrStatusReport"
    },
    {
    "Variable": "$.meta.isPdrFinished",
    "BooleanEquals": true,
    "Next": "WorkflowSucceeded"
    }
    ],
    "Default": "WorkflowSucceeded"
    }

    Advanced: Loops in Cumulus Workflows

    Understanding the complete ParsePdr workflow is not necessary to understanding how Choice states work, but ParsePdr provides an example of how Choice states can be used to create a loop in a Cumulus workflow.

In the complete ParsePdr workflow definition, the state QueueGranules is followed by CheckStatus. From CheckStatus a loop starts: as long as CheckStatus returns meta.isPdrFinished: false, it is followed by CheckAgainChoice, then PdrStatusReport, then WaitForSomeTime, which returns to CheckStatus. Once CheckStatus returns meta.isPdrFinished: true, CheckAgainChoice proceeds to WorkflowSucceeded.

    Execution graph of SIPS ParsePdr workflow in AWS Step Functions console

    Further documentation

    For complete details on Choice state configuration options, see the Choice state documentation.

    - + \ No newline at end of file diff --git a/docs/v11.1.0/data-cookbooks/cnm-workflow/index.html b/docs/v11.1.0/data-cookbooks/cnm-workflow/index.html index b0347af0097..cc79940aeb5 100644 --- a/docs/v11.1.0/data-cookbooks/cnm-workflow/index.html +++ b/docs/v11.1.0/data-cookbooks/cnm-workflow/index.html @@ -5,7 +5,7 @@ CNM Workflow | Cumulus Documentation - + @@ -13,7 +13,7 @@
    Version: v11.1.0

    CNM Workflow

    This entry documents how to setup a workflow that utilizes the built-in CNM/Kinesis functionality in Cumulus.

    Prior to working through this entry you should be familiar with the Cloud Notification Mechanism.

    Sections


    Prerequisites

    Cumulus

    This entry assumes you have a deployed instance of Cumulus (version >= 1.16.0). The entry assumes you are deploying Cumulus via the cumulus terraform module sourced from the release page.

    AWS CLI

    This entry assumes you have the AWS CLI installed and configured. If you do not, please take a moment to review the documentation - particularly the examples relevant to Kinesis - and install it now.

    Kinesis

This entry assumes you already have two Kinesis data streams created for use as CNM notification and response data streams.

If you do not have two streams set up, please take a moment to review the Kinesis documentation and set up two basic single-shard streams for this example:

    Using the "Create Data Stream" button on the Kinesis Dashboard, work through the dialogue.

    You should be able to quickly use the "Create Data Stream" button on the Kinesis Dashboard, and setup streams that are similar to the following example:

    Screenshot of AWS console page for creating a Kinesis stream

    Please bear in mind that your {{prefix}}-lambda-processing IAM role will need permissions to write to the response stream for this workflow to succeed if you create the Kinesis stream with a dashboard user. If you are using the cumulus top-level module for your deployment this should be set properly.

If not, the most straightforward approach is to attach the AmazonKinesisFullAccess policy for the stream resource to whatever roles your Lambdas are using; however, your environment/security policies may require an approach specific to your deployment environment.

In operational environments, science data providers would typically be responsible for providing a Kinesis stream with the appropriate permissions.

    For more information on how this process works and how to develop a process that will add records to a stream, read the Kinesis documentation and the developer guide.

    Source Data

    This entry will run the SyncGranule task against a single target data file. To that end it will require a single data file to be present in an S3 bucket matching the Provider configured in the next section.
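For example, assuming you have a local copy of the test granule and write access to the provider bucket (the bucket and path here are placeholders):

aws s3 cp ./MOD09GQ.A2016358.h13v04.006.2016360104606.hdf \
  s3://<your-provider-bucket>/cumulus-test-data/pdrs/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf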

    Collection and Provider

    Cumulus will need to be configured with a Collection and Provider entry of your choosing. The provider should match the location of the source data from the Ingest Source Data section.

This can be done via the Cumulus Dashboard, if installed, or via the API. It is strongly recommended to use the dashboard if possible.


    Configure the Workflow

    Provided the prerequisites have been fulfilled, you can begin adding the needed values to your Cumulus configuration to configure the example workflow.

    The following are steps that are required to set up your Cumulus instance to run the example workflow:

    Example CNM Workflow

    In this example, we're going to trigger a workflow by creating a Kinesis rule and sending a record to a Kinesis stream.

    The following workflow definition should be added to a new .tf workflow resource (e.g. cnm_workflow.tf) in your deployment directory. For the complete CNM workflow example, see examples/cumulus-tf/kinesis_trigger_test_workflow.tf.

    Add the following to the new terraform file in your deployment directory, updating the following:

    • Set the response-endpoint key in the CnmResponse task in the workflow JSON to match the name of the Kinesis response stream you configured in the prerequisites section
    • Update the source key to the workflow module to match the Cumulus release associated with your deployment.
    module "cnm_workflow" {
    source = "https://github.com/nasa/cumulus/releases/download/{version}/terraform-aws-cumulus-workflow.zip"

    prefix = var.prefix
    name = "CNMExampleWorkflow"
    workflow_config = module.cumulus.workflow_config
    system_bucket = var.system_bucket

state_machine_definition = <<JSON
{
    "Comment": "CNMExampleWorkflow",
    "StartAt": "TranslateMessage",
    "States": {
    "TranslateMessage": {
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_to_cma_task.arn}",
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "collection": "{$.meta.collection}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.cnm}",
    "destination": "{$.meta.cnm}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "CnmResponse"
    }
    ],
    "Next": "SyncGranule"
    },
    "SyncGranule": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "provider": "{$.meta.provider}",
    "buckets": "{$.meta.buckets}",
    "collection": "{$.meta.collection}",
    "downloadBucket": "{$.meta.buckets.private.name}",
    "stack": "{$.meta.stack}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.granules}",
    "destination": "{$.meta.input_granules}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${module.cumulus.sync_granule_task.task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "IntervalSeconds": 10,
    "MaxAttempts": 3
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "CnmResponse"
    }
    ],
    "Next": "CnmResponse"
    },
    "CnmResponse": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "OriginalCNM": "{$.meta.cnm}",
    "distribution_endpoint": "{$.meta.distribution_endpoint}",
    "response-endpoint": "ADD YOUR RESPONSE STREAM NAME HERE",
    "region": "us-east-1",
    "type": "kinesis",
    "WorkflowException": "{$.exception}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.cnm}",
    "destination": "{$.meta.cnmResponse}"
    },
    {
    "source": "{$.input.input}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_response_task.arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "IntervalSeconds": 5,
    "MaxAttempts": 3
    }
    ],
    "End": true
    }
    }
    }
JSON
}

Again, please make sure to modify the response-endpoint value to match the name (not the ARN) of your Kinesis response stream.

    Lambda Configuration

    To execute this workflow, you're required to include several Lambda resources in your deployment. To do this, add the following task (Lambda) definitions to your deployment along with the workflow you created above:

    Please note: To utilize these tasks you need to ensure you have a compatible CMA layer. See the deployment instructions for more details on how to deploy a CMA layer.

    Below is a description of each of these tasks:

    CNMToCMA

    CNMToCMA is meant for the beginning of a workflow: it maps CNM granule information to a payload for downstream tasks. For other CNM workflows, you would need to ensure that downstream tasks in your workflow either understand the CNM message or include a translation task like this one.

    You can also manipulate the data sent to downstream tasks using task_config for various states in your workflow resource configuration. Read more about how to configure data on the Workflow Input & Output page.

    CnmResponse

    The CnmResponse Lambda generates a CNM response message and puts it on the response-endpoint Kinesis stream.

    You can read more about the expected schema of a CnmResponse record in the Cloud Notification Mechanism schema repository.

    Additional Tasks

    Lastly, this entry also makes use of the SyncGranule task from the cumulus module.

    Redeploy

    Once the above configuration changes have been made, redeploy your stack.

    Please refer to Update Cumulus resources in the deployment documentation if you are unfamiliar with redeployment.

    Rule Configuration

    Cumulus includes a messageConsumer Lambda function (message-consumer). Cumulus kinesis-type rules create the event source mappings between Kinesis streams and the messageConsumer Lambda. The messageConsumer Lambda consumes records from one or more Kinesis streams, as defined by enabled kinesis-type rules. When new records are pushed to one of these streams, the messageConsumer triggers workflows associated with the enabled kinesis-type rules.

    To add a rule via the dashboard (if you'd like to use the API, see the docs here), navigate to the Rules page and click Add a rule, then configure the new rule using the following template (substituting correct values for parameters denoted by ${}):

    {
    "collection": {
    "name": "L2_HR_PIXC",
    "version": "000"
    },
    "name": "L2_HR_PIXC_kinesisRule",
    "provider": "PODAAC_SWOT",
    "rule": {
    "type": "kinesis",
    "value": "arn:aws:kinesis:{{awsRegion}}:{{awsAccountId}}:stream/{{streamName}}"
    },
    "state": "ENABLED",
    "workflow": "CNMExampleWorkflow"
    }

    Please Note:

• The rule's value attribute must match the Amazon Resource Name (ARN) of the Kinesis data stream you've preconfigured. You should be able to obtain this ARN from the Kinesis Dashboard entry for the selected stream, or with the AWS CLI as sketched after this list.
    • The collection and provider should match the collection and provider you setup in the Prerequisites section.
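If you prefer the CLI to the console, one way to look up a stream's ARN (substituting your notification stream name):

aws kinesis describe-stream-summary \
  --stream-name YOUR_KINESIS_NOTIFICATION_STREAM_NAME_HERE \
  --query 'StreamDescriptionSummary.StreamARN' \
  --output text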

    Once you've clicked on 'submit' a new rule should appear in the dashboard's Rule Overview.


    Execute the Workflow

    Once Cumulus has been redeployed and a rule has been added, we're ready to trigger the workflow and watch it execute.

    How to Trigger the Workflow

    To trigger matching workflows, you will need to put a record on the Kinesis stream that the message-consumer Lambda will recognize as a matching event. Most importantly, it should include a collection name that matches a valid collection.

    For the purpose of this example, the easiest way to accomplish this is using the AWS CLI.

    Create Record JSON

    Construct a JSON file containing an object that matches the values that have been previously setup. This JSON object should be a valid Cloud Notification Mechanism message.

    Please note: this example is somewhat contrived, as the downstream tasks don't care about most of these fields. A 'real' data ingest workflow would.

    The following values (denoted by ${} in the sample below) should be replaced to match values we've previously configured:

    • TEST_DATA_FILE_NAME: The filename of the test data that is available in the S3 (or other) provider we created earlier.
    • TEST_DATA_URI: The full S3 path to the test data (e.g. s3://bucket-name/path/granule)
    • COLLECTION: The collection name defined in the prerequisites for this product
    {
    "product": {
    "files": [
    {
    "checksumType": "md5",
    "name": "${TEST_DATA_FILE_NAME}",
    "checksum": "bogus_checksum_value",
    "uri": "${TEST_DATA_URI}",
    "type": "data",
    "size": 12345678
    }
    ],
    "name": "${TEST_DATA_FILE_NAME}",
    "dataVersion": "006"
    },
    "identifier ": "testIdentifier123456",
    "collection": "${COLLECTION}",
    "provider": "TestProvider",
    "version": "001",
    "submissionTime": "2017-09-30T03:42:29.791198"
    }

    Add Record to Kinesis Data Stream

    Using the JSON file you created, push it to the Kinesis notification stream:

    aws kinesis put-record --stream-name YOUR_KINESIS_NOTIFICATION_STREAM_NAME_HERE --partition-key 1 --data file:///path/to/file.json

    Please note: The above command uses the stream name, not the ARN.

    The command should return output similar to:

    {
    "ShardId": "shardId-000000000000",
    "SequenceNumber": "42356659532578640215890215117033555573986830588739321858"
    }

    This command will put a record containing the JSON from the --data flag onto the Kinesis data stream. The messageConsumer Lambda will consume the record and construct a valid CMA payload to trigger workflows. For this example, the record will trigger the CNMExampleWorkflow workflow as defined by the rule previously configured.

    You can view the current running executions on the Executions dashboard page which presents a list of all executions, their status (running, failed, or completed), to which workflow the execution belongs, along with other information.

    Verify Workflow Execution

As detailed above, once the record is added to the Kinesis data stream, the messageConsumer Lambda will trigger the CNMExampleWorkflow.

    TranslateMessage

    TranslateMessage (which corresponds to the CNMToCMA Lambda) will take the CNM object payload and add a granules object to the CMA payload that's consistent with other Cumulus ingest tasks, and add a meta.cnm key (as well as the payload) to store the original message.

    For more on the Message Adapter, please see the Message Flow documentation.

    An example of what is happening in the CNMToCMA Lambda is as follows:

    Example Input Payload:

    "payload": {
    "identifier ": "testIdentifier123456",
    "product": {
    "files": [
    {
    "checksumType": "md5",
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "checksum": "bogus_checksum_value",
    "uri": "s3://some_bucket/cumulus-test-data/pdrs/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "type": "data",
    "size": 12345678
    }
    ],
    "name": "TestGranuleUR",
    "dataVersion": "006"
    },
    "version": "123456",
    "collection": "MOD09GQ",
    "provider": "TestProvider",
    "submissionTime": "2017-09-30T03:42:29.791198"
    }

    Example Output Payload:

      "payload": {
    "cnm": {
    "identifier ": "testIdentifier123456",
    "product": {
    "files": [
    {
    "checksumType": "md5",
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "checksum": "bogus_checksum_value",
    "uri": "s3://some-bucket/cumulus-test-data/data/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "type": "data",
    "size": 12345678
    }
    ],
    "name": "TestGranuleUR",
    "dataVersion": "006"
    },
    "version": "123456",
    "collection": "MOD09GQ",
    "provider": "TestProvider",
    "submissionTime": "2017-09-30T03:42:29.791198",
    "receivedTime": "2017-09-30T03:42:31.634552"
    },
    "output": {
    "granules": [
    {
    "granuleId": "TestGranuleUR",
    "files": [
    {
    "path": "some-bucket/data",
    "url_path": "s3://some-bucket/cumulus-test-data/data/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "bucket": "some-bucket",
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "size": 12345678
    }
    ]
    }
    ]
    }
    }

    SyncGranules

    This Lambda will take the files listed in the payload and move them to s3://{deployment-private-bucket}/file-staging/{deployment-name}/{COLLECTION}/{file_name}.

    CnmResponse

Assuming a successful execution of the workflow, this task will recover the meta.cnm key from the CMA output, and add a "SUCCESS" record to the response Kinesis stream (the response-endpoint configured above).

    If a prior step in the workflow has failed, this will add a "FAILURE" record to the stream instead.

    The data written to the response-endpoint should adhere to the Response Message Fields schema.

    Example CNM Success Response:

    {
    "provider": "PODAAC_SWOT",
    "collection": "SWOT_Prod_l2:1",
    "processCompleteTime": "2017-09-30T03:45:29.791198",
    "submissionTime": "2017-09-30T03:42:29.791198",
    "receivedTime": "2017-09-30T03:42:31.634552",
    "identifier": "1234-abcd-efg0-9876",
    "response": {
    "status": "SUCCESS"
    }
    }

    Example CNM Error Response:

    {
    "provider": "PODAAC_SWOT",
    "collection": "SWOT_Prod_l2:1",
    "processCompleteTime": "2017-09-30T03:45:29.791198",
    "submissionTime": "2017-09-30T03:42:29.791198",
    "receivedTime": "2017-09-30T03:42:31.634552",
    "identifier": "1234-abcd-efg0-9876",
    "response": {
    "status": "FAILURE",
    "errorCode": "PROCESSING_ERROR",
    "errorMessage": "File [cumulus-dev-a4d38f59-5e57-590c-a2be-58640db02d91/prod_20170926T11:30:36/production_file.nc] did not match gve checksum value."
    }
    }

    Note the CnmResponse state defined in the .tf workflow definition above configures $.exception to be passed to the CnmResponse Lambda keyed under config.WorkflowException. This is required for the CnmResponse code to deliver a failure response.

    To test the failure scenario, send a record missing the product.name key.


    Verify results

    Check for successful execution on the dashboard

    Following the successful execution of this workflow, you should expect to see the workflow complete successfully on the dashboard:

    Screenshot of a successful CNM workflow appearing on the executions page of the Cumulus dashboard

    Check the test granule has been delivered to S3 staging

    The test granule identified in the Kinesis record should be moved to the deployment's private staging area.

    Check for Kinesis records

    A SUCCESS notification should be present on the response-endpoint Kinesis stream.

    You should be able to validate the notification and response streams have the expected records with the following steps (the AWS CLI Kinesis Basic Stream Operations is useful to review before proceeding):

    Get a shard iterator (substituting your stream name as appropriate):

    aws kinesis get-shard-iterator \
    --shard-id shardId-000000000000 \
    --shard-iterator-type LATEST \
    --stream-name NOTIFICATION_OR_RESPONSE_STREAM_NAME

which should return output similar to:

    {
    "ShardIterator": "VeryLongString=="
    }
• Re-trigger the workflow by using the put-record command from the Add Record to Kinesis Data Stream step above
    • As the workflow completes, use the output from the get-shard-iterator command to request data from the stream:
    aws kinesis get-records --shard-iterator SHARD_ITERATOR_VALUE

    This should result in output similar to:

    {
    "Records": [
    {
    "SequenceNumber": "49586720336541656798369548102057798835250389930873978882",
    "ApproximateArrivalTimestamp": 1532664689.128,
    "Data": "eyJpZGVudGlmaWVyICI6InRlc3RJZGVudGlmaWVyMTIzNDU2IiwidmVyc2lvbiI6IjAwNiIsImNvbGxlY3Rpb24iOiJNT0QwOUdRIiwicHJvdmlkZXIiOiJUZXN0UHJvdmlkZXIiLCJwcm9kdWN0U2l6ZSI6MTkwODYzNS4wLCJyZXNwb25zZSI6eyJzdGF0dXMiOiJTVUNDRVNTIn0sInByb2Nlc3NDb21wbGV0ZVRpbWUiOiIyMDE4LTA3LTI3VDA0OjExOjI4LjkxOSJ9",
    "PartitionKey": "1"
    },
    {
    "SequenceNumber": "49586720336541656798369548102059007761070005796999266306",
    "ApproximateArrivalTimestamp": 1532664707.149,
    "Data": "eyJpZGVudGlmaWVyICI6InRlc3RJZGVudGlmaWVyMTIzNDU2IiwidmVyc2lvbiI6IjAwNiIsImNvbGxlY3Rpb24iOiJNT0QwOUdRIiwicHJvdmlkZXIiOiJUZXN0UHJvdmlkZXIiLCJwcm9kdWN0U2l6ZSI6MTkwODYzNS4wLCJyZXNwb25zZSI6eyJzdGF0dXMiOiJTVUNDRVNTIn0sInByb2Nlc3NDb21wbGV0ZVRpbWUiOiIyMDE4LTA3LTI3VDA0OjExOjQ2Ljk1OCJ9",
    "PartitionKey": "1"
    }
    ],
    "NextShardIterator": "AAAAAAAAAAFo9SkF8RzVYIEmIsTN+1PYuyRRdlj4Gmy3dBzsLEBxLo4OU+2Xj1AFYr8DVBodtAiXbs3KD7tGkOFsilD9R5tA+5w9SkGJZ+DRRXWWCywh+yDPVE0KtzeI0andAXDh9yTvs7fLfHH6R4MN9Gutb82k3lD8ugFUCeBVo0xwJULVqFZEFh3KXWruo6KOG79cz2EF7vFApx+skanQPveIMz/80V72KQvb6XNmg6WBhdjqAA==",
    "MillisBehindLatest": 0
    }

Note the Data field is base64 encoded and not human readable; it would need to be decoded and parsed to be interpretable. There are many options for building a Kinesis consumer, such as the KCL.
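For a quick look at a single record, one option is to have the CLI extract the Data field and pipe it through a base64 decoder (assuming a base64 utility that accepts --decode is on your PATH):

aws kinesis get-records --shard-iterator SHARD_ITERATOR_VALUE \
  --query 'Records[0].Data' --output text | base64 --decode

This should print the JSON response object contained in the first returned record.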

    For purposes of validating the workflow, it may be simpler to locate the workflow in the Step Function Management Console and assert the expected output is similar to the below examples.

    Successful CNM Response Object Example:

    {
    "cnmResponse": {
    "provider": "TestProvider",
    "collection": "MOD09GQ",
    "version": "123456",
    "processCompleteTime": "2017-09-30T03:45:29.791198",
    "submissionTime": "2017-09-30T03:42:29.791198",
    "receivedTime": "2017-09-30T03:42:31.634552",
    "identifier ": "testIdentifier123456",
    "response": {
    "status": "SUCCESS"
    }
    }
    }

    Kinesis Record Error Handling

    messageConsumer

    The default Kinesis stream processing in the Cumulus system is configured for record error tolerance.

    When the messageConsumer fails to process a record, the failure is captured and the record is published to the kinesisFallback SNS Topic. The kinesisFallback SNS topic broadcasts the record and a subscribed copy of the messageConsumer Lambda named kinesisFallback consumes these failures.

At this point, the normal Lambda asynchronous invocation retry behavior will attempt to process the record 3 more times. After this, if the record cannot successfully be processed, it is written to a dead letter queue. Cumulus' dead letter queue is an SQS Queue named kinesisFailure. Operators can use this queue to inspect failed records.

This system ensures that when the messageConsumer fails to process a record and trigger a workflow, the record is retried 3 times. This retry behavior improves system reliability in case of any external service failure outside of Cumulus' control.

The Kinesis error handling system - the kinesisFallback SNS topic, messageConsumer Lambda, and kinesisFailure SQS queue - comes with the API package and does not need to be configured by the operator.

To examine records that could not be processed at any step, look at the dead letter queue {{prefix}}-kinesisFailure in the Simple Queue Service (SQS) console. Select your queue, and under the Queue Actions tab, choose View/Delete Messages. Start polling for messages and you will see records that failed to process through the messageConsumer.
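The same inspection can be sketched from the CLI (substituting your deployment prefix; note that received messages become temporarily invisible to other consumers for the queue's visibility timeout):

QUEUE_URL=$(aws sqs get-queue-url --queue-name {{prefix}}-kinesisFailure --query 'QueueUrl' --output text)
aws sqs receive-message --queue-url "$QUEUE_URL" --max-number-of-messages 10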

    Note, these are only records that occurred when processing records from Kinesis streams. Workflow failures are handled differently.

    Kinesis Stream logging

    Notification Stream messages

    Cumulus includes two Lambdas (KinesisInboundEventLogger and KinesisOutboundEventLogger) that utilize the same code to take a Kinesis record event as input, deserialize the data field and output the modified event to the logs.

    When a kinesis rule is created, in addition to the messageConsumer event mapping, an event mapping is created to trigger KinesisInboundEventLogger to record a log of the inbound record, to allow for analysis in case of unexpected failure.

    Response Stream messages

    Cumulus also supports this feature for all outbound messages. To take advantage of this feature, you will need to set an event mapping on the KinesisOutboundEventLogger Lambda that targets your response-endpoint. You can do this in the Lambda management page for KinesisOutboundEventLogger. Add a Kinesis trigger, and configure it to target the cnmResponseStream for your workflow:

    Screenshot of the AWS console showing configuration for Kinesis stream trigger on KinesisOutboundEventLogger Lambda

    Once this is done, all records sent to the response-endpoint will also be logged in CloudWatch. For more on configuring Lambdas to trigger on Kinesis events, please see creating an event source mapping.
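If you prefer to set this up from the CLI rather than the Lambda console, one way to create the event source mapping (the function name assumes the default {{prefix}}-KinesisOutboundEventLogger naming, and the stream ARN is a placeholder):

aws lambda create-event-source-mapping \
  --function-name {{prefix}}-KinesisOutboundEventLogger \
  --event-source-arn arn:aws:kinesis:{{awsRegion}}:{{awsAccountId}}:stream/{{responseStreamName}} \
  --starting-position LATEST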

    - + \ No newline at end of file diff --git a/docs/v11.1.0/data-cookbooks/error-handling/index.html b/docs/v11.1.0/data-cookbooks/error-handling/index.html index 8a042ec8dbb..9dbf2e74372 100644 --- a/docs/v11.1.0/data-cookbooks/error-handling/index.html +++ b/docs/v11.1.0/data-cookbooks/error-handling/index.html @@ -5,7 +5,7 @@ Error Handling in Workflows | Cumulus Documentation - + @@ -45,7 +45,7 @@ Service Exception. See this documentation on configuring your workflow to handle transient lambda errors.

    Example state machine definition:

    {
    "Comment": "Tests Workflow from Kinesis Stream",
    "StartAt": "TranslateMessage",
    "States": {
    "TranslateMessage": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.cnm}",
    "destination": "{$.meta.cnm}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_to_cma_task.arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": ["States.ALL"],
    "ResultPath": "$.exception",
    "Next": "CnmResponseFail"
    }
    ],
    "Next": "SyncGranule"
    },
    "SyncGranule": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "Path": "$.payload",
    "TargetPath": "$.payload"
    },
    "task_config": {
    "provider": "{$.meta.provider}",
    "buckets": "{$.meta.buckets}",
    "collection": "{$.meta.collection}",
    "downloadBucket": "{$.meta.buckets.private.name}",
    "stack": "{$.meta.stack}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.granules}",
    "destination": "{$.meta.input_granules}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${module.cumulus.sync_granule_task.task_arn}",
    "Retry": [
    {
    "ErrorEquals": ["States.ALL"],
    "IntervalSeconds": 10,
    "MaxAttempts": 3
    }
    ],
    "Catch": [
    {
    "ErrorEquals": ["States.ALL"],
    "ResultPath": "$.exception",
    "Next": "CnmResponseFail"
    }
    ],
    "Next": "CnmResponse"
    },
    "CnmResponse": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "OriginalCNM": "{$.meta.cnm}",
    "CNMResponseStream": "{$.meta.cnmResponseStream}",
    "region": "us-east-1",
    "WorkflowException": "{$.exception}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$}",
    "destination": "{$.meta.cnmResponse}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_response_task.arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": ["States.ALL"],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "Next": "WorkflowSucceeded"
    },
    "CnmResponseFail": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "OriginalCNM": "{$.meta.cnm}",
    "CNMResponseStream": "{$.meta.cnmResponseStream}",
    "region": "us-east-1",
    "WorkflowException": "{$.exception}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$}",
    "destination": "{$.meta.cnmResponse}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_response_task.arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": ["States.ALL"],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "Next": "WorkflowFailed"
    },
    "WorkflowSucceeded": {
    "Type": "Succeed"
    },
    "WorkflowFailed": {
    "Type": "Fail",
    "Cause": "Workflow failed"
    }
    }
    }

    The above results in a workflow which is visualized in the diagram below:

    Screenshot of a visualization of an AWS Step Function workflow definition with branching logic for failures

    Summary

    Error handling should (mostly) be the domain of workflow configuration.

    - + \ No newline at end of file diff --git a/docs/v11.1.0/data-cookbooks/hello-world/index.html b/docs/v11.1.0/data-cookbooks/hello-world/index.html index 8b0da61c861..610d1387c36 100644 --- a/docs/v11.1.0/data-cookbooks/hello-world/index.html +++ b/docs/v11.1.0/data-cookbooks/hello-world/index.html @@ -5,14 +5,14 @@ HelloWorld Workflow | Cumulus Documentation - +
    Version: v11.1.0

    HelloWorld Workflow

    Example task meant to be a sanity check/introduction to the Cumulus workflows.

    Pre-Deployment Configuration

    Workflow Configuration

    A workflow definition can be found in the template repository hello_world_workflow module.

    {
    "Comment": "Returns Hello World",
    "StartAt": "HelloWorld",
    "States": {
    "HelloWorld": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "buckets": "{$.meta.buckets}",
    "provider": "{$.meta.provider}",
    "collection": "{$.meta.collection}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${module.cumulus.hello_world_task.task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "End": true
    }
    }
    }

    Workflow error-handling can be configured as discussed in the Error-Handling cookbook.

    Task Configuration

The HelloWorld task is provided for you as part of the cumulus terraform module; no configuration is needed.

    If you want to manually deploy your own version of this Lambda for testing, you can copy the Lambda resource definition located in the Cumulus source code at cumulus/tf-modules/ingest/hello-world-task.tf. The Lambda source code is located in the Cumulus source code at 'cumulus/tasks/hello-world'.

    Execution

    We will focus on using the Cumulus dashboard to schedule the execution of a HelloWorld workflow.

    Our goal here is to create a rule through the Cumulus dashboard that will define the scheduling and execution of our HelloWorld workflow. Let's navigate to the Rules page and click Add a rule.

    {
    "collection": { # collection values can be configured and found on the Collections page
    "name": "${collection_name}",
    "version": "${collection_version}"
    },
    "name": "helloworld_rule",
    "provider": "${provider}", # found on the Providers page
    "rule": {
    "type": "onetime"
    },
    "state": "ENABLED",
    "workflow": "HelloWorldWorkflow" # This can be found on the Workflows page
    }

    Screenshot of AWS Step Function execution graph for the HelloWorld workflow Executed workflow as seen in AWS Console

    Output/Results

    The Executions page presents a list of all executions, their status (running, failed, or completed), to which workflow the execution belongs, along with other information. The rule defined in the previous section should start an execution of its own accord, and the status of that execution can be tracked here.

    To get some deeper information on the execution, click on the value in the Name column of your execution of interest. This should bring up a visual representation of the workflow similar to that shown above, execution details, and a list of events.

    Summary

    Setting up the HelloWorld workflow on the Cumulus dashboard is the tip of the iceberg, so to speak. The task and step-function need to be configured before Cumulus deployment. A compatible collection and provider must be configured and applied to the rule. Finally, workflow execution status can be viewed via the workflows tab on the dashboard.

    - + \ No newline at end of file diff --git a/docs/v11.1.0/data-cookbooks/ingest-notifications/index.html b/docs/v11.1.0/data-cookbooks/ingest-notifications/index.html index fa9fa96f53f..92738b4fcbe 100644 --- a/docs/v11.1.0/data-cookbooks/ingest-notifications/index.html +++ b/docs/v11.1.0/data-cookbooks/ingest-notifications/index.html @@ -5,13 +5,13 @@ Ingest Notification in Workflows | Cumulus Documentation - +
    Version: v11.1.0

    Ingest Notification in Workflows

On deployment, an SQS queue and three SNS topics (one each for executions, granules, and PDRs) are created and used for handling notification messages related to the workflow.

    The ingest notification reporting SQS queue is populated via a Cloudwatch rule for any Step Function execution state transitions. The sfEventSqsToDbRecords Lambda consumes this queue. The queue and Lambda are included in the cumulus module and the Cloudwatch rule in the workflow module and are included by default in a Cumulus deployment.

    The sfEventSqsToDbRecords Lambda function reads from the sfEventSqsToDbRecordsInputQueue queue and updates the RDS database records for granules, executions, and PDRs. When the records are updated, messages are posted to the three SNS topics. This Lambda is invoked both when the workflow starts and when it reaches a terminal state (completion or failure).

    Diagram of architecture for reporting workflow ingest notifications from AWS Step Functions

    Sending SQS messages to report status

    Publishing granule/PDR reports directly to the SQS queue

If you have a non-Cumulus workflow or process ingesting data and would like to update the status of your granules or PDRs, you can publish directly to the reporting SQS queue. Publishing messages to this queue will result in those messages being stored as granule/PDR records in the Cumulus database and the status of those granules/PDRs being visible on the Cumulus dashboard. The queue does have certain expectations: it expects a Cumulus Message nested within a Cloudwatch Step Function Event object.

Posting directly to the queue will require knowing the queue URL. Assuming that you are using the cumulus module for your deployment, you can get the queue URL (and the topic ARNs) by adding the following outputs to outputs.tf for your Terraform deployment, as in our example deployment:

    output "stepfunction_event_reporter_queue_url" {
    value = module.cumulus.stepfunction_event_reporter_queue_url
    }

    output "report_executions_sns_topic_arn" {
    value = module.cumulus.report_executions_sns_topic_arn
    }
    output "report_granules_sns_topic_arn" {
value = module.cumulus.report_granules_sns_topic_arn
    }
    output "report_pdrs_sns_topic_arn" {
    value = module.cumulus.report_pdrs_sns_topic_arn
    }

Then, when you run terraform apply, you should see the queue URL and topic ARNs printed to your console:

    Outputs:
    ...
    stepfunction_event_reporter_queue_url = https://sqs.us-east-1.amazonaws.com/xxxxxxxxx/<prefix>-sfEventSqsToDbRecordsInputQueue
    report_executions_sns_topic_arn = arn:aws:sns:us-east-1:xxxxxxxxx:<prefix>-report-executions-topic
report_granules_sns_topic_arn = arn:aws:sns:us-east-1:xxxxxxxxx:<prefix>-report-granules-topic
    report_pdrs_sns_topic_arn = arn:aws:sns:us-east-1:xxxxxxxxx:<prefix>-report-pdrs-topic

Once you have the queue URL, you can use the AWS SDK for your language of choice to send messages to the queue. The expected format of these messages is that of a Cloudwatch Step Function event containing a Cumulus message. For SUCCEEDED events, the Cumulus message is expected to be in detail.output. For all other event statuses, a Cumulus Message is expected in detail.input. The Cumulus Message populating these fields MUST be a JSON string, not an object. Messages that do not conform to the schemas will fail to be created as records.
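While the SDKs are the more common route, the same operation can be sketched with the AWS CLI, assuming you have written a conforming Cloudwatch Step Function event (with the Cumulus message as a JSON string in detail.output or detail.input) to a hypothetical local file named status-message.json:

aws sqs send-message \
  --queue-url "$(terraform output -raw stepfunction_event_reporter_queue_url)" \
  --message-body file://status-message.json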

    If you are not seeing records persist to the database or show up in the Cumulus dashboard, you can investigate the Cloudwatch logs of the SQS consumer Lambda:

    • /aws/lambda/<prefix>-sfEventSqsToDbRecords

    In a workflow

    As described above, ingest notifications will automatically be published to the SNS topics on workflow start and completion/failure, so you should not include a workflow step to publish the initial or final status of your workflows.

    However, if you want to report your ingest status at any point during a workflow execution, you can add a workflow step using the SfSqsReport Lambda. In the following example from cumulus-tf/parse_pdr_workflow.tf, the ParsePdr workflow is configured to use the SfSqsReport Lambda, primarily to update the PDR ingestion status.

    Note: ${sf_sqs_report_task_arn} is an interpolated value referring to a Terraform resource. See the example deployment code for the ParsePdr workflow.

      "PdrStatusReport": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "cumulus_message": {
    "input": "{$}"
    }
    }
    }
    },
    "ResultPath": null,
    "Type": "Task",
    "Resource": "${sf_sqs_report_task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "Next": "WaitForSomeTime"
    },

    Subscribing additional listeners to SNS topics

    Additional listeners to SNS topics can be configured in a .tf file for your Cumulus deployment. Shown below is configuration that subscribes an additional Lambda function (test_lambda) to receive messages from the report_executions SNS topic. To subscribe to the report_granules or report_pdrs SNS topics instead, simply replace report_executions in the code block below with either of those values.

    resource "aws_lambda_function" "test_lambda" {
    function_name = "${var.prefix}-testLambda"
    filename = "./testLambda.zip"
    source_code_hash = filebase64sha256("./testLambda.zip")
    handler = "index.handler"
    role = module.cumulus.lambda_processing_role_arn
    runtime = "nodejs10.x"
    }

    resource "aws_sns_topic_subscription" "test_lambda" {
    topic_arn = module.cumulus.report_executions_sns_topic_arn
    protocol = "lambda"
    endpoint = aws_lambda_function.test_lambda.arn
    }

    resource "aws_lambda_permission" "test_lambda" {
    action = "lambda:InvokeFunction"
    function_name = aws_lambda_function.test_lambda.arn
    principal = "sns.amazonaws.com"
    source_arn = module.cumulus.report_executions_sns_topic_arn
    }

    SNS message format

    Subscribers to the SNS topics can expect to find the published message in the SNS event at Records[0].Sns.Message. The message will be a JSON stringified version of the ingest notification record for an execution or a PDR. For granules, the message will be a JSON stringified object with ingest notification record in the record property and the event type as the event property.

    The ingest notification record of the execution, granule, or PDR should conform to the data model schema for the given record type.
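As a rough sketch of what a granule subscriber receives (the field values here are illustrative placeholders; the exact event names and record fields are governed by the schemas mentioned above):

{
  "Records": [
    {
      "Sns": {
        "Message": "{\"event\":\"Update\",\"record\":{\"granuleId\":\"MOD09GQ.A2016358.h13v04.006.2016360104606\",\"status\":\"completed\"}}"
      }
    }
  ]
}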

    Summary

    Workflows can be configured to send SQS messages at any point using the sf-sqs-report task.

    Additional listeners can be easily configured to trigger when messages are sent to the SNS topics.

    - + \ No newline at end of file diff --git a/docs/v11.1.0/data-cookbooks/queue-post-to-cmr/index.html b/docs/v11.1.0/data-cookbooks/queue-post-to-cmr/index.html index 11d840405e4..781ab28c748 100644 --- a/docs/v11.1.0/data-cookbooks/queue-post-to-cmr/index.html +++ b/docs/v11.1.0/data-cookbooks/queue-post-to-cmr/index.html @@ -5,13 +5,13 @@ Queue PostToCmr | Cumulus Documentation - +
    Version: v11.1.0

    Queue PostToCmr

In this document, we walk through handling CMR errors in workflows by queueing PostToCmr. We assume that the user already has an ingest workflow set up.

    Overview

    The general concept is that the last task of the ingest workflow will be QueueWorkflow, which queues the publish workflow. The publish workflow contains the PostToCmr task and if a CMR error occurs during PostToCmr, the publish workflow will add itself back onto the queue so that it can be executed when CMR is back online. This is achieved by leveraging the QueueWorkflow task again in the publish workflow. The following diagram demonstrates this queueing process.

    Diagram of workflow queueing

    Ingest Workflow

The last step should be the QueuePublishWorkflow step. It should be configured with a queueUrl and workflow. In this case, the queueUrl is a throttled queue. Any queueUrl can be specified here, which is useful if you would like to use a lower-priority queue. The workflow is the unprefixed workflow name that you would like to queue (e.g. PublishWorkflow).

      "QueuePublishWorkflowStep": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "workflow": "{$.meta.workflow}",
    "queueUrl": "${start_sf_queue_url}",
    "provider": "{$.meta.provider}",
    "collection": "{$.meta.collection}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${queue_workflow_task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "End": true
    },

    Publish Workflow

    Configure the Catch section of your PostToCmr task to proceed to QueueWorkflow if a CMRInternalError is caught. Any other error will cause the workflow to fail.

      "Catch": [
    {
    "ErrorEquals": [
    "CMRInternalError"
    ],
    "Next": "RequeueWorkflow"
    },
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "Next": "WorkflowFailed",
    "ResultPath": "$.exception"
    }
    ],

    Then, configure the QueueWorkflow task similarly to its configuration in the ingest workflow. This time, pass the current publish workflow to the task config. This allows for the publish workflow to be requeued when there is a CMR error.

    {
    "RequeueWorkflow": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "buckets": "{$.meta.buckets}",
    "distribution_endpoint": "{$.meta.distribution_endpoint}",
    "workflow": "PublishGranuleQueue",
    "queueUrl": "${start_sf_queue_url}",
    "provider": "{$.meta.provider}",
    "collection": "{$.meta.collection}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${queue_workflow_task_arn}",
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "Next": "WorkflowFailed",
    "ResultPath": "$.exception"
    }
    ],
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "End": true
    }
    }
    Version: v11.1.0

    Run Step Function Tasks in AWS Lambda or Docker

    Overview

AWS Step Function tasks can run on AWS Lambda or on AWS Elastic Container Service (ECS) as a Docker container.

Lambda provides a serverless architecture and is the best option for minimizing cost and server management. ECS provides the fullest extent of AWS EC2 resources via the flexibility to execute arbitrary code on any AWS EC2 instance type.

    When to use Lambda

    You should use AWS Lambda whenever all of the following are true:

• The task runs on one of the supported Lambda runtimes. At the time of this writing, supported runtimes include versions of Python, Java, Ruby, Node.js, Go, and .NET.
    • The lambda package is less than 50 MB in size, zipped.
    • The task consumes less than each of the following resources:
      • 3008 MB memory allocation
      • 512 MB disk storage (must be written to /tmp)
      • 15 minutes of execution time

    See this page for a complete and up-to-date list of AWS Lambda limits.

If your task requires more than any of these resources, or requires an unsupported runtime, creating a Docker image that can be run on ECS is the way to go. Cumulus supports running any lambda package (and its configured layers) as a Docker container with cumulus-ecs-task.

    Step Function Activities and cumulus-ecs-task

    Step Function Activities enable a state machine task to "publish" an activity task which can be picked up by any activity worker. Activity workers can run pretty much anywhere, but Cumulus workflows support the cumulus-ecs-task activity worker. The cumulus-ecs-task worker runs as a Docker container on the Cumulus ECS cluster.

    The cumulus-ecs-task container takes an AWS Lambda Amazon Resource Name (ARN) as an argument (see --lambdaArn in the example below). This ARN argument is defined at deployment time. The cumulus-ecs-task worker polls for new Step Function Activity Tasks. When a Step Function executes, the worker (container) picks up the activity task and runs the code contained in the lambda package defined on deployment.

    Example: Replacing AWS Lambda with a Docker container run on ECS

    This example will use an already-defined workflow from the cumulus module that includes the QueueGranules task in its configuration.

    The following example is an excerpt from the Discover Granules workflow containing the step definition for the QueueGranules step:

    Note: ${ingest_granule_workflow_name} and ${queue_granules_task_arn} are interpolated values that refer to Terraform resources. See the example deployment code for the Discover Granules workflow.

      "QueueGranules": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "provider": "{$.meta.provider}",
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "granuleIngestWorkflow": "${ingest_granule_workflow_name}",
    "queueUrl": "{$.meta.queues.startSF}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${queue_granules_task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "End": true
    },

Suppose you have discovered that this task can no longer run in AWS Lambda. You can instead run it on the Cumulus ECS cluster by adding the following resources to your Terraform deployment (by either adding a new .tf file or updating an existing one):

• An aws_sfn_activity resource:
    resource "aws_sfn_activity" "queue_granules" {
    name = "${var.prefix}-QueueGranules"
    }
• An instance of the cumulus_ecs_service module (found on the Cumulus releases page) configured to provide the QueueGranules task:

    module "queue_granules_service" {
    source = "https://github.com/nasa/cumulus/releases/download/{version}/terraform-aws-cumulus-ecs-service.zip"

    prefix = var.prefix
    name = "QueueGranules"

    cluster_arn = module.cumulus.ecs_cluster_arn
    desired_count = 1
    image = "cumuluss/cumulus-ecs-task:1.7.0"

    cpu = 400
    memory_reservation = 700

    environment = {
    AWS_DEFAULT_REGION = data.aws_region.current.name
    }
    command = [
    "cumulus-ecs-task",
    "--activityArn",
    aws_sfn_activity.queue_granules.id,
    "--lambdaArn",
    module.cumulus.queue_granules_task.task_arn,
    "--lastModified",
    module.cumulus.queue_granules_task.last_modified_date
    ]
    alarms = {
    MemoryUtilizationHigh = {
    comparison_operator = "GreaterThanThreshold"
    evaluation_periods = 1
    metric_name = "MemoryUtilization"
    statistic = "SampleCount"
    threshold = 75
    }
    }
    }

    Please note: If you have updated the code for the Lambda specified by --lambdaArn, you will have to manually restart the tasks in your ECS service before invocation of the Step Function activity will use the updated Lambda code.

• An updated Discover Granules workflow to utilize the new resource (the Resource key in the QueueGranules step has been updated to the following):

"Resource": "${aws_sfn_activity.queue_granules.id}"

    If you then run this workflow in place of the DiscoverGranules workflow, the QueueGranules step would run as an ECS task instead of a lambda.

    Final note

    Step Function Activities and AWS Lambda are not the only ways to run tasks in an AWS Step Function. Learn more about other service integrations, including direct ECS integration via the AWS Service Integrations page.

Science Investigator-led Processing Systems (SIPS)

…we're just going to create a onetime throw-away rule that will be easy to test with. This rule will kick off the DiscoverAndQueuePdrs workflow, which is the beginning of a Cumulus SIPS workflow:

    Screenshot of a Cumulus rule configuration
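For reference, a onetime rule like the one shown in the screenshot above can be expressed as JSON along the following lines; the rule name is illustrative, and the provider and collection must match the ones you configured.

{
  "name": "sips_test_rule",
  "workflow": "DiscoverAndQueuePdrs",
  "provider": "s3_provider",
  "collection": {
    "name": "MOD09GQ",
    "version": "006"
  },
  "rule": {
    "type": "onetime"
  },
  "state": "ENABLED"
}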

Note: A list of configured workflows exists under "Workflows" in the navigation bar on the Cumulus dashboard. Additionally, one can find a list of executions and their respective status in the "Executions" tab in the navigation bar.

    DiscoverAndQueuePdrs Workflow

    This workflow will discover PDRs and queue them to be processed. Duplicate PDRs will be dealt with according to the configured duplicate handling setting in the collection. The lambdas below are included in the cumulus terraform module for use in your workflows:

    1. DiscoverPdrs - source
    2. QueuePdrs - source
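The duplicate handling mentioned above is driven by the collection's duplicateHandling setting. As a minimal, illustrative fragment (all other required collection fields omitted), a collection configured to replace duplicates might include:

{
  "name": "MOD09GQ",
  "version": "006",
  "duplicateHandling": "replace"
}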

    Screenshot of execution graph for discover and queue PDRs workflow in the AWS Step Functions console

    An example workflow module configuration can be viewed in the Cumulus source for the discover_and_queue_pdrs_workflow.

Please note: To use this example workflow module as a template for a new workflow in your deployment, the source key for the workflow module would need to point to a release of the cumulus-workflow (terraform-aws-cumulus-workflow.zip) module on our release page, as all of the provided Cumulus workflows are internally self-referential.

    ParsePdr Workflow

    The ParsePdr workflow will parse a PDR, queue the specified granules (duplicates are handled according to the duplicate handling setting) and periodically check the status of those queued granules. This workflow will not succeed until all the granules included in the PDR are successfully ingested. If one of those fails, the ParsePdr workflow will fail. NOTE that ParsePdr may spin up multiple IngestGranule workflows in parallel, depending on the granules included in the PDR.

    The lambdas below are included in the cumulus terraform module for use in your workflows:

    1. ParsePdr - source
    2. QueueGranules - source
    3. CheckStatus - source

    Screenshot of execution graph for SIPS Parse PDR workflow in AWS Step Functions console

    An example workflow module configuration can be viewed in the Cumulus source for the parse_pdr_workflow.

Please note: To use this example workflow module as a template for a new workflow in your deployment, the source key for the workflow module would need to point to a release of the cumulus-workflow (terraform-aws-cumulus-workflow.zip) module on our release page, as all of the provided Cumulus workflows are internally self-referential.

    IngestGranule Workflow

    The IngestGranule workflow processes and ingests a granule and posts the granule metadata to CMR.

    The lambdas below are included in the cumulus terraform module for use in your workflows:

    1. SyncGranule - source.
    2. CmrStep - source

Additionally, this workflow requires a processing step that you must provide. The ProcessingStep step in the workflow picture below is an example of a custom processing step.

    Note: Using the CmrStep is not required and can be left out of the processing trajectory if desired (for example, in testing situations).

    Screenshot of execution graph for SIPS IngestGranule workflow in AWS Step Functions console

    An example workflow module configuration can be viewed in the Cumulus source for the ingest_and_publish_granule_workflow.

Please note: To use this example workflow module as a template for a new workflow in your deployment, the source key for the workflow module would need to point to a release of the cumulus-workflow (terraform-aws-cumulus-workflow.zip) module on our release page, as all of the provided Cumulus workflows are internally self-referential.

    Summary

    In this cookbook we went over setting up a collection, rule, and provider for a SIPS workflow. Once we had the setup completed, we looked over the Cumulus workflows that participate in parsing PDRs, ingesting and processing granules, and updating CMR.

    Version: v11.1.0

    Throttling queued executions

In this entry, we will walk through how to create an SQS queue for scheduling executions, which will be used to limit those executions to a maximum concurrency, and how to configure our Cumulus workflows/rules to use this queue.

    We will also review the architecture of this feature and highlight some implementation notes.

    Limiting the number of executions that can be running from a given queue is useful for controlling the cloud resource usage of workflows that may be lower priority, such as granule reingestion or reprocessing campaigns. It could also be useful for preventing workflows from exceeding known resource limits, such as a maximum number of open connections to a data provider.

    Implementing the queue

    Create and deploy the queue

    Add a new queue

    In a .tf file for your Cumulus deployment, add a new SQS queue:

    resource "aws_sqs_queue" "background_job_queue" {
    name = "${var.prefix}-backgroundJobQueue"
    receive_wait_time_seconds = 20
    visibility_timeout_seconds = 60
    }

    Set maximum executions for the queue

    Define the throttled_queues variable for the cumulus module in your Cumulus deployment to specify the maximum concurrent executions for the queue.

    module "cumulus" {
    # ... other variables

    throttled_queues = [{
    url = aws_sqs_queue.background_job_queue.id,
    execution_limit = 5
    }]
    }

    Setup consumer for the queue

    Add the sqs2sfThrottle Lambda as the consumer for the queue and add a Cloudwatch event rule/target to read from the queue on a scheduled basis.

    Please note: You must use the sqs2sfThrottle Lambda as the consumer for any queue with a queue execution limit or else the execution throttling will not work correctly. Additionally, please allow at least 60 seconds after creation before using the queue while associated infrastructure and triggers are set up and made ready.

    aws_sqs_queue.background_job_queue.id refers to the queue resource defined above.

    resource "aws_cloudwatch_event_rule" "background_job_queue_watcher" {
    schedule_expression = "rate(1 minute)"
    }

    resource "aws_cloudwatch_event_target" "background_job_queue_watcher" {
    rule = aws_cloudwatch_event_rule.background_job_queue_watcher.name
    arn = module.cumulus.sqs2sfThrottle_lambda_function_arn
    input = jsonencode({
    messageLimit = 500
    queueUrl = aws_sqs_queue.background_job_queue.id
    timeLimit = 60
    })
    }

    resource "aws_lambda_permission" "background_job_queue_watcher" {
    action = "lambda:InvokeFunction"
    function_name = module.cumulus.sqs2sfThrottle_lambda_function_arn
    principal = "events.amazonaws.com"
    source_arn = aws_cloudwatch_event_rule.background_job_queue_watcher.arn
    }

    Re-deploy your Cumulus application

Follow the instructions to re-deploy your Cumulus application. After you have re-deployed, your workflow template will be updated to include information about the queue (the output below is partial output from an expected workflow template):

    {
    "cumulus_meta": {
    "queueExecutionLimits": {
    "<backgroundJobQueue_SQS_URL>": 5
    }
    }
    }

    Integrate your queue with workflows and/or rules

    Integrate queue with queuing steps in workflows

For any workflows using the QueueGranules or QueuePdrs tasks that should use your new queue, update the Cumulus configuration of those steps in your workflows.

    As seen in this partial configuration for a QueueGranules step, update the queueUrl to reference the new throttled queue:

    Note: ${ingest_granule_workflow_name} is an interpolated value referring to a Terraform resource. See the example deployment code for the DiscoverGranules workflow.

    {
    "QueueGranules": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "queueUrl": "${aws_sqs_queue.background_job_queue.id}",
    "provider": "{$.meta.provider}",
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "granuleIngestWorkflow": "${ingest_granule_workflow_name}"
    }
    }
    }
    }
    }

    Similarly, for a QueuePdrs step:

    Note: ${parse_pdr_workflow_name} is an interpolated value referring to a Terraform resource. See the example deployment code for the DiscoverPdrs workflow.

    {
    "QueuePdrs": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "queueUrl": "${aws_sqs_queue.background_job_queue.id}",
    "provider": "{$.meta.provider}",
    "collection": "{$.meta.collection}",
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "parsePdrWorkflow": "${parse_pdr_workflow_name}"
    }
    }
    }
    }
    }

    After making these changes, re-deploy your Cumulus application for the execution throttling to take effect on workflow executions queued by these workflows.

    Create/update a rule to use your new queue

    Create or update a rule definition to include a queueUrl property that refers to your new queue:

    {
    "name": "s3_provider_rule",
    "workflow": "DiscoverAndQueuePdrs",
    "provider": "s3_provider",
    "collection": {
    "name": "MOD09GQ",
    "version": "006"
    },
    "rule": {
    "type": "onetime"
    },
    "state": "ENABLED",
    "queueUrl": "<backgroundJobQueue_SQS_URL>" // configure rule to use your queue URL
    }

    After creating/updating the rule, any subsequent invocations of the rule should respect the maximum number of executions when starting workflows from the queue.

    Architecture

    Architecture diagram showing how executions started from a queue are throttled to a maximum concurrent limit

    Execution throttling based on the queue works by manually keeping a count (semaphore) of how many executions are running for the queue at a time. The key operation that prevents the number of executions from exceeding the maximum for the queue is that before starting new executions, the sqs2sfThrottle Lambda attempts to increment the semaphore and responds as follows:

    • If the increment operation is successful, then the count was not at the maximum and an execution is started
    • If the increment operation fails, then the count was already at the maximum so no execution is started
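The following is a conceptual sketch of that increment-then-start pattern, not the actual Cumulus implementation; it assumes a hypothetical DynamoDB table acting as the semaphore, with the running-execution count kept in a semvalue attribute.

// Conceptual sketch only -- NOT the actual Cumulus code.
const AWS = require('aws-sdk');
const docClient = new AWS.DynamoDB.DocumentClient();

async function tryStartExecution(queueUrl, maximum, startExecution) {
  try {
    // Atomically add 1 to the count for this queue, but only if the
    // current count is below the queue's maximum.
    await docClient.update({
      TableName: 'example-semaphores-table', // hypothetical table name
      Key: { key: queueUrl },
      UpdateExpression: 'ADD semvalue :one',
      ConditionExpression: 'attribute_not_exists(semvalue) OR semvalue < :max',
      ExpressionAttributeValues: { ':one': 1, ':max': maximum },
    }).promise();
  } catch (err) {
    // Increment failed: the count is already at the maximum, so no
    // execution is started and the message stays on the queue.
    if (err.code === 'ConditionalCheckFailedException') return false;
    throw err;
  }
  // Increment succeeded: the count was below the maximum, so start the execution.
  await startExecution();
  return true;
}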

    Final notes

    Limiting the number of concurrent executions for work scheduled via a queue has several consequences worth noting:

    • The number of executions that are running for a given queue will be limited to the maximum for that queue regardless of which workflow(s) are started.
    • If you use the same queue to schedule executions across multiple workflows/rules, then the limit on the total number of executions running concurrently will be applied to all of the executions scheduled across all of those workflows/rules.
    • If you are scheduling the same workflow both via a queue with a maxExecutions value and a queue without a maxExecutions value, only the executions scheduled via the queue with the maxExecutions value will be limited to the maximum.
Tracking Ancillary Files

The UMM-G column reflects the RelatedURL's Type derived from the CNM type, whereas the ECHO10 column shows how the CNM type affects the destination element.

CNM Type   | UMM-G RelatedUrl.Type                                            | ECHO10 Location
ancillary  | 'VIEW RELATED INFORMATION'                                       | OnlineResource
data       | 'GET DATA' (HTTPS URL) or 'GET DATA VIA DIRECT ACCESS' (S3 URI)  | OnlineAccessURL
browse     | 'GET RELATED VISUALIZATION'                                      | AssociatedBrowseImage
linkage    | 'EXTENDED METADATA'                                              | OnlineResource
metadata   | 'EXTENDED METADATA'                                              | OnlineResource
qa         | 'EXTENDED METADATA'                                              | OnlineResource

    Common Use Cases

    This section briefly documents some common use cases and the recommended configuration for the file. The examples shown here are for the DiscoverGranules use case, which allows configuration at the Cumulus dashboard level. The other two cases covered in the ancillary metadata documentation require configuration at the provider notification level (either CNM message or PDR) and are not covered here.

    Configuring browse imagery:

    {
    "bucket": "public",
    "regex": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\_[\\d]{1}.jpg$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_1.jpg",
    "type": "browse"
    }

    Configuring a documentation entry:

    {
    "bucket": "protected",
    "regex": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\_README.pdf$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_README.pdf",
    "type": "metadata"
    }

    Configuring other associated files (use types metadata or qa as appropriate):

    {
    "bucket": "protected",
    "regex": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\_QA.txt$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_QA.txt",
    "type": "qa"
    }
    Version: v11.1.0

    API Gateway Logging

    Enabling API Gateway logging

    In order to enable distribution API Access and execution logging, configure the TEA deployment by setting log_api_gateway_to_cloudwatch on the thin_egress_app module:

    log_api_gateway_to_cloudwatch = true
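In context, this setting sits alongside your other TEA settings in the thin_egress_app module block, for example (a minimal sketch; all other required module variables are omitted):

module "thin_egress_app" {
  # ... your existing thin_egress_app configuration ...

  log_api_gateway_to_cloudwatch = true
}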

    This enables the distribution API to send its logs to the default CloudWatch location: API-Gateway-Execution-Logs_<RESTAPI_ID>/<STAGE>

    Configure Permissions for API Gateway Logging to CloudWatch

    Instructions for enabling account level logging from API Gateway to CloudWatch

This is a one-time operation that must be performed on each AWS account to allow API Gateway to push logs to CloudWatch.

    Create a policy document

    The AmazonAPIGatewayPushToCloudWatchLogs managed policy, with an ARN of arn:aws:iam::aws:policy/service-role/AmazonAPIGatewayPushToCloudWatchLogs, has all the required permissions to enable API Gateway logging to CloudWatch. To grant these permissions to your account, first create an IAM role with apigateway.amazonaws.com as its trusted entity.

    Save this snippet as apigateway-policy.json.

    {
    "Version": "2012-10-17",
    "Statement": [
    {
    "Sid": "",
    "Effect": "Allow",
    "Principal": {
    "Service": "apigateway.amazonaws.com"
    },
    "Action": "sts:AssumeRole"
    }
    ]
    }

    Create an account role to act as ApiGateway and write to CloudWatchLogs

    NASA users in NGAP: be sure to use your account's permission boundary.

    aws iam create-role \
    --role-name ApiGatewayToCloudWatchLogs \
    [--permissions-boundary <permissionBoundaryArn>] \
    --assume-role-policy-document file://apigateway-policy.json

    Note the ARN of the returned role for the last step.

    Attach correct permissions to role

    Next attach the AmazonAPIGatewayPushToCloudWatchLogs policy to the IAM role.

    aws iam attach-role-policy \
    --role-name ApiGatewayToCloudWatchLogs \
    --policy-arn "arn:aws:iam::aws:policy/service-role/AmazonAPIGatewayPushToCloudWatchLogs"

    Update Account API Gateway settings with correct permissions

    Finally, set the IAM role ARN on the cloudWatchRoleArn property on your API Gateway Account settings.

    aws apigateway update-account \
    --patch-operations op='replace',path='/cloudwatchRoleArn',value='<ApiGatewayToCloudWatchLogs ARN>'

    Configure API Gateway CloudWatch Logs Delivery

    See Configure Cloudwatch Logs Delivery

Choosing and configuring your RDS database

…using this module to create your RDS cluster, you can configure the autoscaling timeout action, the cluster minimum and maximum capacity, and more as seen in the supported variables for the module.

    Unfortunately, Terraform currently doesn't allow specifying the autoscaling timeout itself, so that value will have to be manually configured in the AWS console or CLI.

    Version: v11.1.0

    Configure Cloudwatch Logs Delivery

    As an optional configuration step, it is possible to deliver CloudWatch logs to a cross-account shared AWS::Logs::Destination. An operator does this by configuring the cumulus module for your deployment as shown below. The value of the log_destination_arn variable is the ARN of a writeable log destination.

    The value can be either an AWS::Logs::Destination or a Kinesis Stream ARN to which your account can write.

    log_destination_arn           = arn:aws:[kinesis|logs]:us-east-1:123456789012:[streamName|destination:logDestinationName]

    Logs Sent

By default, the following logs will be sent to the destination when one is given.

    • Ingest logs
    • Async Operation logs
    • Thin Egress App API Gateway logs (if configured)

    Additional Logs

    If additional logs are needed, you can configure additional_log_groups_to_elk with the Cloudwatch log groups you want to send to the destination. additional_log_groups_to_elk is a map with the key as a descriptor and the value with the Cloudwatch log group name.

    additional_log_groups_to_elk = {
    "HelloWorldTask" = "/aws/lambda/cumulus-example-HelloWorld"
    "MyCustomTask" = "my-custom-task-log-group"
    }
Component-based Cumulus Deployment

…Terraform at the same time.

    With remote state, Terraform writes the state data to a remote data store, which can then be shared between all members of a team.

    The recommended approach for handling remote state with Cumulus is to use the S3 backend. This backend stores state in S3 and uses a DynamoDB table for locking.

    See the deployment documentation for a walk-through of creating resources for your remote state using an S3 backend.

    Version: v11.1.0

    Creating an S3 Bucket

    Buckets can be created on the command line with AWS CLI or via the web interface on the AWS console.

    When creating a protected bucket (a bucket containing data which will be served through the distribution API), make sure to enable S3 server access logging. See S3 Server Access Logging for more details.
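As one possible way to enable server access logging from the command line (the bucket and target names below are illustrative; see the linked documentation for the recommended target configuration):

aws s3api put-bucket-logging \
  --bucket foobar-protected \
  --bucket-logging-status '{"LoggingEnabled": {"TargetBucket": "foobar-internal", "TargetPrefix": "foobar-protected-logs/"}}'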

    Command line

Using the AWS CLI s3api create-bucket subcommand:

    $ aws s3api create-bucket \
    --bucket foobar-internal \
    --region us-west-2 \
    --create-bucket-configuration LocationConstraint=us-west-2
    {
    "Location": "/foobar-internal"
    }

    Note: The region and create-bucket-configuration arguments are only necessary if you are creating a bucket outside of the us-east-1 region.

Please note that security settings and other bucket options can be set via the options listed in the s3api documentation.

    Repeat the above step for each bucket to be created.

    Web interface

    See: AWS "Creating a Bucket" documentation

    Version: v11.1.0

    Using the Cumulus Distribution API

    The Cumulus Distribution API is a set of endpoints that can be used to enable AWS Cognito authentication when downloading data from S3.

    Configuring a Cumulus Distribution deployment

    The Cumulus Distribution API is included in the main Cumulus repo. It is available as part of the terraform-aws-cumulus.zip archive in the latest release.

    These steps assume you're using the Cumulus Deployment Template but can also be used for custom deployments.

    To configure a deployment to use Cumulus Distribution:

    1. Remove or comment the "Thin Egress App Settings" in the Cumulus Template Deploy and enable the Cumulus Distribution settings.
    2. Delete or comment the contents of thin_egress_app.tf and the corresponding Thin Egress App outputs in outputs.tf. These are not necessary for a Cumulus Distribution deployment.
    3. Uncomment the Cumulus Distribution outputs in outputs.tf.
    4. Rename cumulus-template-deploy/cumulus-tf/cumulus_distribution.tf.example to cumulus-template-deploy/cumulus-tf/cumulus_distribution.tf.

    Cognito Application and User Credentials

    The major prerequisite for using the Cumulus Distribution API is to set up Cognito. If operating within NGAP, this should already be done for you. If operating outside of NGAP, you must set up Cognito yourself, which is beyond the scope of this documentation.

    Given that Cognito is set up, in order to be able to download granule files via the Cumulus Distribution API, you must obtain Cognito user credentials, because any attempt to download such files (that will be, or have been, published to the CMR via your Cumulus deployment) will result in a prompt for you to supply Cognito user credentials. To obtain your own user credentials, talk to your product owner or scrum master for additional information. They should either know how to create the credentials, know who can create them for the team, or be the liaison to the Cognito team.

    Further, whoever helps to obtain your Cognito user credentials should also be able to supply you with the values for the following new variables that you must add to your cumulus-tf/terraform.tfvars file:

    • csdap_host_url: The URL of the Cognito service to which your Cumulus deployment will make Cognito API calls during a distribution (download) event
    • csdap_client_id: The client ID for the Cumulus application registered within the Cognito service
    • csdap_client_password: The client password for the Cumulus application registered within the Cognito service

    Although you might have to wait a bit for your Cognito user credentials, the remaining instructions do not depend upon having them, so you may continue with these instructions while waiting for your credentials.

    Cumulus Distribution URL

    Your Cumulus Distribution URL is used by Cumulus to generate download URLs as part of the granule metadata generated and published to the CMR. For example, a granule download URL will be of the form <distribution url>/<protected bucket>/<key> (or <distribution url>/path/to/file, if using a custom bucket map, as explained further below).

    By default, the value of your distribution URL is the URL of your private Cumulus Distribution API Gateway (the API Gateway named <prefix>-distribution, once you deploy the Cumulus Distribution module). Therefore, by default, the generated download URLs are private, and thus inaccessible directly, but there are 2 ways to address this issue (both of which are detailed below): (a) use tunneling (typically in development) or (b) put a CloudFront URL in front of your API Gateway (typically in production, and perhaps UAT and/or SIT).

    In either case, you must first know the default URL (i.e., the URL for the private Cumulus Distribution API Gateway). In order to obtain this default URL, you must first deploy your cumulus-tf module with the new Cumulus Distribution module, and once your initial deployment is complete, one of the Terraform outputs will be cumulus_distribution_api_uri, which is the URL for the private API Gateway.

    You may override this default URL by adding a cumulus_distribution_url variable to your cumulus-tf/terraform.tfvars file, and setting it to one of the following values (both of which are explained below):

    1. The default URL, but with a port added to it, in order to allow you to configure tunneling (typically only in development)
    2. A CloudFront URL placed in front of your Cumulus Distribution API Gateway (typically only for Production, but perhaps also for a UAT or SIT environment)

    The following subsections explain these approaches, in turn.

    Using your Cumulus Distribution API Gateway URL as your distribution URL

    Since your Cumulus Distribution API Gateway URL is private, the only way you can use it to confirm that your integration with Cognito is working is by using tunneling (again, generally for development), as described here. Here is an outline of the required steps, with details provided further below:

    1. Create/import a key pair into your AWS EC2 service (if you haven't already done so)
    2. Add a reference to the name of the key pair to your Terraform variables (we'll set the key_name Terraform variable)
    3. Choose an open local port on your machine (we'll use 9000 in the following details)
    4. Add a reference to the value of your cumulus_distribution_api_uri (mentioned earlier), including your chosen port (we'll set the cumulus_distribution_url Terraform variable)
    5. Redeploy Cumulus
    6. Add an entry to your /etc/hosts file
    7. Add a redirect URI to Cognito, via the Cognito API
    8. Install the Session Manager Plugin for the AWS CLI (if you haven't already done so; assuming you have already installed the AWS CLI)
    9. Add a sample file to S3 to test downloading via Cognito

To create a new key pair or import an existing one, you can use the AWS CLI (see aws ec2 import-key-pair) or the AWS Console (see Amazon EC2 key pairs and Linux instances).

    Once your key pair is added to AWS, add the following to your cumulus-tf/terraform.tfvars file:

    key_name = "<name>"
    cumulus_distribution_url = "https://<id>.execute-api.<region>.amazonaws.com:<port>/dev/"

    where:

    • <name> is the name of the key pair you just added to AWS
    • <id> and <region> are the corresponding parts from your cumulus_distribution_api_uri output variable
    • <port> is your open local port of choice (9000 is typically a good choice)

    Once you save your variable changes, redeploy your cumulus-tf module.

    While your deployment runs, add the following entry to your /etc/hosts file, replacing <hostname> with the host name of the cumulus_distribution_url Terraform variable you just added above:

    localhost <hostname>

    Next, you'll need to use the Cognito API to add the value of your cumulus_distribution_url Terraform variable as a Cognito redirect URI. To do so, use your favorite tool (e.g., curl, wget, Postman, etc.) to make a BasicAuth request to the Cognito API, using the following details:

    • method: POST
    • base URL: the value of your csdap_host_url Terraform variable
    • path: /authclient/updateRedirectUri
    • username: the value of your csdap_client_id Terraform variable
    • password: the value of your csdap_client_password Terraform variable
    • headers: Content-Type='application/x-www-form-urlencoded'
    • body: redirect_uri=<cumulus_distribution_url>/login

    where <cumulus_distribution_url> is the value of your cumulus_distribution_url Terraform variable. Note the /login path at the end of the redirect_uri value.
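For example, the request described above could be made with curl as follows, substituting the bracketed values with the corresponding Terraform variable values:

curl -X POST "<csdap_host_url>/authclient/updateRedirectUri" \
  -u "<csdap_client_id>:<csdap_client_password>" \
  -H "Content-Type: application/x-www-form-urlencoded" \
  -d "redirect_uri=<cumulus_distribution_url>/login"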

    For reference, see the Cognito Authentication Service API.

    Next, install the Session Manager Plugin for the AWS CLI. If running on macOS, and you use Homebrew, you can install it simply as follows:

    brew install --cask session-manager-plugin --no-quarantine

    As your final setup step, add a sample file to one of the protected buckets listed in your buckets Terraform variable in your cumulus-tf/terraform.tfvars file. The key for the S3 object doesn't matter, nor does it matter what file you use. All that matters is that the file is an S3 object in one of your protected buckets, because Cognito is triggered when attempting to download from one of those buckets.

    At this point, you should be ready to open a tunnel and attempt to download your sample file via your browser, summarized as follows:

    1. Determine your ec2 instance ID
    2. Connect to the NASA VPN
    3. Start an AWS SSM session
    4. Open an ssh tunnel
    5. Use a browser to navigate to your file

To determine your ec2 instance ID for your Cumulus deployment, run the following command, where <profile> is the name of the appropriate AWS profile to use, and <prefix> is the value of your prefix Terraform variable:

    aws --profile <profile> ec2 describe-instances --filters Name=tag:Deployment,Values=<prefix> Name=instance-state-name,Values=running --query "Reservations[0].Instances[].InstanceId" --output text

    IMPORTANT: Before proceeding with the remaining steps, make sure you're connected to the NASA VPN.

    Use the value output from the command above in place of <id> in the following command, which will start an SSM session:

    aws ssm start-session --target <id> --document-name AWS-StartPortForwardingSession --parameters portNumber=22,localPortNumber=6000

    If successful, you should see output similar to the following:

    Starting session with SessionId: NGAPShApplicationDeveloper-***
    Port 6000 opened for sessionId NGAPShApplicationDeveloper-***.
    Waiting for connections...

    Open another terminal window, and open a tunnel with port forwarding, using your chosen port from above (e.g., 9000):

    ssh -4 -p 6000 -N -L <port>:<api-gateway-host>:443 ec2-user@127.0.0.1

    where:

    • <port> is the open local port you chose earlier (e.g., 9000)
    • <api-gateway-host> is the hostname of your private API Gateway (i.e., the host portion of the URL you used as the value of your cumulus_distribution_url Terraform variable above)

    Finally, use your chosen browser to navigate to <cumulus_distribution_url>/<bucket>/<key>, where <bucket> and <key> reference the sample file you added to S3 above.

    If all goes well, you should be prompted for your Cognito username and password. If you have obtained your Cognito user credentials, enter them, followed by entering a code generated by the authenticator application you registered at the time you completed your Cognito registration process. Once your credentials and auth code are correctly supplied, after a few moments, the download process will begin.

    Once you're finished testing, clean up as follows:

    1. Kill your ssh tunnel (Ctrl-C)
    2. Kill your AWS SSM session (Ctrl-C)
3. If you like, disconnect from the NASA VPN

    While this is a relatively lengthy process, things are much easier when using CloudFront, such as in Production (OPS), SIT, or UAT, as explained next.

    Using a CloudFront URL as your distribution URL

    In Production (OPS), and perhaps in other environments, such as UAT and SIT, you'll need to provide a publicly accessible URL for users to use for downloading (distributing) granule files.

    This is generally done by placing a CloudFront URL in front of your private Cumulus Distribution API Gateway. In order to create such a CloudFront URL, contact the person who helped you obtain your Cognito credentials, and request a CloudFront URL with the following details:

    • The private, backing URL, which is the value of your cumulus_distribution_api_uri Terraform output value
    • A request to add the AWS account's VPC to the whitelist

    Once this request is completed, and you obtain the new CloudFront URL, override your default distribution URL with the CloudFront URL by adding the following to your cumulus-tf/terraform.tfvars file:

    cumulus_distribution_url = <cloudfront_url>

    In addition, add a Cognito redirect URI, as detailed in the previous section. Note that in this case, the value you'll use for redirect_uri is <cloudfront_url>/login since the value of your cumulus_distribution_url is now your CloudFront URL.

    At this point, it is assumed that you have added the appropriate values for this environment for the variables described at the top (csdap_host_url, csdap_client_id, and csdap_client_password).

    Redeploy Cumulus with your new/updated Terraform variables.

    As your final setup step, add a sample file to one of the protected buckets listed in your buckets Terraform variable in your cumulus-tf/terraform.tfvars file. The key for the S3 object doesn't matter, nor does it matter what file you use. All that matters is that the file is an S3 object in one of your protected buckets, because Cognito is triggered when attempting to download from one of those buckets.

    Finally, use your chosen browser to navigate to <cumulus_distribution_url>/<bucket>/<key>, where <bucket> and <key> reference the sample file you added to S3.

    If all goes well, you should be prompted for your Cognito username and password. If you have obtained your Cognito user credentials, enter them, followed by entering a code generated by the authenticator application you registered at the time you completed your Cognito registration process. Once your credentials and auth code are correctly supplied, after a few moments, the download process will begin.

    S3 Bucket Mapping

    An S3 Bucket map allows users to abstract bucket names. If the bucket names change at any point, only the bucket map would need to be updated instead of every S3 link.

    The Cumulus Distribution API uses a bucket_map.yaml or bucket_map.yaml.tmpl file to determine which buckets to serve. See the examples.

    The default Cumulus module generates a file at s3://${system_bucket}/distribution_bucket_map.json.

The configuration file is a simple JSON mapping of the form:

    {
    "daac-public-data-bucket": "/path/to/this/kind/of/data"
    }

    Note: Cumulus only supports a one-to-one mapping of bucket -> Cumulus Distribution path for 'distribution' buckets. Also, the bucket map must include mappings for all of the protected and public buckets specified in the buckets variable in cumulus-tf/terraform.tfvars, otherwise Cumulus may not be able to determine the correct distribution URL for ingested files and you may encounter errors.

    Switching from the Thin Egress App to Cumulus Distribution

    If you have previously deployed the Thin Egress App (TEA) as your distribution app, you can switch to Cumulus Distribution by following the steps above.

    Note, however, that the cumulus_distribution module will generate a bucket map cache and overwrite any existing bucket map caches created by TEA.

    There will also be downtime while your API gateway is updated.

How to Deploy Cumulus

…for deployment's EC2 instances and allows you to connect to them via SSH/SSM.

    Consider the sizing of your Cumulus instance when configuring your variables.

    Choose a distribution API

    Cumulus can be configured to use either the Thin Egress App (TEA) or the Cumulus Distribution API. The default selection is the Thin Egress App if you're using the Deployment Template.

    IMPORTANT! If you already have a deployment using the TEA distribution and want to switch to Cumulus Distribution, there will be an API Gateway change. This means that there will be downtime while you update your CloudFront endpoint to use the new API gateway.

    Configure the Thin Egress App

    The Thin Egress App can be used for Cumulus distribution and is the default selection. It allows authentication using Earthdata Login. Follow the steps in the documentation to configure distribution in your cumulus-tf deployment.

    Configure the Cumulus Distribution API (optional)

    If you would prefer to use the Cumulus Distribution API, which supports AWS Cognito authentication, follow these steps to configure distribution in your cumulus-tf deployment.

    Initialize Terraform

Follow the above instructions to initialize Terraform using terraform init [1].

    Deploy

    Run terraform apply to deploy the resources. Type yes when prompted to confirm that you want to create the resources. Assuming the operation is successful, you should see output like this:

    Apply complete! Resources: 292 added, 0 changed, 0 destroyed.

    Outputs:

    archive_api_redirect_uri = https://abc123.execute-api.us-east-1.amazonaws.com/dev/token
    archive_api_uri = https://abc123.execute-api.us-east-1.amazonaws.com/dev/
    distribution_redirect_uri = https://abc123.execute-api.us-east-1.amazonaws.com/DEV/login
    distribution_url = https://abc123.execute-api.us-east-1.amazonaws.com/DEV/

    Note: Be sure to copy the redirect URLs, as you will use them to update your Earthdata application.

    Update Earthdata Application

You will need to add two redirect URLs to your Earthdata Login application.

    1. Login to URS.
    2. Under My Applications -> Application Administration -> use the edit icon of your application.
3. Under Manage -> redirect URIs, add the Archive API URL returned from the stack deployment
  • e.g. archive_api_redirect_uri = https://<czbbkscuy6>.execute-api.us-east-1.amazonaws.com/dev/token.
4. Also add the Distribution URL
  • e.g. distribution_redirect_uri = https://<kido2r7kji>.execute-api.us-east-1.amazonaws.com/dev/login [2].
5. You may delete the placeholder URL you used to create the application.

If you've lost track of the needed redirect URIs, they can be located in the API Gateway console. Once there, select <prefix>-archive and/or <prefix>-thin-egress-app-EgressGateway, go to Dashboard, and use the base URL at the top of the page that is accompanied by the text Invoke this API at:. Make sure to append /token for the archive URL and /login for the thin egress app URL.


    Deploy Cumulus dashboard

    Dashboard Requirements

    Please note that the requirements are similar to the Cumulus stack deployment requirements. The installation instructions below include a step that will install/use the required node version referenced in the .nvmrc file in the dashboard repository.

    Prepare AWS

    Create S3 bucket for dashboard:

    • Create it, e.g. <prefix>-dashboard. Use the command line or console as you did when preparing AWS configuration.
    • Configure the bucket to host a website:
      • AWS S3 console: Select <prefix>-dashboard bucket then, "Properties" -> "Static Website Hosting", point to index.html
      • CLI: aws s3 website s3://<prefix>-dashboard --index-document index.html
• The bucket's URL will be http://<prefix>-dashboard.s3-website-<region>.amazonaws.com or you can find it on the AWS console via "Properties" -> "Static website hosting" -> "Endpoint"
    • Ensure the bucket's access permissions allow your deployment user access to write to the bucket

    Install dashboard

    To install the dashboard, clone the Cumulus dashboard repository into the root deploy directory and install dependencies with npm install:

      git clone https://github.com/nasa/cumulus-dashboard
    cd cumulus-dashboard
    nvm use
    npm install

    If you do not have the correct version of node installed, replace nvm use with nvm install $(cat .nvmrc) in the above example.

    Dashboard versioning

    By default, the master branch will be used for dashboard deployments. The master branch of the dashboard repo contains the most recent stable release of the dashboard.

    If you want to test unreleased changes to the dashboard, use the develop branch.

    Each release/version of the dashboard will have a tag in the dashboard repo. Release/version numbers will use semantic versioning (major/minor/patch).

    To checkout and install a specific version of the dashboard:

      git fetch --tags
    git checkout <version-number> # e.g. v1.2.0
    nvm use
    npm install

    If you do not have the correct version of node installed, replace nvm use with nvm install $(cat .nvmrc) in the above example.

    Building the dashboard

    Note: These environment variables are available during the build: APIROOT, DAAC_NAME, STAGE, HIDE_PDR. Any of these can be set on the command line to override the values contained in config.js when running the build below.

To configure your dashboard for deployment, set the APIROOT environment variable to your app's API root [3].

    Build the dashboard from the dashboard repository root directory, cumulus-dashboard:

      APIROOT=<your_api_root> npm run build

    Dashboard deployment

    Deploy dashboard to s3 bucket from the cumulus-dashboard directory:

    Using AWS CLI:

      aws s3 sync dist s3://<prefix>-dashboard --acl public-read

    From the S3 Console:

    • Open the <prefix>-dashboard bucket, click 'upload'. Add the contents of the 'dist' subdirectory to the upload. Then select 'Next'. On the permissions window allow the public to view. Select 'Upload'.

You should be able to visit the dashboard website at http://<prefix>-dashboard.s3-website-<region>.amazonaws.com or find the URL via <prefix>-dashboard -> "Properties" -> "Static website hosting" -> "Endpoint", and log in with a user that you configured for access in the Configure and Deploy the Cumulus Stack step.


    Cumulus Instance Sizing

The Cumulus deployment's default sizing for Elasticsearch instances, EC2 instances, and Autoscaling Groups is small and designed for testing and cost savings. The default settings are likely not suitable for production workloads. Sizing is highly individual and dependent on expected load and archive size.

    Please be cognizant of costs as any change in size will affect your AWS bill. AWS provides a pricing calculator for estimating costs.

    Elasticsearch

    The mappings file contains all of the data types that will be indexed into Elasticsearch. Elasticsearch sizing is tied to your archive size, including your collections, granules, and workflow executions that will be stored.

    AWS provides documentation on calculating and configuring for sizing.

    In addition to size you'll want to consider the number of nodes which determine how the system reacts in the event of a failure.

    Configuration can be done in the data persistence module in elasticsearch_config and the cumulus module in es_index_shards.

    If you make changes to your Elasticsearch configuration you will need to reindex for those changes to take effect.

    EC2 instances and autoscaling groups

EC2 instances are used for long-running operations (e.g. generating a reconciliation report) and long-running workflow tasks. Configuration for your ECS cluster is achieved via Cumulus deployment variables.

    When configuring your ECS cluster consider:

    • The EC2 instance type and EBS volume size needed to accommodate your workloads. Configured as ecs_cluster_instance_type and ecs_cluster_instance_docker_volume_size.
    • The minimum and desired number of instances on hand to accommodate your workloads. Configured as ecs_cluster_min_size and ecs_cluster_desired_size.
    • The maximum number of instances you will need and are willing to pay for to accommodate your heaviest workloads. Configured as ecs_cluster_max_size.
    • Your autoscaling parameters: ecs_cluster_scale_in_adjustment_percent, ecs_cluster_scale_out_adjustment_percent, ecs_cluster_scale_in_threshold_percent, and ecs_cluster_scale_out_threshold_percent.

    Footnotes


    1. Run terraform init if:

      • This is the first time deploying the module
      • You have added any additional child modules, including Cumulus components
      • You have updated the source for any of the child modules

2. To add additional redirect URIs to your application: on the Earthdata home page, select "My Applications", scroll down to "Application Administration" and use the edit icon for your application, then go to Manage -> Redirect URIs.

    3. The API root can be found a number of ways. The easiest is to note it in the output of the app deployment step. But you can also find it from the AWS console -> Amazon API Gateway -> APIs -> <prefix>-archive -> Dashboard, and reading the URL at the top after "Invoke this API at"

PostgreSQL Database Deployment

…cumulus-rds-tf that will deploy an AWS RDS Aurora Serverless PostgreSQL 10.2 compatible database cluster, and optionally provision a single deployment database with credentialed secrets for use with Cumulus.

We have provided an example terraform deployment using this module in the Cumulus template-deploy repository on GitHub.

    Use of this example involves:

    • Creating/configuring a Terraform module directory
    • Using Terraform to deploy resources to AWS

    Requirements

    Configuration/installation of this module requires the following:

    • Terraform
    • git
    • A VPC configured for use with Cumulus Core. This should match the subnets you provide when Deploying Cumulus to allow Core's lambdas to properly access the database.
    • At least two subnets across multiple AZs. These should match the subnets you provide as configuration when Deploying Cumulus, and should be within the same VPC.

    Needed Git Repositories

    Assumptions

    OS/Environment

    The instructions in this module require Linux/MacOS. While deployment via Windows is possible, it is unsupported.

    Terraform

    This document assumes knowledge of Terraform. If you are not comfortable working with Terraform, the following links should bring you up to speed:

    For Cumulus specific instructions on installation of Terraform, refer to the main Cumulus Installation Documentation

    Aurora/RDS

    This document also assumes some basic familiarity with PostgreSQL databases and Amazon Aurora/RDS. If you're unfamiliar, consider perusing the AWS docs and the Aurora Serverless V1 docs.

    Prepare deployment repository

    If you already are working with an existing repository that has a configured rds-cluster-tf deployment for the version of Cumulus you intend to deploy or update, or just need to configure this module for your repository, skip to Prepare AWS configuration.

    Clone the cumulus-template-deploy repo and name appropriately for your organization:

      git clone https://github.com/nasa/cumulus-template-deploy <repository-name>

    We will return to configuring this repo and using it for deployment below.

    Optional: Create a new repository

    Create a new repository on GitHub so that you can add your workflows and other modules to source control:

      git remote set-url origin https://github.com/<org>/<repository-name>
    git push origin master

    You can then add/commit changes as needed.

    Note: If you are pushing your deployment code to a git repo, make sure to add terraform.tf and terraform.tfvars to .gitignore, as these files will contain sensitive data related to your AWS account.
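
    For example, a minimal .gitignore for the deployment repository might contain:

    # .gitignore
    terraform.tf
    terraform.tfvars
    .terraform/    # local Terraform working directory (commonly ignored as well)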


    Prepare AWS configuration

    To deploy this module, make sure that you have completed the prerequisite steps from the Cumulus deployment instructions (configuring your AWS credentials and creating the Terraform state resources) in similar fashion for this module.

    Configure and deploy the module

    When configuring this module, please keep in mind that unlike the Cumulus deployment, this module should be deployed once to create the database cluster, and re-deployed thereafter only to make changes to that configuration, upgrade the cluster, etc. This module does not need to be re-deployed for each Core update.

    These steps should be executed in the rds-cluster-tf directory of the template deploy repo that you previously cloned. Run the following to copy the example files:

    cd rds-cluster-tf/
    cp terraform.tf.example terraform.tf
    cp terraform.tfvars.example terraform.tfvars

    In terraform.tf, configure the remote state settings by substituting the appropriate values for:

    • bucket
    • dynamodb_table
    • PREFIX (whatever prefix you've chosen for your deployment)
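
    For example, a filled-in terraform.tf might look like the following (bucket, key, and table names here are placeholders):

    terraform {
      backend "s3" {
        region         = "us-east-1"
        bucket         = "PREFIX-tf-state"
        key            = "PREFIX/rds-cluster/terraform.tfstate"
        dynamodb_table = "PREFIX-tf-locks"
      }
    }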

    Fill in the appropriate values in terraform.tfvars. See the rds-cluster-tf module variable definitions for more detail on all of the configuration options. A few notable configuration options are documented in the next section.

    Configuration Options

    • deletion_protection -- defaults to true. Set it to false if you want to be able to delete your cluster with a terraform destroy without manually updating the cluster.
    • db_admin_username -- cluster database administration username. Defaults to postgres.
    • db_admin_password -- required variable that specifies the admin user password for the cluster. To randomize this on each deployment, consider using a random_string resource as input.
    • region -- defaults to us-east-1.
    • subnets -- requires at least 2 across different AZs. For use with Cumulus, these AZs should match the values you configure for your lambda_subnet_ids.
    • max_capacity -- the max ACUs the cluster is allowed to use. Carefully consider cost/performance concerns when setting this value.
    • min_capacity -- the minimum ACUs the cluster will scale to
    • provision_user_database -- Optional flag to allow module to provision a user database in addition to creating the cluster. Described in the next section.
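
    Putting a few of these together, a minimal terraform.tfvars sketch might look like the following (values are placeholders; consult the module's variable definitions for the full list of options):

    prefix              = "my-cumulus-prefix"
    region              = "us-east-1"
    subnets             = ["subnet-xxxxxxxx", "subnet-yyyyyyyy"]
    deletion_protection = true
    db_admin_username   = "postgres"
    db_admin_password   = "change-me-to-a-real-password"
    min_capacity        = 2
    max_capacity        = 4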

    Provision user and user database

    If you wish for the module to provision a PostgreSQL database on your new cluster and provide a secret for access in the module output, in addition to managing the cluster itself, the following configuration keys are required:

    • provision_user_database -- must be set to true; this configures the module to deploy a lambda that will create the user database and update the provided configuration on deploy.
    • permissions_boundary_arn -- the permissions boundary to use when creating the roles the provisioning lambda needs. In most use cases this should be the same one used for the Cumulus Core deployment.
    • rds_user_password -- the value to set the user password to.
    • prefix -- this value will be used to set a unique identifier for the ProvisionDatabase lambda, as well as to name the provisioned user/database.
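
    Building on the terraform.tfvars sketch above, enabling user database provisioning would add keys along these lines (placeholder values again):

    provision_user_database  = true
    permissions_boundary_arn = "arn:aws:iam::123456789012:policy/YourPermissionsBoundary"
    rds_user_password        = "change-me-to-a-real-password"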

    Once configured, the module will deploy the lambda, and run it on each provision, creating the configured database if it does not exist, updating the user password if that value has been changed, and updating the output user database secret.

    Setting provision_user_database to false after provisioning will not result in removal of the configured database, as the lambda is non-destructive as configured in this module.

    Please Note: This functionality is limited in that it will only provision a single database/user and configure a basic database, and should not be used in scenarios where more complex configuration is required.

    Initialize Terraform

    Run terraform init

    You should see output like:

    * provider.aws: version = "~> 2.32"

    Terraform has been successfully initialized!

    Deploy

    Run terraform apply to deploy the resources.

    If re-applying this module, variables (e.g. engine_version, snapshot_identifier) that force a recreation of the database cluster may result in data loss if deletion protection is disabled. Examine the changeset carefully for resources that will be re-created/destroyed before applying.

    Review the changeset, and assuming it looks correct, type yes when prompted to confirm that you want to create all of the resources.

    Assuming the operation is successful, you should see output similar to the following (this example omits the creation of a user database/lambdas/security groups):

    terraform apply

    An execution plan has been generated and is shown below.
    Resource actions are indicated with the following symbols:
    + create

    Terraform will perform the following actions:

    # module.rds_cluster.aws_db_subnet_group.default will be created
    + resource "aws_db_subnet_group" "default" {
    + arn = (known after apply)
    + description = "Managed by Terraform"
    + id = (known after apply)
    + name = (known after apply)
    + name_prefix = "xxxxxxxxx"
    + subnet_ids = [
    + "subnet-xxxxxxxxx",
    + "subnet-xxxxxxxxx",
    ]
    + tags = {
    + "Deployment" = "xxxxxxxxx"
    }
    }

    # module.rds_cluster.aws_rds_cluster.cumulus will be created
    + resource "aws_rds_cluster" "cumulus" {
    + apply_immediately = true
    + arn = (known after apply)
    + availability_zones = (known after apply)
    + backup_retention_period = 1
    + cluster_identifier = "xxxxxxxxx"
    + cluster_identifier_prefix = (known after apply)
    + cluster_members = (known after apply)
    + cluster_resource_id = (known after apply)
    + copy_tags_to_snapshot = false
    + database_name = "xxxxxxxxx"
    + db_cluster_parameter_group_name = (known after apply)
    + db_subnet_group_name = (known after apply)
    + deletion_protection = true
    + enable_http_endpoint = true
    + endpoint = (known after apply)
    + engine = "aurora-postgresql"
    + engine_mode = "serverless"
    + engine_version = "10.12"
    + final_snapshot_identifier = "xxxxxxxxx"
    + hosted_zone_id = (known after apply)
    + id = (known after apply)
    + kms_key_id = (known after apply)
    + master_password = (sensitive value)
    + master_username = "xxxxxxxxx"
    + port = (known after apply)
    + preferred_backup_window = "07:00-09:00"
    + preferred_maintenance_window = (known after apply)
    + reader_endpoint = (known after apply)
    + skip_final_snapshot = false
    + storage_encrypted = (known after apply)
    + tags = {
    + "Deployment" = "xxxxxxxxx"
    }
    + vpc_security_group_ids = (known after apply)

    + scaling_configuration {
    + auto_pause = true
    + max_capacity = 4
    + min_capacity = 2
    + seconds_until_auto_pause = 300
    + timeout_action = "RollbackCapacityChange"
    }
    }

    # module.rds_cluster.aws_secretsmanager_secret.rds_login will be created
    + resource "aws_secretsmanager_secret" "rds_login" {
    + arn = (known after apply)
    + id = (known after apply)
    + name = (known after apply)
    + name_prefix = "xxxxxxxxx"
    + policy = (known after apply)
    + recovery_window_in_days = 30
    + rotation_enabled = (known after apply)
    + rotation_lambda_arn = (known after apply)
    + tags = {
    + "Deployment" = "xxxxxxxxx"
    }

    + rotation_rules {
    + automatically_after_days = (known after apply)
    }
    }

    # module.rds_cluster.aws_secretsmanager_secret_version.rds_login will be created
    + resource "aws_secretsmanager_secret_version" "rds_login" {
    + arn = (known after apply)
    + id = (known after apply)
    + secret_id = (known after apply)
    + secret_string = (sensitive value)
    + version_id = (known after apply)
    + version_stages = (known after apply)
    }

    # module.rds_cluster.aws_security_group.rds_cluster_access will be created
    + resource "aws_security_group" "rds_cluster_access" {
    + arn = (known after apply)
    + description = "Managed by Terraform"
    + egress = (known after apply)
    + id = (known after apply)
    + ingress = (known after apply)
    + name = (known after apply)
    + name_prefix = "cumulus_rds_cluster_access_ingress"
    + owner_id = (known after apply)
    + revoke_rules_on_delete = false
    + tags = {
    + "Deployment" = "xxxxxxxxx"
    }
    + vpc_id = "vpc-xxxxxxxxx"
    }

    # module.rds_cluster.aws_security_group_rule.rds_security_group_allow_PostgreSQL will be created
    + resource "aws_security_group_rule" "rds_security_group_allow_postgres" {
    + from_port = 5432
    + id = (known after apply)
    + protocol = "tcp"
    + security_group_id = (known after apply)
    + self = true
    + source_security_group_id = (known after apply)
    + to_port = 5432
    + type = "ingress"
    }

    Plan: 6 to add, 0 to change, 0 to destroy.

    Do you want to perform these actions?
    Terraform will perform the actions described above.
    Only 'yes' will be accepted to approve.

    Enter a value: yes

    module.rds_cluster.aws_db_subnet_group.default: Creating...
    module.rds_cluster.aws_security_group.rds_cluster_access: Creating...
    module.rds_cluster.aws_secretsmanager_secret.rds_login: Creating...

    Then, after the resources are created:

    Apply complete! Resources: X added, 0 changed, 0 destroyed.
    Releasing state lock. This may take a few moments...

    Outputs:

    admin_db_login_secret_arn = arn:aws:secretsmanager:us-east-1:xxxxxxxxx:secret:xxxxxxxxxx20210407182709367700000002-dpmdR
    admin_db_login_secret_version = xxxxxxxxx
    rds_endpoint = xxxxxxxxx.us-east-1.rds.amazonaws.com
    security_group_id = xxxxxxxxx
    user_credentials_secret_arn = arn:aws:secretsmanager:us-east-1:xxxxx:secret:xxxxxxxxxx20210407182709367700000002-dpmpXA

    Note the output values for admin_db_login_secret_arn (and optionally user_credentials_secret_arn) as these provide the AWS Secrets Manager secret required to access the database as the administrative user and, optionally, the user database credentials Cumulus requires as well.

    The content of each of these secrets is in the form:

    {
    "database": "postgres",
    "dbClusterIdentifier": "clusterName",
    "engine": "postgres",
    "host": "xxx",
    "password": "defaultPassword",
    "port": 5432,
    "username": "xxx"
    }
    • database -- the PostgreSQL database used by the configured user
    • dbClusterIdentifier -- the value set by the cluster_identifier variable in the terraform module
    • engine -- the Aurora/RDS database engine
    • host -- the RDS service host for the database in the form (dbClusterIdentifier)-(AWS ID string).(region).rds.amazonaws.com
    • password -- the database password
    • username -- the account username
    • port -- The database connection port, should always be 5432
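
    For example, assuming your AWS credentials have access to the secret, you can inspect its contents with the AWS CLI (the secret ID is whichever ARN was output above):

    aws secretsmanager get-secret-value \
      --secret-id <admin_db_login_secret_arn or user_credentials_secret_arn> \
      --query SecretString --output text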

    Next Steps

    The database cluster has been created/updated! From here you can continue to add additional user accounts, databases and other database configuration.

    Version: v11.1.0

    Share S3 Access Logs

    It is possible through Cumulus to share S3 access logs across multiple S3 buckets using the S3 replicator package.

    S3 Replicator

    The S3 Replicator is a node package that contains a simple lambda function, associated permissions, and the Terraform instructions to replicate create-object events from one S3 bucket to another.

    First ensure that you have enabled S3 Server Access Logging.

    Next configure your config.tfvars as described in the s3-replicator/README.md to correspond to your deployment. The source_bucket and source_prefix are determined by how you enabled the S3 Server Access Logging.

    In order to deploy the s3-replicator with Cumulus, you will need to add the module to your Terraform main.tf definition, e.g.:

    module "s3-replicator" {
    source = "<path to s3-replicator.zip>"
    prefix = var.prefix
    vpc_id = var.vpc_id
    subnet_ids = var.subnet_ids
    permissions_boundary = var.permissions_boundary_arn
    source_bucket = var.s3_replicator_config.source_bucket
    source_prefix = var.s3_replicator_config.source_prefix
    target_bucket = var.s3_replicator_config.target_bucket
    target_prefix = var.s3_replicator_config.target_prefix
    }

    The Terraform source package, terraform-aws-cumulus-s3-replicator.zip, can be found on the Cumulus GitHub release page under the Assets tab.

    ESDIS Metrics

    In the NGAP environment, the ESDIS Metrics team has set up an ELK stack to process logs from Cumulus instances. To use this system, you must deliver any S3 Server Access logs that Cumulus creates to the metrics team's bucket.

    Configure the S3 replicator as described above using the target_bucket and target_prefix provided by the metrics team.

    The metrics team has taken care of setting up Logstash to ingest the files that get delivered to their bucket into their Elasticsearch instance.

    Terraform Best Practices

    To check for any remaining resources tagged with your deployment, run the following AWS CLI command, replacing PREFIX with your deployment prefix name:

    aws resourcegroupstaggingapi get-resources \
    --query "ResourceTagMappingList[].ResourceARN" \
    --tag-filters Key=Deployment,Values=PREFIX

    Ideally, the output should be an empty list, but if it is not, then you may need to manually delete the listed resources.

    See also: Configuring the Cumulus deployment; Restoring a previous version.

    Version: v11.1.0

    Using the Thin Egress App for Cumulus distribution

    The Thin Egress App (TEA) is an app running in Lambda that allows retrieving data from S3 using temporary links and provides URS integration.

    Configuring a TEA deployment

    TEA is deployed using Terraform modules. Refer to these instructions for guidance on how to integrate new components with your deployment.

    The cumulus-template-deploy repository cumulus-tf/main.tf contains a thin_egress_app for distribution.

    The TEA module provides these instructions showing how to add it to your deployment and the following are instructions to configure the thin_egress_app module in your Cumulus deployment.

    Create a secret for signing Thin Egress App JWTs

    The Thin Egress App uses JWTs internally to authenticate requests and requires a secret stored in AWS Secrets Manager containing SSH keys that are used to sign the JWTs.

    See the Thin Egress App documentation on how to create this secret with the correct values. It will be used later to set the thin_egress_jwt_secret_name variable when deploying the Cumulus module.

    bucket_map.yaml

    The Thin Egress App uses a bucket_map.yaml file to determine which buckets to serve. Documentation of the file format is available here.

    The default Cumulus module generates a file at s3://${system_bucket}/distribution_bucket_map.json.

    The configuration file is a simple JSON mapping of the form:

    {
    "daac-public-data-bucket": "/path/to/this/kind/of/data"
    }

    Please note: Cumulus only supports a one-to-one mapping of bucket->TEA path for 'distribution' buckets.

    Optionally configure a custom bucket map

    A simple config would look something like this:

    bucket_map.yaml
    MAP:
      my-protected: my-protected
      my-public: my-public

    PUBLIC_BUCKETS:
      - my-public

    Please note: your custom bucket map must include mappings for all of the protected and public buckets specified in the buckets variable in cumulus-tf/terraform.tfvars, otherwise Cumulus may not be able to determine the correct distribution URL for ingested files and you may encounter errors.

    Optionally configure shared variables

    The cumulus module deploys certain components that interact with TEA. As a result, the cumulus module requires that if you are specifying a value for the stage_name variable to the TEA module, you must use the same value for the tea_api_gateway_stage variable to the cumulus module.

    One way to keep these variable values in sync across the modules is to use Terraform local values to define values to use for the variables for both modules. This approach is shown in the Cumulus core example deployment code.
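
    A minimal sketch of this approach (the wiring shown in comments is illustrative; see the example deployment code for the full picture):

    locals {
      tea_stage_name = "DEV"
    }

    # in the thin_egress_app module block:
    #   stage_name = local.tea_stage_name
    # in the cumulus module block:
    #   tea_api_gateway_stage = local.tea_stage_name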

    Upgrading Cumulus

    After the upgrade is deployed, run tests to verify that your deployment functions correctly. Please refer to some recommended smoke tests given above, and consider additional tests appropriate for your particular deployment and environment.

    Update Cumulus Dashboard

    If there are breaking (or otherwise significant) changes to the Cumulus API, you should also upgrade your Cumulus Dashboard deployment to use the version of the Cumulus API matching the version of Cumulus to which you are migrating.

    Version: v11.1.0

    Issuing PR From Forked Repos

    Fork the Repo

    • Fork the Cumulus repo
    • Create a new branch from the branch you'd like to contribute to
    • If an issue doesn't already exist, submit one (see above)

    Create a Pull Request

    Reviewing PRs from Forked Repos

    Upon submission of a pull request, the Cumulus development team will review the code.

    Once the code passes an initial review, the team will run the CI tests against the proposed update.

    The request will then either be merged, declined, or an adjustment to the code will be requested via the issue opened with the original PR request.

    PRs from forked repos cannot be directly merged to master. Cumulus reviewers must follow these steps before completing the review process:

    1. Create a new branch:

        git checkout -b from-<name-of-the-branch> master
    2. Push the new branch to GitHub

    3. Change the destination of the forked PR to the new branch that was just pushed

      Screenshot of Github interface showing how to change the base branch of a pull request

    4. After code review and approval, merge the forked PR to the new branch.

    5. Create a PR for the new branch to master.

    6. If the CI tests pass, merge the new branch to master and close the issue. If the CI tests do not pass, request an amended PR from the original author or resolve failures as appropriate.

    Integration Tests

    If you create a new stack and want to be able to run integration tests against it in CI, you will need to add it to bamboo/select-stack.js.

    Code Coverage and Quality

    To run linting on the markdown files, run npm run lint-md.

    Audit

    This project uses audit-ci to run a security audit on the package dependency tree. This must pass prior to merge. The configured rules for audit-ci can be found here.

    To execute an audit, run npm run audit.

    Versioning and Releases

    It's useful to use the search feature of your code editor or grep to see if there are any references to the old package versions. In a bash shell you can run:

    find . -name package.json -exec grep -nH "@cumulus/.*MAJOR\.MINOR\.PATCH.*" {} \;

    Verify that each of those is updated to the new MAJOR.MINOR.PATCH version you are trying to release.

    A similar search for alpha and beta versions should be run on the release version and any problems should be fixed.

    find . -name package.json -exec grep -nHE "MAJOR\.MINOR\.PATCH.*(alpha|beta)" {} \;

    3. Check Cumulus Dashboard PRs for Version Bump

    There may be unreleased changes in the Cumulus Dashboard project that rely on this unreleased Cumulus Core version.

    If there exists a PR in the cumulus-dashboard repo with a name containing "Version Bump for Next Cumulus API Release":

    • There will be a placeholder change-me value that should be replaced with the Cumulus Core to-be-released-version.
    • Mark that PR as ready to be reviewed.

    4. Update CHANGELOG.md

    Update the CHANGELOG.md. Put a header under the Unreleased section with the new version number and the date.

    Add a link reference for the github "compare" view at the bottom of the CHANGELOG.md, following the existing pattern. This link reference should create a link in the CHANGELOG's release header to changes in the corresponding release.
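
    For example (version numbers, date, and the compare base are illustrative), the new header and its compare-view link reference might look like:

    ## [v9.1.0] YYYY-MM-DD

    [v9.1.0]: https://github.com/nasa/cumulus/compare/v9.0.0...v9.1.0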

    5. Update DATA_MODEL_CHANGELOG.md

    Similar to #4, make sure the DATA_MODEL_CHANGELOG is updated if there are data model changes in the release, and the link reference at the end of the document is updated as appropriate.

    6. Update CONTRIBUTORS.md

    ./bin/update-contributors.sh
    git add CONTRIBUTORS.md

    Commit and push these changes, if any.

    7. Update Cumulus package API documentation

    Update auto-generated API documentation for any Cumulus packages that have it:

    npm run docs-build-packages

    Commit and push these changes, if any.

    8. Cut new version of Cumulus Documentation

    If this is a backport, do not create a new version of the documentation. For various reasons, we do not merge backports back to master, other than changelog notes. Documentation changes for backports will not be published to our documentation website.

    cd website
    npm run version ${release_version}
    git add .

    Where ${release_version} corresponds to the version tag v1.2.3, for example.

    Commit and push these changes.

    9. Create a pull request against the minor version branch

    1. Push the release branch (e.g. release-1.2.3) to GitHub.

    2. Create a PR against the minor version base branch (e.g. release-1.2.x).

    3. Configure Bamboo to run automated tests against this PR by finding the branch plan for the release branch (release-1.2.3) and setting only these variables:

      • GIT_PR: true
      • SKIP_AUDIT: true

      IMPORTANT: Do NOT set the PUBLISH_FLAG variable to true for this branch plan. The actual publishing of the release will be handled by a separate, manually triggered branch plan.

      Screenshot of Bamboo CI interface showing the configuration of the GIT_PR branch variable to have a value of &quot;true&quot;

    4. Verify that the Bamboo build for the PR succeeds and then merge to the minor version base branch (release-1.2.x).

      • It is safe to do a squash merge in this instance, but not required
    5. You may delete your release branch (release-1.2.3) after merging to the base branch.

    10. Create a git tag for the release

    Check out the minor version base branch (release-1.2.x) now that your changes are merged in and do a git pull.

    Ensure you are on the latest commit.

    Create and push a new git tag:

        git tag -a vMAJOR.MINOR.PATCH -m "Release MAJOR.MINOR.PATCH"
    git push origin vMAJOR.MINOR.PATCH

    e.g.:
    git tag -a v9.1.0 -m "Release 9.1.0"
    git push origin v9.1.0

    11. Publishing the release

    Publishing of new releases is handled by a custom Bamboo branch plan and is manually triggered.

    The reasons for using a separate branch plan to handle releases instead of the branch plan for the minor version (e.g. release-1.2.x) are:

    • The Bamboo build for the minor version release branch is triggered automatically on any commits to that branch, whereas we want to manually control when the release is published.
    • We want to verify that integration tests have passed on the Bamboo build for the minor version release branch before we manually trigger the release, so that we can be sure that our code is safe to release.

    If this is a new minor version branch, then you will need to create a new Bamboo branch plan for publishing the release following the instructions below:

    Creating a Bamboo branch plan for the release

    • In the Cumulus Core project (https://ci.earthdata.nasa.gov/browse/CUM-CBA), click Actions -> Configure Plan in the top right.

    • Next to Plan branch click the rightmost button that displays Create Plan Branch upon hover.

    • Click Create plan branch manually.

    • Add the values in that list. Choose a display name that makes it very clear this is a deployment branch plan. Release (minor version branch name) seems to work well (e.g. Release (1.2.x)).

      • Make sure you enter the correct branch name (e.g. release-1.2.x).
    • Important Deselect Enable Branch - if you do not do this, it will immediately fire off a build.

    • Important -- do this immediately: on the Branch Details page, enable Change trigger. Set the Trigger type to manual; this will prevent commits to the branch from triggering the build plan. You should have been redirected to the Branch Details tab after creating the plan. If not, navigate to the branch from the list where you clicked Create Plan Branch in the previous step.

    • Go to the Variables tab. Ensure that you are on your branch plan and not the master plan: You should not see a large list of configured variables, but instead a dropdown allowing you to select variables to override, and the tab title will be Branch Variables. Then set the branch variables as follows:

      • DEPLOYMENT: cumulus-from-npm-tf (except in special cases such as incompatible backport branches)
        • If this variable is not set, it will default to the deployment name for the last committer on the branch
      • USE_CACHED_BOOTSTRAP: false
      • USE_TERRAFORM_ZIPS: true (IMPORTANT: MUST be set in order to run integration tests against the .zip files published during the build so that we are actually testing our released files)
      • GIT_PR: true
      • SKIP_AUDIT: true
      • PUBLISH_FLAG: true
    • Enable the branch from the Branch Details page.

    • Run the branch using the Run button in the top right.

    Bamboo will build and run lint and unit tests against that tagged release, publish the new packages to NPM, and then run the integration tests using those newly released packages.

    12. Create a new Cumulus release on github

    The CI release scripts will automatically create a GitHub release based on the release version tag, as well as upload artifacts to the Github release for the Terraform modules provided by Cumulus. The Terraform release artifacts include:

    • A multi-module Terraform .zip artifact containing filtered copies of the tf-modules, packages, and tasks directories for use as Terraform module sources.
    • An S3 replicator module
    • A workflow module
    • A distribution API module
    • An ECS service module

    Just make sure to verify that the appropriate .zip files are present on GitHub after the release process is complete.

    13. Merge base branch back to master

    Finally, you need to reproduce the version update changes back to master.

    If this is the latest version, you can simply create a PR to merge the minor version base branch back to master.

    Do not merge master back into the release branch since we want the release branch to just have the code from the release. Instead, create a new branch off of the release branch and merge that to master. You can freely merge master into this branch and delete it when it is merged to master.

    If this is a backport, you will need to create a PR that ports the changelog updates back to master. It is important in this changelog note to call it out as a backport. For example, fixes in backport version 1.14.5 may not be available in 1.15.0 because the fix was introduced in 1.15.3.

    Troubleshooting

    Delete and regenerate the tag

    To delete a published tag to re-tag, follow these steps:

      git tag -d vMAJOR.MINOR.PATCH
    git push -d origin vMAJOR.MINOR.PATCH

    e.g.:
    git tag -d v9.1.0
    git push -d origin v9.1.0
    Version: v11.1.0

    Cumulus Documentation: How To's

    Cumulus Docs Installation

    Run a Local Server

    Environment variables DOCSEARCH_API_KEY and DOCSEARCH_INDEX_NAME must be set for search to work. At the moment, search is only truly functional on prod because that is the only website we have registered to be indexed with DocSearch (see below on search).
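
    For example, before running the local server you might export these (placeholder values shown):

    export DOCSEARCH_API_KEY=<your-docsearch-api-key>
    export DOCSEARCH_INDEX_NAME=<your-docsearch-index-name>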

    git clone git@github.com:nasa/cumulus
    cd cumulus
    npm run docs-install
    npm run docs-serve

    Note: docs-build will build the documents into website/build.

    Cumulus Documentation

    Our project documentation is hosted on GitHub Pages. The resources published to this website are housed in the docs/ directory at the top of the Cumulus repository. Those resources primarily consist of markdown files and images.

    We use the open-source static website generator Docusaurus to build html files from our markdown documentation, add some organization and navigation, and provide some other niceties in the final website (search, easy templating, etc.).

    Add a New Page and Sidebars

    Adding a new page should be as simple as writing some documentation in markdown, placing it under the correct directory in the docs/ folder and adding some configuration values wrapped by --- at the top of the file. There are many files that already have this header which can be used as reference.

    ---
    id: doc-unique-id # unique id for this document. This must be unique across ALL documentation under docs/
    title: Title Of Doc # Whatever title you feel like adding. This will show up as the index to this page on the sidebar.
    hide_title: false
    ---

    Note: To have the new page show up in a sidebar the designated id must be added to a sidebar in the website/sidebars.js file. Docusaurus has an in depth explanation of sidebars here.

    Versioning Docs

    We lean heavily on Docusaurus for versioning. Their suggestions and walk-through can be found here. It is worth noting that we would like the Documentation versions to match up directly with release versions. Cumulus versioning is explained in the Versioning Docs.

    Search

    Search on our documentation site is taken care of by DocSearch. We have been provided with an apiKey and an indexName by DocSearch that we include in our website/siteConfig.js file. The rest, indexing and actual searching, we leave to DocSearch. Our builds expect environment variables for both these values to exist - DOCSEARCH_API_KEY and DOCSEARCH_NAME_INDEX.

    Add a new task

    The tasks list in docs/tasks.md is generated from the list of task packages in the tasks folder. Do not edit the docs/tasks.md file directly.

    Read more about adding a new task.

    Editing the tasks.md header or template

    Look at the bin/build-tasks-doc.js and bin/tasks-header.md files to edit the output of the tasks build script.

    Editing diagrams

    For some diagrams included in the documentation, the raw source is included in the docs/assets/raw directory to allow for easy updating in the future:

    • assets/interfaces.svg -> assets/raw/interfaces.drawio (generated using draw.io)

    Deployment

    The master branch is automatically built and deployed to the gh-pages branch. The gh-pages branch is served by GitHub Pages. Do not make edits to the gh-pages branch.

    Version: v11.1.0

    External Contributions

    Contributions to Cumulus may be made in the form of PRs to the repositories directly or through externally developed tasks and components. Cumulus is designed as an ecosystem that leverages Terraform deployments and AWS Step Functions to easily integrate external components.

    This list may not be exhaustive and represents components that are open source, owned externally, and that have been tested with the Cumulus system. For more information and contributing guidelines, visit the respective GitHub repositories.

    Distribution

    The ASF Thin Egress App is used by Cumulus for distribution. TEA can be deployed with Cumulus or as part of other applications to distribute data.

    Operational Cloud Recovery Archive (ORCA)

    ORCA can be deployed with Cumulus to provide a customizable baseline for creating and managing operational backups.

    Workflow Tasks

    CNM

    PO.DAAC provides two workflow tasks to be used with the Cloud Notification Mechanism (CNM) Schema: CNM to Granule and CNM Response.

    See the CNM workflow data cookbook for an example of how these can be used in a Cumulus ingest workflow.

    DMR++ Generation

    GHRC has provided a DMR++ Generation workflow task. This task is meant to be used in conjunction with Cumulus' Hyrax Metadata Updates workflow task.

    Version: v11.1.0

    Frequently Asked Questions

    Below are some commonly asked questions that you may encounter that can assist you along the way when working with Cumulus.

    General

    How do I deploy a new instance in Cumulus?

    Answer: For steps on the Cumulus deployment process go to How to Deploy Cumulus.

    What prerequisites are needed to setup Cumulus?

    Answer: You will need access to the AWS console and an Earthdata login before you can deploy Cumulus.

    What is the preferred web browser for the Cumulus environment?

    Answer: Our preferred web browser is the latest version of Google Chrome.

    How do I quickly troubleshoot an issue in Cumulus?

    Answer: To troubleshoot and fix issues in Cumulus reference our recommended solutions in Troubleshooting Cumulus.

    Where can I get support help?

    Answer: The following options are available for assistance:

    • Cumulus: Users outside NASA should file a GitHub issue and users inside NASA should file a JIRA issue.
    • AWS: You can create a case in the AWS Support Center, accessible via your AWS Console.

    Integrators & Developers

    What is a Cumulus integrator?

    Answer: Those who are working within Cumulus and AWS for deployments and to manage workflows. They may perform the following functions:

    • Configure and deploy Cumulus to the AWS environment
    • Configure Cumulus workflows
    • Write custom workflow tasks
    What are the steps if I run into an issue during deployment?

    Answer: If you encounter an issue with your deployment go to the Troubleshooting Deployment guide.

    Is Cumulus customizable and flexible?

    Answer: Yes. Cumulus is a modular architecture that allows you to decide which components you want/need to deploy. These components are maintained as Terraform modules.

    What are Terraform modules?

    Answer: They are modules that are composed to create a Cumulus deployment, which gives integrators the flexibility to choose the components of Cumulus that they want/need. To view Cumulus maintained modules or steps on how to create a module go to Terraform modules.

    Where do I find Terraform module variables?

    Answer: Go here for a list of Cumulus maintained variables.

    What is a Cumulus workflow?

    Answer: A workflow is a provider-configured set of steps that describe the process to ingest data. Workflows are defined using AWS Step Functions. For more details, we suggest visiting here.

    How do I set up a Cumulus workflow?

    Answer: You will need to create a provider, have an associated collection (add a new one), and generate a new rule first. Then you can set up a Cumulus workflow by following these steps here.

    What are the common use cases that a Cumulus integrator encounters?

    Answer: The following are some examples of possible use cases you may see:


    Operators

    What is a Cumulus operator?

    Answer: Those who ingest, archive, and troubleshoot datasets (called collections in Cumulus). Your daily activities might include, but are not limited to, the following:

    • Ingesting datasets
    • Maintaining historical data ingest
    • Starting and stopping data handlers
    • Managing collections
    • Managing provider definitions
    • Creating, enabling, and disabling rules
    • Investigating errors for granules and deleting or re-ingesting granules
    • Investigating errors in executions and isolating failed workflow step(s)
    What are the common use cases that a Cumulus operator encounters?

    Answer: The following are some examples of possible use cases you may see:

    Can you re-run a workflow execution in AWS?

    Answer: Yes. For steps on how to re-run a workflow execution go to Re-running workflow executions in the Cumulus Operator Docs.

    Version: v11.1.0

    Ancillary Metadata Export

    This feature utilizes the type key on a files object in a Cumulus granule. It uses the key to provide a mechanism where granule discovery, processing and other tasks can set and use this value to facilitate metadata export to CMR.

    Tasks setting type

    Discover Granules

    Uses the Collection type key to set the value for files on discovered granules in its output.

    Parse PDR

    Uses a task-specific mapping to map PDR 'FILE_TYPE' to a CNM type to set type on granules from the PDR.

    CNMToCMALambdaFunction

    Natively supports types that are included in incoming messages to a CNM Workflow.

    Tasks using type

    Move Granules

    Uses the granule file type key to update UMM/ECHO 10 CMR files passed in as candidates to the task. This task adds the external facing URLs to the CMR metadata file based on the type. See the file tracking data cookbook for a detailed mapping. If a non-CNM type is specified, the task assumes it is a 'data' file.

    Cumulus Backup and Restore

  • Halt ingest and any other processes that may be writing to the old cluster.

  • Set the snapshot_identifier variable to the snapshot you wish to create, and configure the module like a new deployment, with a unique cluster_identifier

  • Deploy the module using terraform apply

  • Once deployed, verify the cluster has the expected data

  • Redeploy the data persistence and Cumulus deployments - You should not need to reconfigure either, as the secret ARN and the security group should not change, however double-check the configured values are as expected
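
    As a sketch, the relevant overrides in the rds_cluster module configuration might look like the following (values are placeholders; all other variables are configured as for a new deployment):

    module "rds_cluster" {
      # ... existing module configuration ...
      snapshot_identifier = "<snapshot-name-to-restore-from>"
      cluster_identifier  = "my-prefix-cumulus-rds-restored"
    }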

    Version: v11.1.0

    Cumulus Dead Letter Archive

    This documentation explains the Cumulus dead letter archive and associated functionality.

    DB Records DLQ Archive

    The Cumulus system contains a number of dead letter queues. Perhaps the most important system lambda function supported by a DLQ is the sfEventSqsToDbRecords lambda function which parses Cumulus messages from workflow executions to generate and write database records to the Cumulus database.

    As of Cumulus v9+, the dead letter queue for this lambda (named sfEventSqsToDbRecordsDeadLetterQueue) has been updated with a consumer lambda that will automatically write any incoming records to the S3 system bucket, under the path <stackName>/dead-letter-archive/sqs/. This will allow integrators and operators engaged in debugging missing records to inspect any Cumulus messages which failed to process and did not result in the successful creation of database records.

    Dead Letter Archive recovery

    In addition to the above, as of Cumulus v9+, the Cumulus API also contains a new endpoint at /deadLetterArchive/recoverCumulusMessages.

    Sending a POST request to this endpoint will trigger a Cumulus AsyncOperation that will attempt to reprocess (and if successful delete) all Cumulus messages in the dead letter archive, using the same underlying logic as the existing sfEventSqsToDbRecords.
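
    As an illustration (the endpoint path is from above; authentication shown here assumes the standard Cumulus API token flow, and the API root is a placeholder):

    curl -X POST https://<your-api-root>/deadLetterArchive/recoverCumulusMessages \
      -H "Authorization: Bearer <access-token>"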

    This endpoint may prove particularly useful when recovering from an extended or unexpected database outage, where messages failed to process due to an external outage and there is no essential malformation of each Cumulus message.

    Version: v11.1.0

    Dead Letter Queues

    startSF SQS queue

    The workflow-trigger for the startSF queue has a Redrive Policy set up that directs any failed attempts to pull from the workflow start queue to an SQS Dead Letter Queue.

    This queue can then be monitored for failures to initiate a workflow. Please note that workflow failures will not show up in this queue, only repeated failure to trigger a workflow.

    Named Lambda Dead Letter Queues

    Cumulus provides configured Dead Letter Queues (DLQ) for non-workflow Lambdas (such as ScheduleSF) to capture Lambda failures for further processing.

    These DLQs are setup with the following configuration:

      receive_wait_time_seconds  = 20
    message_retention_seconds = 1209600
    visibility_timeout_seconds = 60

    Default Lambda Configuration

    The following built-in Cumulus Lambdas are setup with DLQs to allow handling of process failures:

    • dbIndexer (Updates Elasticsearch)
    • JobsLambda (writes logs outputs to Elasticsearch)
    • ScheduleSF (the SF Scheduler Lambda that places messages on the queue that is used to start workflows, see Workflow Triggers)
    • publishReports (Lambda that publishes messages to the SNS topics for execution, granule and PDR reporting)
    • reportGranules, reportExecutions, reportPdrs (Lambdas responsible for updating records based on messages in the queues published by publishReports)

    Troubleshooting/Utilizing messages in a Dead Letter Queue

    Ideally an automated process should be configured to poll the queue and process messages off a dead letter queue.

    For aid in manually troubleshooting, you can utilize the SQS Management console to view messages available in the queues set up for a particular stack. The dead letter queues will have a Message Body containing the Lambda payload, as well as Message Attributes that reference both the error returned and a RequestID which can be cross-referenced to the associated Lambda's CloudWatch logs for more information:

    Screenshot of the AWS SQS console showing how to view SQS message attributes

    Version: v11.1.0

    Cumulus Distribution Metrics

    It is possible to configure Cumulus and the Cumulus Dashboard to display information about the successes and failures of requests for data. This requires the Cumulus instance to deliver Cloudwatch Logs and S3 Server Access logs to an ELK stack.

    ESDIS Metrics in NGAP

    Work with the ESDIS metrics team to set up permissions and access to forward Cloudwatch Logs to a shared AWS:Logs:Destination as well as transferring your S3 Server Access logs to a metrics team bucket.

    The metrics team has taken care of setting up Logstash to ingest the files that get delivered to their bucket into their Elasticsearch instance.

    Once Cumulus has been configured to deliver Cloudwatch logs to the ESDIS Metrics team, you can use the Elasticsearch indexes to create the necessary target patterns on the dashboard. These are often <daac>-cloudwatch-cumulus-<env>-* and <daac>-distribution-<env>-*, but they will depend on your specific Elasticsearch setup.

    Cumulus / ESDIS Metrics distribution system

    Architecture diagram showing how logs are replicated from a Cumulus instance to the ESDIS Metrics account and accessed by the Cumulus dashboard

    Version: v11.1.0

    Execution Payload Retention

    In addition to CloudWatch logs and AWS StepFunction API records, Cumulus automatically stores the initial and 'final' (the last update to the execution record) payload values as part of the Execution record in your RDS database and Elasticsearch.

    This allows access via the API (or optionally direct DB/Elasticsearch querying) for debugging/reporting purposes. The data is stored in the "originalPayload" and "finalPayload" fields.

    Payload record cleanup

    To reduce storage requirements, a CloudWatch rule ({stack-name}-dailyExecutionPayloadCleanupRule) triggering a daily run of the provided cleanExecutions lambda has been added. This lambda will remove all 'completed' and 'non-completed' payload records in the database that are older than the configured thresholds.

    Configuration

    The following configuration flags have been made available in the cumulus module. They may be overridden in your deployment's instance of the cumulus module by adding the following configuration options:

    daily_execution_payload_cleanup_schedule_expression (string)

    This configuration option sets the execution times for this Lambda to run, using a Cloudwatch cron expression.

    Default value is "cron(0 4 * * ? *)".

    complete_execution_payload_timeout_disable (bool)

    This configuration option, when set to true, will disable all cleanup of completed execution payloads.

    Default value is false.

    complete_execution_payload_timeout (number)

    This flag defines the cleanup threshold for executions with a 'completed' status in days. Records with updatedAt values older than this with payload information will have that information removed.

    Default value is 10.

    non_complete_execution_payload_timeout_disable (bool)

    This configuration option, when set to true, will disable all cleanup of "non-complete" (any status other than completed) execution payloads.

    Default value is false.

    non_complete_execution_payload_timeout (number)

    This flag defines the cleanup threshold for executions with a status other than 'complete' in days. Records with updateTime values older than this with payload information will have that information removed.

    Default value is 30 days.

    • complete_execution_payload_disable/non_complete_execution_payload_disable

    These flags (true/false) determine if the cleanup script's logic for 'complete' and 'non-complete' executions will run. Default value is false for both.
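
    For example, to retain completed execution payloads for 30 days and non-complete payloads for 60 days, you might add overrides like these to your cumulus module configuration (values are illustrative only):

    module "cumulus" {
      # ... existing module configuration ...
      complete_execution_payload_timeout     = 30
      non_complete_execution_payload_timeout = 60
    }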

    Version: v11.1.0

    Writing logs for ESDIS Metrics

    Note: This feature is only available for Cumulus deployments in NGAP environments.

    Prerequisite: You must configure your Cumulus deployment to deliver your logs to the correct shared logs destination for ESDIS metrics.

    Log messages delivered to the ESDIS metrics logs destination conforming to an expected format will be automatically ingested and parsed to enable helpful searching/filtering of your logs via the ESDIS metrics Kibana dashboard.

    Expected log format

    The ESDIS metrics pipeline expects a log message to be a JSON string representation of an object (dict in Python or map in Java). An example log message might look like:

    {
    "level": "info",
    "executions": "arn:aws:states:us-east-1:000000000000:execution:MySfn:abcd1234",
    "granules": "[\"granule-1\",\"granule-2\"]",
    "message": "hello world",
    "sender": "greetingFunction",
    "stackName": "myCumulus",
    "timestamp": "2018-10-19T19:12:47.501Z"
    }

    A log message can contain the following properties:

    • executions: The AWS Step Function execution name in which this task is executing, if any
    • granules: A JSON string of the array of granule IDs being processed by this code, if any
    • level: A string identifier for the type of message being logged. Possible values:
      • debug
      • error
      • fatal
      • info
      • warn
      • trace
    • message: String containing your actual log message
    • parentArn: The parent AWS Step Function execution ARN that triggered the current execution, if any
    • sender: The name of the resource generating the log message (e.g. a library name, a Lambda function name, an ECS activity name)
    • stackName: The unique prefix for your Cumulus deployment
    • timestamp: An ISO-8601 formatted timestamp
    • version: The version of the resource generating the log message, if any

    None of these properties are explicitly required for ESDIS metrics to parse your log correctly. However, a log without a message has no informational content. And having level, sender, and timestamp properties is very useful for filtering your logs. Including a stackName in your logs is helpful as it allows you to distinguish between logs generated by different deployments.

    Using Cumulus Message Adapter libraries

    If you are writing a custom task that is integrated with the Cumulus Message Adapter, then some of the language-specific client libraries can be used to write logs compatible with ESDIS metrics.

    The usage of each library differs slightly, but in general a logger is initialized with a Cumulus workflow message to determine the contextual information for the task (e.g. granules, executions). Then, after the logger is initialized, writing logs only requires specifying a message, but the logged output will include the contextual information as well.

    Writing logs using custom code

    Any code that produces logs matching the expected log format can be processed by ESDIS metrics.

    Node.js

    Cumulus core provides a @cumulus/logger library that writes logs in the expected format for ESDIS metrics.

    - + \ No newline at end of file diff --git a/docs/v11.1.0/features/replay-archived-sqs-messages/index.html b/docs/v11.1.0/features/replay-archived-sqs-messages/index.html index 49eee7e3bf2..9cb6c8b1581 100644 --- a/docs/v11.1.0/features/replay-archived-sqs-messages/index.html +++ b/docs/v11.1.0/features/replay-archived-sqs-messages/index.html @@ -5,14 +5,14 @@ How to replay SQS messages archived in S3 | Cumulus Documentation - +
    Version: v11.1.0

    How to replay SQS messages archived in S3

    Context

    Cumulus archives all incoming SQS messages to S3 and removes messages once they have been processed. Unprocessed messages are archived at the path: ${stackName}/archived-incoming-messages/${queueName}/${messageId}
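
    For example, you can inspect the archived messages for a queue directly with the AWS CLI. This is a minimal sketch: it assumes the archive lives in your Cumulus system (internal) bucket, and the bucket, stack, and queue names below are placeholders for your own deployment values.

    # List archived (unprocessed) SQS messages for a queue
    aws s3 ls "s3://my-internal-bucket/myStackName/archived-incoming-messages/myQueueName/"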

    Replay SQS messages endpoint

    The Cumulus API has added a new endpoint, /replays/sqs. This endpoint allows you to start a replay operation that requeues all archived SQS messages for a given queueName and returns an AsyncOperationId for operation status tracking.

    Start replaying archived SQS messages

    In order to start a replay, you must perform a POST request to the replays/sqs endpoint.

    The required and optional fields that should be part of the body of this request are documented below.

    • queueName (string): Any valid SQS queue name (not ARN)
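
    As a rough sketch, a replay request could look like the following. The base URL, token, and queue name are placeholders for your own deployment values; a successful response will include the asyncOperationId described below.

    $ curl --request POST https://example.com/replays/sqs \
    --header 'Authorization: Bearer ReplaceWithTheToken' \
    --header 'Content-Type: application/json' \
    --data '{
    "queueName": "my-queue-name"
    }'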

    Status tracking

    A successful response from the /replays/sqs endpoint will contain an asyncOperationId field. Use this ID with the /asyncOperations endpoint to track the status.

    - + \ No newline at end of file diff --git a/docs/v11.1.0/features/replay-kinesis-messages/index.html b/docs/v11.1.0/features/replay-kinesis-messages/index.html index a9612818412..666c08630ee 100644 --- a/docs/v11.1.0/features/replay-kinesis-messages/index.html +++ b/docs/v11.1.0/features/replay-kinesis-messages/index.html @@ -5,7 +5,7 @@ How to replay Kinesis messages after an outage | Cumulus Documentation - + @@ -13,7 +13,7 @@
    Version: v11.1.0

    How to replay Kinesis messages after an outage

    After a period of outage, it may be necessary for a Cumulus operator to reprocess or 'replay' messages that arrived on an AWS Kinesis Data Stream but did not trigger an ingest. This document serves as an outline on how to start a replay operation, and how to perform status tracking. Cumulus supports replay of all Kinesis messages on a stream (subject to the normal RetentionPeriod constraints), or all messages within a given time slice delimited by start and end timestamps.

    As Kinesis has no comparable field to e.g. the SQS ReceiveCount on its records, Cumulus cannot tell which messages within a given time slice have never been processed, and cannot guarantee only missed messages will be processed. Users will have to rely on duplicate handling or some other method of identifying messages that should not be processed within the time slice.

    NOTE: This operation flow effectively changes only the trigger mechanism for Kinesis ingest notifications. The existence of valid Kinesis-type rules and all other normal requirements for the triggering of ingest via Kinesis still apply.

    Replays endpoint

    Cumulus has added a new endpoint to its API, /replays. This endpoint will allow you to start replay operations and returns an AsyncOperationId for operation status tracking.

    Start a replay

    In order to start a replay, you must perform a POST request to the replays endpoint.

    The required and optional fields that should be part of the body of this request are documented below.

    NOTE: As the endTimestamp relies on a comparison with the Kinesis server-side ApproximateArrivalTimestamp, and given that there is no documented level of accuracy for the approximation, it is recommended that the endTimestamp include some amount of buffer to allow for slight discrepancies. If tolerable, the same is recommended for the startTimestamp, although it is used differently and is less vulnerable to discrepancies, since a server-side arrival timestamp should never be earlier than the client-side request timestamp.

    • type (string, required): Currently only accepts kinesis.
    • kinesisStream (string, required for type kinesis): Any valid Kinesis stream name (not ARN).
    • kinesisStreamCreationTimestamp (*, optional): Any input valid for a JS Date constructor. For reasons to use this field see AWS documentation on StreamCreationTimestamp.
    • endTimestamp (*, optional): Any input valid for a JS Date constructor. Messages newer than this timestamp will be skipped.
    • startTimestamp (*, optional): Any input valid for a JS Date constructor. Messages will be fetched from the Kinesis stream starting at this timestamp. Ignored if it is further in the past than the stream's retention period.
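
    As a sketch, a replay request for a bounded time slice might look like the following. The base URL, token, stream name, and timestamps are placeholder values.

    $ curl --request POST https://example.com/replays \
    --header 'Authorization: Bearer ReplaceWithTheToken' \
    --header 'Content-Type: application/json' \
    --data '{
    "type": "kinesis",
    "kinesisStream": "my-ingest-stream",
    "startTimestamp": "2022-01-01T00:00:00Z",
    "endTimestamp": "2022-01-02T00:00:00Z"
    }'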

    Status tracking

    A successful response from the /replays endpoint will contain an asyncOperationId field. Use this ID with the /asyncOperations endpoint to track the status.

    - + \ No newline at end of file diff --git a/docs/v11.1.0/features/reports/index.html b/docs/v11.1.0/features/reports/index.html index ff7227cdbb5..f839a5b613b 100644 --- a/docs/v11.1.0/features/reports/index.html +++ b/docs/v11.1.0/features/reports/index.html @@ -5,7 +5,7 @@ Reconciliation Reports | Cumulus Documentation - + @@ -19,7 +19,7 @@ report generation. The data buckets will include any buckets in your Cumulus buckets configuration that have type public, protected or private.
    - + \ No newline at end of file diff --git a/docs/v11.1.0/getting-started/index.html b/docs/v11.1.0/getting-started/index.html index 7f7490e27b6..9c14921b2d8 100644 --- a/docs/v11.1.0/getting-started/index.html +++ b/docs/v11.1.0/getting-started/index.html @@ -5,13 +5,13 @@ Getting Started | Cumulus Documentation - +
    Version: v11.1.0

    Getting Started

    Overview | Quick Tutorials | Helpful Tips

    Overview

    This serves as a guide for new Cumulus users to deploy and learn how to use Cumulus. Here you will learn what you need in order to complete any prerequisites, what Cumulus is and how it works, and how to successfully navigate and deploy a Cumulus environment.

    What is Cumulus

    Cumulus is an open source set of components for creating cloud-based data ingest, archive, distribution, and management systems, designed for NASA's future Earth Science data streams.

    Who uses Cumulus

    Data integrators/developers and operators across a range of projects, at NASA and beyond, use Cumulus in their daily work.

    Cumulus Roles

    Integrator/Developer

    Cumulus integrators/developers are those who work within Cumulus and AWS for deployments and to manage workflows.

    Operator

    Cumulus operators are those who work within Cumulus to ingest/archive data and manage collections.

    Role Guides

    As a developer, integrator, or operator, you will need to set up your environments to work in Cumulus. The following docs can get you started in your role specific activities.

    What is a Cumulus Data Type

    In Cumulus, we have the following types of data that you can create and manage:

    • Collections
    • Granules
    • Providers
    • Rules
    • Workflows
    • Executions
    • Reports

    For details on how to create or manage data types go to Data Management Types.


    Quick Tutorials

    Deployment & Configuration

    Cumulus is deployed to an AWS account, so you must have access to deploy resources to an AWS account to get started.

    1. Deploy Cumulus and Cumulus Dashboard to AWS

    Follow the deployment instructions to deploy Cumulus to your AWS account.

    2. Configure and Run the HelloWorld Workflow

    If you have deployed using the cumulus-template-deploy repository, you have a HelloWorld workflow deployed to your Cumulus backend.

    You can see your deployed workflows on the Workflows page of your Cumulus dashboard.

    Configure a collection and provider using the setup guidance on the Cumulus dashboard.

    Then create a rule to trigger your HelloWorld workflow. You can select a rule type of one time.

    Navigate to the Executions page of the dashboard to check the status of your workflow execution.

    3. Configure a Custom Workflow

    See Developing a custom workflow documentation for adding a new workflow to your deployment.

    There are plenty of workflow examples using Cumulus tasks here. The Data Cookbooks provide a more in-depth look at some of these more advanced workflows and their configurations.

    There is a list of Cumulus tasks already included in your deployment here.

    After configuring your workflow and redeploying, you can configure and run your workflow using the same steps as in step 2.


    Helpful Tips

    Here are some useful tips to keep in mind when deploying or working in Cumulus.

    Integrator/Developer

    • Versioning and Releases: This documentation gives information on our global versioning approach. We suggest upgrading to the supported version for Cumulus, Cumulus dashboard, and Thin Egress App (TEA).
    • Cumulus Developer Documentation: We suggest that you read through and reference this resource for development best practices in Cumulus.
    • Cumulus Deployment: We will guide you on how to manually deploy a new instance of Cumulus. In this reference, you will learn how to install Terraform, create an AWS S3 bucket, configure a compatible database, and create a Lambda layer.
    • Terraform Best Practices: This will help guide you through your Terraform configuration and Cumulus deployment. For an introduction to Terraform, go here.
    • Integrator Common Use Cases: Scenarios to help integrators along in the Cumulus environment.

    Operator

    Troubleshooting

    Troubleshooting: Some suggestions to help you troubleshoot and solve issues you may encounter.

    Resources

    - + \ No newline at end of file diff --git a/docs/v11.1.0/glossary/index.html b/docs/v11.1.0/glossary/index.html index a0e4d3b3d26..2ba0b231339 100644 --- a/docs/v11.1.0/glossary/index.html +++ b/docs/v11.1.0/glossary/index.html @@ -5,13 +5,13 @@ Glossary | Cumulus Documentation - +
    Version: v11.1.0

    Glossary

    AWS Glossary

    For terms/items from Amazon/AWS not mentioned in this glossary, please refer to the AWS Glossary.

    Cumulus Glossary of Terms

    API Gateway

    Refers to AWS's API Gateway. Used by the Cumulus API.

    ARN

    Refers to an AWS "Amazon Resource Name".

    For more info, see the AWS documentation.

    AWS

    See: aws.amazon.com

    AWS Lambda/Lambda Function

    AWS's 'serverless' option. Allows you to run code without provisioning or managing servers, ECS instances, etc.

    For more information, see the AWS Lambda documentation.

    AWS Access Keys

    Access credentials that give you access to AWS to act as an IAM user programmatically or from the command line.

    For more information, see the AWS IAM Documentation.

    Bucket

    An Amazon S3 cloud storage resource.

    For more information, see the AWS Bucket Documentation.

    CloudFormation

    An AWS service that allows you to define and manage cloud resources as a preconfigured block.

    For more information, see the AWS CloudFormation User Guide.

    Cloudformation Template

    A template that defines the AWS resources managed by a CloudFormation stack.

    For more information, see the AWS intro page.

    Cloudwatch

    AWS service that allows logging and metrics collection for the various cloud resources you have in AWS.

    For more information, see the AWS User Guide.

    Cloud Notification Mechanism (CNM)

    An interface mechanism to support cloud-based ingest messaging. For more information, see PO.DAAC's CNM Schema.

    Common Metadata Repository (CMR)

    "A high-performance, high-quality, continuously evolving metadata system that catalogs Earth Science data and associated service metadata records". For more information, see NASA's CMR page.

    Collection (Cumulus)

    Cumulus Collections are logical sets of data objects of the same data type and version.

    For more information, see cookbook reference page.

    Cumulus Message Adapter (CMA)

    A library designed to help task developers integrate step function tasks into a Cumulus workflow by adapting task input/output into the Cumulus Message format.

    For more information, see CMA workflow reference page.

    Distributed Active Archive Center (DAAC)

    Refers to a specific organization that's part of NASA's distributed system of archive centers. For more information see EOSDIS's DAAC page

    Dead Letter Queue (DLQ)

    This refers to Amazon SQS dead-letter queues. These SQS queues are specifically configured to capture failed messages from other services/SQS queues/etc. so that those failed messages can be processed later.

    For more on DLQs, see the Amazon Documentation and the Cumulus DLQ feature page.

    Developer

    Those who set up deployment and workflow management for Cumulus. Sometimes referred to as an integrator. See integrator.

    ECS

    Amazon's Elastic Container Service. Used in Cumulus by workflow steps that require more flexibility than Lambda can provide.

    For more information, see AWS's developer guide.

    ECS Activity

    An ECS instance run via a Step Function.

    Execution (Cumulus)

    A Cumulus execution refers to a single execution of a (Cumulus) Workflow.

    GIBS

    Global Imagery Browse Services

    Granule

    A granule is the smallest aggregation of data that can be independently managed (described, inventoried, and retrieved). Granules are always associated with a collection, which is a grouping of granules. A granule is a grouping of data files.

    IAM

    AWS Identity and Access Management.

    For more information, see AWS IAMs.

    Integrator/Developer

    Those who work within Cumulus and AWS for deployments and to manage workflows.

    Kinesis

    Amazon's platform for streaming data on AWS.

    See AWS Kinesis for more information.

    Lambda

    AWS's cloud service that lets you run code without provisioning or managing servers.

    For more information, see AWS's lambda page.

    Module (Terraform)

    Refers to a terraform module.

    Node

    See node.js.

    Npm

    Node package manager.

    For more information, see npmjs.com.

    Operator

    Those who work within Cumulus to ingest/archive data and manage collections.

    PDR

    "Product Delivery Record", used in "DAAC Ingest" workflows.

    For more information, see nasa.gov.

    Packages (NPM)

    NPM hosted node.js packages. Cumulus packages can be found on NPM's site here

    Provider

    Data source that generates and/or distributes data for Cumulus workflows to act upon.

    For more information, see the Cumulus documentation.

    Rule

    Rules are configurable scheduled events that trigger workflows based on various criteria.

    For more information, see the Cumulus Rules documentation.

    S3

    Amazon's Simple Storage Service provides data object storage in the cloud. Used in Cumulus to store configuration, data and more.

    For more information, see AWS's s3 page.

    SIPS

    Science Investigator-led Processing Systems. In the context of DAAC ingest, this refers to data producers/providers.

    For more information, see nasa.gov.

    SNS

    Amazon's Simple Notification Service provides a messaging service that allows publication of and subscription to events. Used in Cumulus to trigger workflow events, track event failures, and others.

    For more information, see AWS's SNS page.

    SQS

    Amazon's Simple Queue Service.

    For more information, see AWS's SQS page.

    Stack

    A collection of AWS resources you can manage as a single unit.

    In the context of Cumulus, this refers to a deployment of the cumulus and data-persistence modules that is managed by Terraform.

    Step Function

    AWS's web service that allows you to compose complex workflows as a state machine comprised of tasks (Lambdas, activities hosted on EC2/ECS, some AWS service APIs, etc). See AWS's Step Function Documentation for more information. In the context of Cumulus these are the underlying AWS service used to create Workflows.

    Terraform

    Terraform is the tool that you will use for deployment and configuration of your Cumulus environment.

    Workflows

    Workflows are comprised of one or more AWS Lambda Functions and ECS Activities to discover, ingest, process, manage and archive data.

    - + \ No newline at end of file diff --git a/docs/v11.1.0/index.html b/docs/v11.1.0/index.html index 6e43d681bb9..708710f3a42 100644 --- a/docs/v11.1.0/index.html +++ b/docs/v11.1.0/index.html @@ -5,13 +5,13 @@ Introduction | Cumulus Documentation - +
    Version: v11.1.0

    Introduction

    This Cumulus project seeks to address the existing need for a “native” cloud-based data ingest, archive, distribution, and management system that can be used for all future Earth Observing System Data and Information System (EOSDIS) data streams via the development and implementation of Cumulus. The term “native” implies that the system will leverage all components of a cloud infrastructure provided by the vendor for efficiency (in terms of both processing time and cost). Additionally, Cumulus will operate on future data streams involving satellite missions, aircraft missions, and field campaigns.

    This documentation includes guidelines, examples, and source code docs. It is accessible at https://nasa.github.io/cumulus.


    Get To Know Cumulus

    • Getting Started - here - If you are new to Cumulus we suggest that you begin with this section to help you understand and work in the environment.
    • General Cumulus Documentation - here <- you're here

    Cumulus Reference Docs

    • Cumulus API Documentation - here
    • Cumulus Developer Documentation - here - READMEs throughout the main repository.
    • Data Cookbooks - here

    Auxiliary Guides

    • Integrator Guide - here
    • Operator Docs - here

    Contributing

    Please refer to: https://github.com/nasa/cumulus/blob/master/CONTRIBUTING.md for information. We thank you in advance.

    - + \ No newline at end of file diff --git a/docs/v11.1.0/integrator-guide/about-int-guide/index.html b/docs/v11.1.0/integrator-guide/about-int-guide/index.html index 4d5f7be1fde..ce3af48808b 100644 --- a/docs/v11.1.0/integrator-guide/about-int-guide/index.html +++ b/docs/v11.1.0/integrator-guide/about-int-guide/index.html @@ -5,13 +5,13 @@ About Integrator Guide | Cumulus Documentation - +
    Version: v11.1.0

    About Integrator Guide

    Purpose

    The Integrator Guide supplements the Cumulus documentation and Data Cookbooks. This content is for Cumulus integrators who are either new to the project or need a step-by-step resource to help them along.

    What Is A Cumulus Integrator

    Cumulus integrators are those who work within Cumulus and AWS for deployments and to manage workflows. They may perform the following functions:

    • Configure and deploy Cumulus to the AWS environment
    • Configure Cumulus workflows
    • Write custom workflow tasks
    - + \ No newline at end of file diff --git a/docs/v11.1.0/integrator-guide/int-common-use-cases/index.html b/docs/v11.1.0/integrator-guide/int-common-use-cases/index.html index c855418370a..a5e4db9b70f 100644 --- a/docs/v11.1.0/integrator-guide/int-common-use-cases/index.html +++ b/docs/v11.1.0/integrator-guide/int-common-use-cases/index.html @@ -5,13 +5,13 @@ Integrator Common Use Cases | Cumulus Documentation - + - + \ No newline at end of file diff --git a/docs/v11.1.0/integrator-guide/workflow-add-new-lambda/index.html b/docs/v11.1.0/integrator-guide/workflow-add-new-lambda/index.html index 40073bda5f4..4c1151d0c5c 100644 --- a/docs/v11.1.0/integrator-guide/workflow-add-new-lambda/index.html +++ b/docs/v11.1.0/integrator-guide/workflow-add-new-lambda/index.html @@ -5,13 +5,13 @@ Workflow - Add New Lambda | Cumulus Documentation - +
    Version: v11.1.0

    Workflow - Add New Lambda

    You can develop a workflow task in AWS Lambda or Elastic Container Service (ECS). AWS ECS requires Docker. For a list of tasks to use go to our Cumulus Tasks page.

    The following steps will help you write a new Lambda that integrates with a Cumulus workflow, and will aid your understanding of the Cumulus Message Adapter (CMA) process.

    Steps

    1. Define New Lambda in Terraform

    2. Add Task in JSON Object

      For details on how to set up a workflow via CMA go to the CMA Tasks: Message Flow.

      You will need to assign input and output for the new task and follow the CMA contract here. This contract defines how libraries should call the cumulus-message-adapter to integrate a task into an existing Cumulus Workflow.

    3. Verify New Task

      Check the updated workflow in AWS and in Cumulus.

    - + \ No newline at end of file diff --git a/docs/v11.1.0/integrator-guide/workflow-ts-failed-step/index.html b/docs/v11.1.0/integrator-guide/workflow-ts-failed-step/index.html index 2ef86cf3f0d..d9699228716 100644 --- a/docs/v11.1.0/integrator-guide/workflow-ts-failed-step/index.html +++ b/docs/v11.1.0/integrator-guide/workflow-ts-failed-step/index.html @@ -5,13 +5,13 @@ Workflow - Troubleshoot Failed Step(s) | Cumulus Documentation - +
    Version: v11.1.0

    Workflow - Troubleshoot Failed Step(s)

    Steps

    1. Locate Step
    • Go to Cumulus dashboard
    • Find the granule
    • Go to Executions to determine the failed step
    2. Investigate in Cloudwatch
    • Go to Cloudwatch
    • Locate lambda
    • Search Cloudwatch logs
    3. Recreate Error

      In your sandbox environment, try to recreate the error.

    4. Resolution

    - + \ No newline at end of file diff --git a/docs/v11.1.0/interfaces/index.html b/docs/v11.1.0/interfaces/index.html index b8bebf03801..049c9f829cb 100644 --- a/docs/v11.1.0/interfaces/index.html +++ b/docs/v11.1.0/interfaces/index.html @@ -5,13 +5,13 @@ Interfaces | Cumulus Documentation - +
    Version: v11.1.0

    Interfaces

    Cumulus has multiple interfaces that allow interaction with discrete components of the system, such as starting workflows via SNS/Kinesis/SQS, manually queueing workflow start messages, submitting SNS notifications for completed workflows, and the many operations allowed by the Cumulus API.

    The diagram below illustrates the workflow process in detail and the various interfaces that allow starting of workflows, reporting of workflow information, and database create operations that occur when a workflow reporting message is processed. For interfaces with expected input or output schemas, details are provided below.

    Architecture diagram showing the interfaces for triggering and reporting of Cumulus workflow executions

    Workflow triggers and queuing

    Kinesis stream

    As a Kinesis stream is consumed by the messageConsumer Lambda to queue workflow executions, the incoming event is validated against this consumer schema by the ajv package.

    SQS queue for executions

    The messages put into the SQS queue for executions should conform to the Cumulus message format.

    Workflow executions

    See the documentation on Cumulus workflows.

    Workflow reporting

    SNS reporting topics

    For granule and PDR reporting, the topics will only receive data if the Cumulus workflow execution message meets the following criteria:

    • Granules - workflow message contains granule data in payload.granules
    • PDRs - workflow message contains PDR data in payload.pdr

    The messages published to the SNS reporting topics for executions and PDRs and the record property in the messages published to the granules SNS topic should conform to the model schema for each data type.

    Further detail on workflow reporting and how to interact with these interfaces can be found in the workflow notifications data cookbook.

    Cumulus API

    See the Cumulus API documentation.

    - + \ No newline at end of file diff --git a/docs/v11.1.0/operator-docs/about-operator-docs/index.html b/docs/v11.1.0/operator-docs/about-operator-docs/index.html index 65efb6018ec..f113a7c8f71 100644 --- a/docs/v11.1.0/operator-docs/about-operator-docs/index.html +++ b/docs/v11.1.0/operator-docs/about-operator-docs/index.html @@ -5,13 +5,13 @@ About Operator Docs | Cumulus Documentation - +
    Version: v11.1.0

    About Operator Docs

    Purpose

    Operator Docs are an augmentation to Cumulus documentation and Data Cookbooks. These documents will walk step-by-step through common Cumulus activities (that aren't necessarily as use-case directed as what you'd see in Data Cookbooks).

    What Is A Cumulus Operator

    Cumulus operators are those who work within Cumulus to ingest/archive data and manage collections. They may perform the following functions via the operator dashboard or API:

    • Configure providers and collections
    • Configure rules and monitor workflow executions
    • Monitor granule ingestion
    • Monitor system metrics
    - + \ No newline at end of file diff --git a/docs/v11.1.0/operator-docs/bulk-operations/index.html b/docs/v11.1.0/operator-docs/bulk-operations/index.html index fac3dad0d64..cbbbb2b2293 100644 --- a/docs/v11.1.0/operator-docs/bulk-operations/index.html +++ b/docs/v11.1.0/operator-docs/bulk-operations/index.html @@ -5,14 +5,14 @@ Bulk Operations | Cumulus Documentation - +
    Version: v11.1.0

    Bulk Operations

    Cumulus implements bulk operations through the use of AsyncOperations, which are long-running processes executed on an AWS ECS cluster.

    Submitting a bulk API request

    Bulk operations are generally submitted via the endpoint for the relevant data type, e.g. granules. For a list of supported API requests, refer to the Cumulus API documentation. Bulk operations are denoted with the keyword 'bulk'.
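
    For illustration, a bulk granule operation submitted directly to the API might look like the sketch below, which applies a workflow to granules matched by an Elasticsearch query. The base URL, token, index, workflow name, and query are placeholder values; see the Cumulus API documentation for the authoritative request shape.

    $ curl --request POST https://example.com/granules/bulk \
    --header 'Authorization: Bearer ReplaceWithTheToken' \
    --header 'Content-Type: application/json' \
    --data '{
    "workflowName": "MyWorkflow",
    "index": "my-granule-index",
    "query": {
    "query": {
    "match": {
    "granuleId": "GRANULE.A2017025"
    }
    }
    }
    }'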

    Starting bulk operations from the Cumulus dashboard

    Using a Kibana query

    Note: You must have configured your dashboard build with a KIBANAROOT environment variable in order for the Kibana link to render in the bulk granules modal

    1. From the Granules dashboard page, click on the "Run Bulk Granules" button, then select what type of action you would like to perform

      • Note: the rest of the process is the same regardless of what type of bulk action you perform
    2. From the bulk granules modal, click the "Open Kibana" link:

      Screenshot of Cumulus dashboard showing modal window for triggering bulk granule operations

    3. Once you have accessed Kibana, navigate to the "Discover" page. If this is your first time using Kibana, you may see a message like this at the top of the page:

      In order to visualize and explore data in Kibana, you'll need to create an index pattern to retrieve data from Elasticsearch.

      In that case, see the docs for creating an index pattern for Kibana

      Screenshot of Kibana user interface showing the &quot;Discover&quot; page for running queries

    4. Enter a query that returns the granule records that you want to use for bulk operations:

      Screenshot of Kibana user interface showing an example Kibana query and results

    5. Once the Kibana query is returning the results you want, click the "Inspect" link near the top of the page. A slide out tab with request details will appear on the right side of the page:

      Screenshot of Kibana user interface showing details of an example request

    6. In the slide out tab that appears on the right side of the page, click the "Request" link near the top and scroll down until you see the query property:

      Screenshot of Kibana user interface showing the Elasticsearch data request made for a given Kibana query

    7. Highlight and copy the query contents from Kibana. Go back to the Cumulus dashboard and paste the query contents from Kibana inside of the query property in the bulk granules request payload. It is expected that you should have a property of query nested inside of the existing query property:

      Screenshot of Cumulus dashboard showing modal window for triggering bulk granule operations with query information populated

    8. Add values for the index and workflowName to the bulk granules request payload. The value for index will vary based on your Elasticsearch setup, but it is good to target an index specifically for granule data if possible:

      Screenshot of Cumulus dashboard showing modal window for triggering bulk granule operations with query, index, and workflow information populated

    9. Click the "Run Bulk Operations" button. You should see a confirmation message, including an ID for the async operation that was started to handle your bulk action. You can track the status of this async operation on the Operations dashboard page, which can be visited by clicking the "Go To Operations" button:

      Screenshot of Cumulus dashboard showing confirmation message with async operation ID for bulk granules request

    Creating an index pattern for Kibana

    1. Define the index pattern for the indices that your Kibana queries should use. A wildcard character, *, will match across multiple indices. Once you are satisfied with your index pattern, click the "Next step" button:

      Screenshot of Kibana user interface for defining an index pattern

    2. Choose whether to use a Time Filter for your data, which is not required. Then click the "Create index pattern" button:

      Screenshot of Kibana user interface for configuring the settings of an index pattern

    Status Tracking

    All bulk operations return an AsyncOperationId which can be submitted to the /asyncOperations endpoint.

    The /asyncOperations endpoint allows listing of AsyncOperation records as well as record retrieval for individual records, which will contain the status. The Cumulus API documentation shows sample requests for these actions.
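
    For example, an individual record could be retrieved with a request like the following sketch; the base URL, token, and operation ID are placeholders:

    $ curl --request GET "https://example.com/asyncOperations/<asyncOperationId>" \
    --header 'Authorization: Bearer ReplaceWithTheToken'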

    The Cumulus Dashboard also includes an Operations monitoring page, where operations and their status are visible:

    Screenshot of Cumulus Dashboard Operations Page showing 5 operations and their status, ID, description, type and creation timestamp

    - + \ No newline at end of file diff --git a/docs/v11.1.0/operator-docs/cmr-operations/index.html b/docs/v11.1.0/operator-docs/cmr-operations/index.html index 8218a16e718..0fac0980262 100644 --- a/docs/v11.1.0/operator-docs/cmr-operations/index.html +++ b/docs/v11.1.0/operator-docs/cmr-operations/index.html @@ -5,7 +5,7 @@ CMR Operations | Cumulus Documentation - + @@ -16,7 +16,7 @@ UpdateCmrAccessConstraints will update CMR metadata file contents on S3, and PostToCmr will push the updates to CMR. The rest of this section will assume you have created this workflow under the name UpdateCmrAccessConstraints.

    Once created and deployed, the workflow is available in the Cumulus dashboard's Execute workflow selector. However, note that additional configuration is required for this request: you must supply an access constraint integer value and an optional description to the UpdateCmrAccessConstraints workflow by clicking the Add Custom Workflow Meta option in the Execute popup, as shown below:

    Screenshot showing granule execute popup with &#39;updateCmrAccessConstraints&#39; selected and configuration values shown in a collapsible JSON field

    An example invocation of the API to perform this action is:

    $ curl --request PUT https://example.com/granules/MOD11A1.A2017137.h19v16.006.2017138085750 \
    --header 'Authorization: Bearer ReplaceWithTheToken' \
    --header 'Content-Type: application/json' \
    --data '{
    "action": "applyWorkflow",
    "workflow": "updateCmrAccessConstraints",
    "meta": {
    "accessConstraints": {
    "value": 5,
    "description": "sample access constraint"
    }
    }
    }'

    Supported CMR metadata formats for the above operation are Echo10XML and UMMG-JSON, which will populate the RestrictionFlag and RestrictionComment fields in Echo10XML, or the AccessConstraints values in UMMG-JSON.

    Additional Operations

    At this time Cumulus does not, out of the box, support additional operations on CMR metadata. However, given the examples shown above, we recommend working with your integrators to develop additional workflows that perform any required operations.

    Bulk CMR operations

    In order to perform the above operations in bulk, Cumulus supports the use of ApplyWorkflow in an AsyncOperation. These are accessed via the Bulk Operation button on the dashboard, or the /granules/bulk endpoint on the Cumulus API.

    More information on bulk operations are in the bulk operations operator doc.

    - + \ No newline at end of file diff --git a/docs/v11.1.0/operator-docs/create-rule-in-cumulus/index.html b/docs/v11.1.0/operator-docs/create-rule-in-cumulus/index.html index 7c045252468..bb2773f937e 100644 --- a/docs/v11.1.0/operator-docs/create-rule-in-cumulus/index.html +++ b/docs/v11.1.0/operator-docs/create-rule-in-cumulus/index.html @@ -5,13 +5,13 @@ Create Rule In Cumulus | Cumulus Documentation - +
    Version: v11.1.0

    Create Rule In Cumulus

    Once the above files are in place and the entries created in CMR and Cumulus, we are ready to begin ingesting data. Depending on the type of ingestion (FTP/Kinesis, etc) the values below will change, but for the most part they are all similar. Rules tell Cumulus how to associate providers and collections, and when/how to start processing a workflow.

    Steps

    1. Go To Rules Page
    • Go to the Cumulus dashboard, click on Rules in the navigation.
    • Click Add Rule.

    Screenshot of Rules page

    2. Complete Form
    • Fill out the template form.

    Screenshot of a Rules template for adding a new rule

    For more details regarding the field definitions and required information go to Data Cookbooks.

    Note: If the state field is left blank, it defaults to false.

    Examples

    • A rule form with completed required fields:

    Screenshot of a completed rule form

    • A successfully added Rule:

    Screenshot of created rule

    - + \ No newline at end of file diff --git a/docs/v11.1.0/operator-docs/discovery-filtering/index.html b/docs/v11.1.0/operator-docs/discovery-filtering/index.html index a0964db472d..f40119d405e 100644 --- a/docs/v11.1.0/operator-docs/discovery-filtering/index.html +++ b/docs/v11.1.0/operator-docs/discovery-filtering/index.html @@ -5,7 +5,7 @@ Discovery Filtering | Cumulus Documentation - + @@ -24,7 +24,7 @@ directly list the provider_path. If the path contains regular expression components, this may fail.

    It is recommended that operators diagnose any failures by checking error logs and ensuring that permissions on the remote file system allow reading of the default directory and any subdirectories that match the filter.

    Supported protocols

    Currently support for this feature is limited to the following protocols:

    • ftp
    • sftp
    - + \ No newline at end of file diff --git a/docs/v11.1.0/operator-docs/granule-workflows/index.html b/docs/v11.1.0/operator-docs/granule-workflows/index.html index 1f5e20d503a..c4ab521cd54 100644 --- a/docs/v11.1.0/operator-docs/granule-workflows/index.html +++ b/docs/v11.1.0/operator-docs/granule-workflows/index.html @@ -5,13 +5,13 @@ Granule Workflows | Cumulus Documentation - +
    Version: v11.1.0

    Granule Workflows

    Failed Granule

    Delete and Ingest

    1. Delete Granule

    Note: Granules published to CMR will need to be removed from CMR via the dashboard prior to deletion.

    2. Ingest Granule via Ingest Rule
    • Re-triggering a one-time, Kinesis, SQS, or SNS rule (or a scheduled rule) will re-discover and reingest the deleted granule.

    Reingest

    1. Select Failed Granule
    • In the Cumulus dashboard, go to the Collections page.
    • Use search field to find the granule.
    2. Re-ingest Granule
    • Go to the Collections page.
    • Click on Reingest and a modal will pop up for your confirmation.

    Screenshot of the Reingest modal workflow

    Delete and Ingest

    1. Bulk Delete Granules
    • Go to the Granules page.
    • Use the Bulk Delete button to bulk delete selected granules or select via a Kibana query

    Note: You can optionally force deletion from CMR

    2. Ingest Granules via Ingest Rule
    • Re-triggering one-time, Kinesis, SQS, or SNS rules (or scheduled rules) will re-discover and reingest the deleted granules.

    Multiple Failed Granules

    1. Select Failed Granules
    • In the Cumulus dashboard, go to the Collections page.
    • Click on Failed Granules.
    • Select multiple granules.

    Screenshot of selected multiple granules

    2. Bulk Re-ingest Granules
    • Click on Reingest and a modal will pop up for your confirmation.

    Screenshot of Bulk Reingest modal workflow

    - + \ No newline at end of file diff --git a/docs/v11.1.0/operator-docs/kinesis-stream-for-ingest/index.html b/docs/v11.1.0/operator-docs/kinesis-stream-for-ingest/index.html index 4805eb1f68a..99a3f4a97a0 100644 --- a/docs/v11.1.0/operator-docs/kinesis-stream-for-ingest/index.html +++ b/docs/v11.1.0/operator-docs/kinesis-stream-for-ingest/index.html @@ -5,13 +5,13 @@ Setup Kinesis Stream & CNM Message | Cumulus Documentation - +
    Version: v11.1.0

    Setup Kinesis Stream & CNM Message

    Note: Keep in mind that you should only have to set this up once per ingest stream. Kinesis pricing is based on the shard value, not on the amount of Kinesis usage.

    1. Create a Kinesis Stream

      • In your AWS console, go to the Kinesis service and click Create Data Stream.
      • Assign a name to the stream.
      • Apply a shard value of 1.
      • Click on Create Kinesis Stream.
      • A status page with stream details will display. Once the status is active, the stream is ready to use. Record the streamName and StreamARN for later use (an equivalent AWS CLI sketch is shown below).

      Screenshot of AWS console page for creating a Kinesis stream
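
      As an alternative to the console, a stream with a single shard can also be created from the AWS CLI; the stream name below is a placeholder for your own value.

      # Create a stream with a single shard (stream name is a placeholder)
      aws kinesis create-stream --stream-name my-ingest-stream --shard-count 1

      # Check the stream status and record the StreamARN for later use
      aws kinesis describe-stream-summary --stream-name my-ingest-stream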

    2. Create a Rule

    3. Send a message

      • Send a message that matches your schema using Python or the AWS CLI (see the sketch below).
      • The streamName and Collection must match the kinesisArn+collection defined in the rule that you have created in Step 2.
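
      For example, a CNM message saved in a local file could be published with the AWS CLI. The stream name, partition key, and file name are placeholders, and the message contents must conform to the CNM schema and to the collection configured in your rule.

      # Publish a CNM message stored in a local JSON file (AWS CLI v2 syntax)
      aws kinesis put-record \
        --stream-name my-ingest-stream \
        --partition-key my-partition-key \
        --cli-binary-format raw-in-base64-out \
        --data file://cnm-message.json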
    - + \ No newline at end of file diff --git a/docs/v11.1.0/operator-docs/locating-access-logs/index.html b/docs/v11.1.0/operator-docs/locating-access-logs/index.html index 77344b9d389..56bd1ad2323 100644 --- a/docs/v11.1.0/operator-docs/locating-access-logs/index.html +++ b/docs/v11.1.0/operator-docs/locating-access-logs/index.html @@ -5,13 +5,13 @@ Locating S3 Access Logs | Cumulus Documentation - +
    Version: v11.1.0

    Locating S3 Access Logs

    When enabling S3 Access Logs for EMS Reporting you configured a TargetBucket and TargetPrefix. Inside the TargetBucket at the TargetPrefix is where you will find the raw S3 access logs.

    In a standard deployment, this will be your stack's <internal bucket name> and a key prefix of <stack>/ems-distribution/s3-server-access-logs/
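
    For example, the raw logs can be listed with the AWS CLI; the bucket and stack names below are placeholders for your deployment's values:

    # List raw S3 server access logs at the configured TargetBucket/TargetPrefix
    aws s3 ls "s3://<internal bucket name>/<stack>/ems-distribution/s3-server-access-logs/"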

    - + \ No newline at end of file diff --git a/docs/v11.1.0/operator-docs/naming-executions/index.html b/docs/v11.1.0/operator-docs/naming-executions/index.html index b6f30e76492..2fdbeab78e3 100644 --- a/docs/v11.1.0/operator-docs/naming-executions/index.html +++ b/docs/v11.1.0/operator-docs/naming-executions/index.html @@ -5,7 +5,7 @@ Naming Executions | Cumulus Documentation - + @@ -21,7 +21,7 @@ QueuePdrs step.

    In the following excerpt, the QueueGranules config.executionNamePrefix property is set using the value configured in the workflow's meta.executionNamePrefix.

    Please note: This meta.executionNamePrefix property should not be confused with the optional rule executionNamePrefix property from the previous section. Setting executionNamePrefix as a root property of the rule will set a prefix for the names of any workflows triggered by the rule. Setting meta.executionNamePrefix on the rule will set meta.executionNamePrefix in the workflow messages generated for this rule, allowing workflow steps like QueueGranules to read from the message meta.executionNamePrefix for their config. Then, workflows scheduled by QueueGranules would use the configured execution name prefix.

    Setting executionNamePrefix config for QueueGranules using rule.meta

    If you wanted to use a prefix of "my-prefix", you would create a rule with a meta property similar to the following Rule snippet:

    {
    ...other rule keys here...
    "meta":
    {
    "executionNamePrefix": "my-prefix"
    }
    }

    The value of meta.executionNamePrefix from the rule will be set as meta.executionNamePrefix in the workflow message.

    Then, the workflow could contain a "QueueGranules" step with the following state, which uses meta.executionNamePrefix from the message as the value for the executionNamePrefix config to the "QueueGranules" step:

    {
    "QueueGranules": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "queueUrl": "${start_sf_queue_url}",
    "provider": "{$.meta.provider}",
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "granuleIngestWorkflow": "${ingest_granule_workflow_name}",
    "executionNamePrefix": "{$.meta.executionNamePrefix}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${queue_granules_task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "End": true
    }
    }
    - + \ No newline at end of file diff --git a/docs/v11.1.0/operator-docs/ops-common-use-cases/index.html b/docs/v11.1.0/operator-docs/ops-common-use-cases/index.html index 5c50ca104de..a9a7eb8a487 100644 --- a/docs/v11.1.0/operator-docs/ops-common-use-cases/index.html +++ b/docs/v11.1.0/operator-docs/ops-common-use-cases/index.html @@ -5,13 +5,13 @@ Operator Common Use Cases | Cumulus Documentation - + - + \ No newline at end of file diff --git a/docs/v11.1.0/operator-docs/trigger-workflow/index.html b/docs/v11.1.0/operator-docs/trigger-workflow/index.html index 512b3c11441..5c0ae67535a 100644 --- a/docs/v11.1.0/operator-docs/trigger-workflow/index.html +++ b/docs/v11.1.0/operator-docs/trigger-workflow/index.html @@ -5,13 +5,13 @@ Trigger a Workflow Execution | Cumulus Documentation - +
    Version: v11.1.0

    Trigger a Workflow Execution

    To trigger a workflow, you need to create a rule. To trigger an ingest workflow, one that requires discovering and ingesting data, you will also need to configure the collection and provider and associate those to a rule.

    Trigger a HelloWorld Workflow

    To trigger a HelloWorld workflow that does not need to discover or archive data, you just need to create a rule.

    You can leave the provider and collection blank and do not need any additional metadata. If you create a onetime rule, the workflow execution will start momentarily and you can view its status on the Executions page.

    Trigger an Ingest Workflow

    To ingest data, you will need a provider and collection configured to tell your workflow where to discover data and where to archive the data respectively.

    Follow the instructions to create a provider and create a collection and configure their fields for your data ingest.

    In the rule's additional metadata, you can specify a provider_path that tells the workflow where to find the data on the provider.

    Example: Ingest data from S3

    Setup

    Assume there are 2 files to be ingested in an S3 bucket called discovery-bucket, located in the test-data folder:

    • GRANULE.A2017025.jpg
    • GRANULE.A2017025.hdf

    Archive buckets should already be created and mapped to public / private / protected in the Cumulus deployment.

    For example:

    buckets = {
    private = {
    name = "discovery-bucket"
    type = "private"
    },
    protected = {
    name = "archive-protected"
    type = "protected"
    }
    public = {
    name = "archive-public"
    type = "public"
    }
    }

    Create a provider

    Create a new provider. Set protocol to S3 and Host to discovery-bucket.

    Screenshot of adding a sample S3 provider

    Create a collection

    Create a new collection. Configure the collection to extract the granule id from the filenames and configure where to store the granule files.

    The configuration below will store hdf files in the protected bucket and jpg files in the public bucket. The bucket names in the files configuration refer to the bucket types defined in the deployment's buckets map shown above.

    {
    "name": "test-collection",
    "version": "001",
    "granuleId": "^GRANULE\\.A[\\d]{7}$",
    "granuleIdExtraction": "(GRANULE\\..*)(\\.hdf|\\.jpg)",
    "reportToEms": false,
    "sampleFileName": "GRANULE.A2017025.hdf",
    "files": [
    {
    "bucket": "protected",
    "regex": "^GRANULE\\.A[\\d]{7}\\.hdf$",
    "sampleFileName": "GRANULE.A2017025.hdf"
    },
    {
    "bucket": "public",
    "regex": "^GRANULE\\.A[\\d]{7}\\.jpg$",
    "sampleFileName": "GRANULE.A2017025.jpg"
    }
    ]
    }

    Create a rule

    Create a rule to trigger the workflow to discover your granule data and ingest your granule.

    Select the previously created provider and collection. See the Cumulus Discover Granules workflow for a workflow example of using Cumulus tasks to discover and queue data for ingest.

    In the rule meta, set the provider_path to test-data, so the test-data folder will be used to discover new granules.

    Screenshot of adding a Discover Granules rule

    A onetime rule will run your workflow on-demand and you can view it on the dashboard Executions page. The Cumulus Discover Granules workflow will trigger an ingest workflow and your ingested granules will be visible on the dashboard Granules page.
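
    If you prefer the API over the dashboard, a rule along these lines could also be created with a request like the sketch below. The base URL, token, and all names shown (rule, provider, collection, workflow) are placeholders for whatever exists in your deployment, and the fields follow the Cumulus rule schema.

    $ curl --request POST https://example.com/rules \
    --header 'Authorization: Bearer ReplaceWithTheToken' \
    --header 'Content-Type: application/json' \
    --data '{
    "name": "s3_discover_test_data",
    "workflow": "DiscoverGranules",
    "provider": "s3_provider",
    "collection": {
    "name": "test-collection",
    "version": "001"
    },
    "rule": {
    "type": "onetime"
    },
    "state": "ENABLED",
    "meta": {
    "provider_path": "test-data"
    }
    }'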

    - + \ No newline at end of file diff --git a/docs/v11.1.0/tasks/index.html b/docs/v11.1.0/tasks/index.html index db93fe8f1e2..15cf57b4a87 100644 --- a/docs/v11.1.0/tasks/index.html +++ b/docs/v11.1.0/tasks/index.html @@ -5,13 +5,13 @@ Cumulus Tasks | Cumulus Documentation - +
    Version: v11.1.0

    Cumulus Tasks

    A list of reusable Cumulus tasks. Add your own.

    Tasks

    @cumulus/add-missing-file-checksums

    Add checksums to files in S3 which don't have one


    @cumulus/discover-granules

    Discover Granules in FTP/HTTP/HTTPS/SFTP/S3 endpoints


    @cumulus/discover-pdrs

    Discover PDRs in FTP and HTTP endpoints


    @cumulus/files-to-granules

    Converts array-of-files input into a granules object by extracting granuleId from filename


    @cumulus/hello-world

    Example task


    @cumulus/hyrax-metadata-updates

    Update granule metadata with hooks to OPeNDAP URL


    @cumulus/lzards-backup

    Run LZARDS backup


    @cumulus/move-granules

    Move granule files from staging to final location


    @cumulus/parse-pdr

    Download and Parse a given PDR


    @cumulus/pdr-status-check

    Checks execution status of granules in a PDR


    @cumulus/post-to-cmr

    Post a given granule to CMR


    @cumulus/queue-granules

    Add discovered granules to the queue


    @cumulus/queue-pdrs

    Add discovered PDRs to a queue


    @cumulus/queue-workflow

    Add workflow to the queue


    @cumulus/sf-sqs-report

    Sends an incoming Cumulus message to SQS


    @cumulus/sync-granule

    Download a given granule


    @cumulus/test-processing

    Fake processing task used for integration tests


    @cumulus/update-cmr-access-constraints

    Updates CMR metadata to set access constraints


    @cumulus/update-granules-cmr-metadata-file-links

    Update CMR metadata files with correct online access urls and etags and transfer etag info to granules' CMR files

    - + \ No newline at end of file diff --git a/docs/v11.1.0/team/index.html b/docs/v11.1.0/team/index.html index 6bfc6f3257b..5ec25284d1e 100644 --- a/docs/v11.1.0/team/index.html +++ b/docs/v11.1.0/team/index.html @@ -5,13 +5,13 @@ Cumulus Team | Cumulus Documentation - + - + \ No newline at end of file diff --git a/docs/v11.1.0/troubleshooting/index.html b/docs/v11.1.0/troubleshooting/index.html index 6b912a26cf5..e3e048fba80 100644 --- a/docs/v11.1.0/troubleshooting/index.html +++ b/docs/v11.1.0/troubleshooting/index.html @@ -5,14 +5,14 @@ How to Troubleshoot and Fix Issues | Cumulus Documentation - +
    Version: v11.1.0

    How to Troubleshoot and Fix Issues

    While Cumulus is a complex system, there is a focus on maintaining the integrity and availability of the system and data. Should you encounter errors or issues while using this system, this section will help troubleshoot and solve those issues.

    Backup and Restore

    Cumulus has backup and restore functionality built-in to protect Cumulus data and allow recovery of a Cumulus stack. This is currently limited to Cumulus data and not full S3 archive data. Backup and restore is not enabled by default and must be enabled and configured to take advantage of this feature.

    For more information, read the Backup and Restore documentation.

    Elasticsearch reindexing

    If you run into issues with your Elasticsearch index, a reindex operation is available via the Cumulus API. See the Reindexing Guide.

    Information on how to reindex Elasticsearch is in the Cumulus API documentation.

    Troubleshooting Workflows

    Workflows are state machines composed of tasks and services, and each component logs to CloudWatch. The CloudWatch logs for all steps in the execution are displayed in the Cumulus dashboard, or you can find them by going to CloudWatch and navigating to the logs for that particular task.

    Workflow Errors

    Visual representations of executed workflows can be found in the Cumulus dashboard or the AWS Step Functions console for that particular execution.

    If a workflow errors, the error will be handled according to the error handling configuration. The task that fails will have the exception field populated in the output, giving information about the error. Further information can be found in the CloudWatch logs for the task.

    Graph of AWS Step Function execution showing a failing workflow

    Workflow Did Not Start

    Generally, first check your rule configuration. If that is satisfactory, the answer will likely be in the CloudWatch logs for the schedule SF or SF starter lambda functions. See the workflow triggers page for more information on how workflows start.

    For Kinesis and SNS rules specifically, if an error occurs during the message consumer process, the fallback consumer lambda will be called and if the message continues to error, a message will be placed on the dead letter queue. Check the dead letter queue for a failure message. Errors can be traced back to the CloudWatch logs for the message consumer and the fallback consumer. Additionally, check that the name and version match those configured in your rule, as rules are filtered by the notification's collection name and version before scheduling executions.

    More information on kinesis error handling is here.

    Operator API Errors

    All operator API calls are funneled through the ApiEndpoints lambda. Each API call is logged to the ApiEndpoints CloudWatch log for your deployment.

    Lambda Errors

    KMS Exception: AccessDeniedException

    KMS Exception: AccessDeniedExceptionKMS Message: The ciphertext refers to a customer master key that does not exist, does not exist in this region, or you are not allowed to access.

    The above error was thrown by a Cumulus Lambda function invocation. The KMS key is the encryption key used to encrypt Lambda environment variables. The root cause of this error is unknown, but it is speculated to be caused by deleting and recreating, with the same name, the IAM role the Lambda uses.

    This error can be resolved by switching the lambda's execution role to a different one and then back through the Lambda management console. Unfortunately, this approach doesn't scale well.

    The other resolution (that scales but takes some time) that was found is as follows:

    1. Comment out all lambda definitions (and dependent resources) in your Terraform configuration.
    2. terraform apply to delete the lambdas.
    3. Un-comment the definitions.
    4. terraform apply to recreate the lambdas.

    If this problem occurs with Core lambdas and you are using the terraform-aws-cumulus.zip file source distributed in our release, we recommend using the non-scaling approach, as the number of lambdas we distribute is in the low teens and they are likely to be easier and faster to reconfigure one-by-one than by editing our configs.

    Error: Unable to import module 'index': Error

    This error is shown in the CloudWatch logs for a Lambda function.

    One possible cause is that the Lambda definition in the .tf file defining the lambda is not pointing to the correct packaged lambda source file. In order to resolve this issue, update the lambda definition to point directly to the packaged (e.g. .zip) lambda source file.

    resource "aws_lambda_function" "discover_granules_task" {
    function_name = "${var.prefix}-DiscoverGranules"
    filename = "${path.module}/../../tasks/discover-granules/dist/lambda.zip"
    handler = "index.handler"
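    # Other required arguments (e.g. role, runtime) are omitted from this excerpt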
    }

    If you are seeing this error when using the Lambda as a step in a Cumulus workflow, then inspect the output for this Lambda step in the AWS Step Function console. If you see the error Cannot find module 'node_modules/@cumulus/cumulus-message-adapter-js', then you need to ensure the lambda's packaged dependencies include cumulus-message-adapter-js.

    - + \ No newline at end of file diff --git a/docs/v11.1.0/troubleshooting/reindex-elasticsearch/index.html b/docs/v11.1.0/troubleshooting/reindex-elasticsearch/index.html index 34ca83aeb36..ce4f301e998 100644 --- a/docs/v11.1.0/troubleshooting/reindex-elasticsearch/index.html +++ b/docs/v11.1.0/troubleshooting/reindex-elasticsearch/index.html @@ -5,7 +5,7 @@ Reindexing Elasticsearch Guide | Cumulus Documentation - + @@ -14,7 +14,7 @@ current index, or the mappings for an index have been updated (they do not update automatically). Any reindexing that will be required when upgrading Cumulus will be in the Migration Steps section of the changelog.

    Switch to a new index and Reindex

    There are two operations needed: reindex and change-index to switch over to the new index. A Change Index/Reindex can be done in either order, but both have their trade-offs.

    If you decide to point Cumulus to a new (empty) index first (with a change index operation), and then reindex the data to the new index, data ingested while reindexing will automatically be sent to the new index. As reindexing operations can take a while, not all the data will show up on the Cumulus Dashboard right away. The advantage is that you do not have to turn off any ingest operations. This approach is recommended.

    If you decide to Reindex data to a new index first, and then point Cumulus to that new index, it is not guaranteed that data that is sent to the old index while reindexing will show up in the new index. If you prefer this way, it is recommended to turn off any ingest operations. This order will keep your dashboard data from seeing any interruption.

    Change Index

    This will point Cumulus to the index in Elasticsearch that will be used when retrieving data. Performing a change index operation to an index that does not exist yet will create the index for you. The change index operation can be found here.

    Reindex from the old index to the new index

    The reindex operation will take the data from one index and copy it into another index. The reindex operation can be found here.

    Reindex status

    Reindexing is a long-running operation. The reindex-status endpoint can be used to monitor the progress of the operation.
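
    As a sketch, the status could be polled with a request like the following; the path shown assumes the Elasticsearch endpoints documented in the Cumulus API, and the base URL and token are placeholders:

    $ curl --request GET https://example.com/elasticsearch/reindex-status \
    --header 'Authorization: Bearer ReplaceWithTheToken'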

    Index from database

    If you want to just grab the data straight from the database you can perform an Index from Database Operation. After the data is indexed from the database, a Change Index operation will need to be performed to ensure Cumulus is pointing to the right index. It is strongly recommended to turn off workflow rules when performing this operation so any data ingested to the database is not lost.

    Validate reindex

    To validate the reindex, use the reindex-status endpoint. The doc count can be used to verify that the reindex was successful. In the below example the reindex from cumulus-2020-11-3 to cumulus-2021-3-4 was not fully successful as they show different doc counts.

    "indices": {
    "cumulus-2020-11-3": {
    "primaries": {
    "docs": {
    "count": 21096512,
    "deleted": 176895
    }
    },
    "total": {
    "docs": {
    "count": 21096512,
    "deleted": 176895
    }
    }
    },
    "cumulus-2021-3-4": {
    "primaries": {
    "docs": {
    "count": 715949,
    "deleted": 140191
    }
    },
    "total": {
    "docs": {
    "count": 715949,
    "deleted": 140191
    }
    }
    }
    }

    To further drill down into what is missing, log in to the Kibana instance (found in the Elasticsearch section of the AWS console) and run the following command replacing <index> with your index name.

    GET <index>/_search
    {
      "aggs": {
        "count_by_type": {
          "terms": {
            "field": "_type"
          }
        }
      },
      "size": 0
    }

    which will produce a result like

    "aggregations": {
    "count_by_type": {
    "doc_count_error_upper_bound": 0,
    "sum_other_doc_count": 0,
    "buckets": [
    {
    "key": "logs",
    "doc_count": 483955
    },
    {
    "key": "execution",
    "doc_count": 4966
    },
    {
    "key": "deletedgranule",
    "doc_count": 4715
    },
    {
    "key": "pdr",
    "doc_count": 1822
    },
    {
    "key": "granule",
    "doc_count": 740
    },
    {
    "key": "asyncOperation",
    "doc_count": 616
    },
    {
    "key": "provider",
    "doc_count": 108
    },
    {
    "key": "collection",
    "doc_count": 87
    },
    {
    "key": "reconciliationReport",
    "doc_count": 48
    },
    {
    "key": "rule",
    "doc_count": 7
    }
    ]
    }
    }

    Resuming a reindex

    If a reindex operation did not fully complete, it can be resumed using the following command, run from the Kibana instance.

    POST _reindex?wait_for_completion=false
    {
      "conflicts": "proceed",
      "source": {
        "index": "cumulus-2020-11-3"
      },
      "dest": {
        "index": "cumulus-2021-3-4",
        "op_type": "create"
      }
    }

    The Cumulus API reindex-status endpoint can be used to monitor completion of this operation.

    Version: v11.1.0

    Re-running workflow executions

    To re-run a Cumulus workflow execution from the AWS console:

    1. Visit the page for an individual workflow execution

    2. Click the "New execution" button at the top right of the screen

      Screenshot of the AWS console for a Step Function execution highlighting the "New execution" button at the top right of the screen

    3. In the "New execution" modal that appears, replace the cumulus_meta.execution_name value in the default input with the value of the new execution ID as seen in the screenshot below

      Screenshot of the AWS console showing the modal window for entering input when running a new Step Function execution

    4. Click the "Start execution" button
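    As a sketch for step 3 (previous-input.json and the execution ID are placeholders, and jq is assumed to be available), you could prepare the modified input on the command line before pasting it into the modal:

    # Replace the old execution name with the new execution ID in a saved copy
    # of the original input, then paste the result into the "New execution" modal.
    jq '.cumulus_meta.execution_name = "<new-execution-id>"' previous-input.json > new-input.json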

    Troubleshooting Deployment

    … data-persistence modules, but your config is only creating one Elasticsearch instance. To fix the issue, update the elasticsearch_config variable for your data-persistence module to increase the number of instances:

    {
      domain_name    = "es"
      instance_count = 2
      instance_type  = "t2.small.elasticsearch"
      version        = "5.3"
      volume_size    = 10
    }

    Install dashboard

    Dashboard configuration

    Issues:

    • Problem clearing the cache: EACCES: permission denied, rmdir '/tmp/gulp-cache/default'. This probably means the files at that location, and/or the folder, are owned by someone else (or some other factor prevents you from writing there).

    It's possible to work around this by editing the file cumulus-dashboard/node_modules/gulp-cache/index.js and altering the value of the line var fileCache = new Cache({cacheDirName: 'gulp-cache'}); to something like var fileCache = new Cache({cacheDirName: '<prefix>-cache'});. Now gulp-cache will be able to write to /tmp/<prefix>-cache/default, and the error should resolve.
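    As a sketch (assuming GNU sed and that the quoted line appears exactly as above), the edit could be scripted like so; adjust the path and <prefix> for your checkout:

    # Hypothetical one-liner for the workaround described above; verify the file
    # contents before and after, since the exact line may differ between versions.
    sed -i "s/cacheDirName: 'gulp-cache'/cacheDirName: '<prefix>-cache'/" \
      cumulus-dashboard/node_modules/gulp-cache/index.js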

    Dashboard deployment

    Issues:

    • If the dashboard sends you to an Earthdata Login page that has an error reading "Invalid request, please verify the client status or redirect_uri before resubmitting", this means one of the following: you've forgotten to update one or more of your EARTHDATA_CLIENT_ID and EARTHDATA_CLIENT_PASSWORD environment variables (from your app/.env file) and re-deploy Cumulus, you haven't placed the correct values in them, or you've forgotten to add both the "redirect" and "token" URLs to the Earthdata Application.
    • There is odd caching behavior associated with the dashboard and Earthdata Login at this point in time that can cause the above error to reappear on the Earthdata Login page loaded by the dashboard even after fixing the cause of the error. If you experience this, attempt to access the dashboard in a new browser window, and it should work.
    - + \ No newline at end of file diff --git a/docs/v11.1.0/upgrade-notes/cumulus_distribution_migration/index.html b/docs/v11.1.0/upgrade-notes/cumulus_distribution_migration/index.html index 80cfa731509..2a4baa03dbe 100644 --- a/docs/v11.1.0/upgrade-notes/cumulus_distribution_migration/index.html +++ b/docs/v11.1.0/upgrade-notes/cumulus_distribution_migration/index.html @@ -5,14 +5,14 @@ Migrate from TEA deployment to Cumulus Distribution | Cumulus Documentation - +
    Version: v11.1.0

    Migrate from TEA deployment to Cumulus Distribution

    Background

    The Cumulus Distribution API is configured to use the AWS Cognito OAuth client. This API can be used instead of the Thin Egress App, which is the default distribution API if using the Deployment Template.

    Configuring a Cumulus Distribution deployment

    See these instructions for deploying the Cumulus Distribution API.

    Important note if migrating from TEA to Cumulus Distribution

    If you already have a deployment using the TEA distribution and want to switch to Cumulus Distribution, there will be an API Gateway change. This means that there will be downtime while you update your CloudFront endpoint to use the new API gateway.

    Version: v11.1.0

    Migrate TEA deployment to standalone module

    Background

    This document is only relevant for upgrades of Cumulus from versions < 3.x.x to versions > 3.x.x.

    Previous versions of Cumulus included deployment of the Thin Egress App (TEA) by default in the distribution module. As a result, Cumulus users who wanted to deploy a new version of TEA had to wait on a new release of Cumulus that incorporated that release.

    In order to give Cumulus users the flexibility to deploy newer versions of TEA whenever they want, deployment of TEA has been removed from the distribution module and Cumulus users must now add the TEA module to their deployment. Guidance on integrating the TEA module into your deployment is provided, or you can refer to the Cumulus core example deployment code for the thin_egress_app module.

    By default, when upgrading Cumulus and moving from TEA deployed via the distribution module to deployed as a separate module, your API gateway for TEA would be destroyed and re-created, which could cause outages for any Cloudfront endpoints pointing at that API gateway.

    These instructions outline how to modify your state to preserve your existing Thin Egress App (TEA) API gateway when upgrading Cumulus and moving deployment of TEA to a standalone module. If you do not care about preserving your API gateway for TEA when upgrading your Cumulus deployment, you can skip these instructions.

    Prerequisites

    Notes about state management

    These instructions will involve manipulating your Terraform state via terraform state mv commands. These operations are extremely dangerous, since a mistake in editing your Terraform state can leave your stack in a corrupted state where deployment may be impossible or may result in unanticipated resource deletion.

    Since bucket versioning preserves a separate version of your state file each time it is written, and the Terraform state modification commands overwrite the state file, we can mitigate the risk of these operations by downloading the most recent state file before starting the upgrade process. Then, if anything goes wrong during the upgrade, we can restore that previous state version. Guidance on how to perform both operations is provided below.

    Download your most recent state version

    Run this command to download the most recent cumulus deployment state file, replacing BUCKET and KEY with the correct values from cumulus-tf/terraform.tf:

     aws s3 cp s3://BUCKET/KEY /path/to/terraform.tfstate

    Restore a previous state version

    Upload the state file that was previously downloaded to the bucket/key for your state file, replacing BUCKET and KEY with the correct values from cumulus-tf/terraform.tf:

     aws s3 cp /path/to/terraform.tfstate s3://BUCKET/KEY

    Then run terraform plan, which will give an error because we manually overwrote the state file and it is now out of sync with the lock table Terraform uses to track your state file:

    Error: Error loading state: state data in S3 does not have the expected content.

    This may be caused by unusually long delays in S3 processing a previous state
    update. Please wait for a minute or two and try again. If this problem
    persists, and neither S3 nor DynamoDB are experiencing an outage, you may need
    to manually verify the remote state and update the Digest value stored in the
    DynamoDB table to the following value: <some-digest-value>

    To resolve this error, run this command and replace DYNAMO_LOCK_TABLE, BUCKET and KEY with the correct values from cumulus-tf/terraform.tf, and use the digest value from the previous error output:

    aws dynamodb put-item \
      --table-name DYNAMO_LOCK_TABLE \
      --item '{
        "LockID": {"S": "BUCKET/KEY-md5"},
        "Digest": {"S": "some-digest-value"}
      }'

    Now, if you re-run terraform plan, it should work as expected.

    Migration instructions

    Please note: These instructions assume that you are deploying the thin_egress_app module as shown in the Cumulus core example deployment code

    1. Ensure that you have downloaded the latest version of your state file for your cumulus deployment

    2. Find the URL for your <prefix>-thin-egress-app-EgressGateway API gateway. Confirm that you can access it in the browser and that it is functional.

    3. Run terraform plan. You should see output like (edited for readability):

      # module.thin_egress_app.aws_cloudformation_stack.thin_egress_app will be created
      + resource "aws_cloudformation_stack" "thin_egress_app" {

      # module.thin_egress_app.aws_s3_bucket.lambda_source will be created
      + resource "aws_s3_bucket" "lambda_source" {

      # module.thin_egress_app.aws_s3_bucket_object.cloudformation_template will be created
      + resource "aws_s3_bucket_object" "cloudformation_template" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive will be created
      + resource "aws_s3_bucket_object" "lambda_code_dependency_archive" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_source will be created
      + resource "aws_s3_bucket_object" "lambda_source" {

      # module.thin_egress_app.aws_security_group.egress_lambda[0] will be created
      + resource "aws_security_group" "egress_lambda" {

      ...

      # module.cumulus.module.distribution.module.thin_egress_app.aws_cloudformation_stack.thin_egress_app will be destroyed
      - resource "aws_cloudformation_stack" "thin_egress_app" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket.lambda_source will be destroyed
      - resource "aws_s3_bucket" "lambda_source" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.cloudformation_template will be destroyed
      - resource "aws_s3_bucket_object" "cloudformation_template" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive will be destroyed
      - resource "aws_s3_bucket_object" "lambda_code_dependency_archive" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_source will be destroyed
      - resource "aws_s3_bucket_object" "lambda_source" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_security_group.egress_lambda[0] will be destroyed
      - resource "aws_security_group" "egress_lambda" {
    4. Run the state modification commands. The commands must be run in exactly this order:

       # Move security group
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_security_group.egress_lambda module.thin_egress_app.aws_security_group.egress_lambda

      # Move TEA storage bucket
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket.lambda_source module.thin_egress_app.aws_s3_bucket.lambda_source

      # Move TEA lambda source code
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_source module.thin_egress_app.aws_s3_bucket_object.lambda_source

      # Move TEA lambda dependency code
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive

      # Move TEA Cloudformation template
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.cloudformation_template module.thin_egress_app.aws_s3_bucket_object.cloudformation_template

      # Move URS creds secret version
      terraform state mv module.cumulus.module.distribution.aws_secretsmanager_secret_version.thin_egress_urs_creds aws_secretsmanager_secret_version.thin_egress_urs_creds

      # Move URS creds secret
      terraform state mv module.cumulus.module.distribution.aws_secretsmanager_secret.thin_egress_urs_creds aws_secretsmanager_secret.thin_egress_urs_creds

      # Move TEA Cloudformation stack
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_cloudformation_stack.thin_egress_app module.thin_egress_app.aws_cloudformation_stack.thin_egress_app

      Depending on how you were supplying a bucket map to TEA, there may be an additional step. If you were specifying the bucket_map_key variable to the cumulus module to use a custom bucket map, then you can ignore this step and just ensure that the bucket_map_file variable to the TEA module uses that same S3 key. Otherwise, if you were letting Cumulus generate a bucket map for you, then you need to take this step to migrate that bucket map:

      # Move bucket map
      terraform state mv module.cumulus.module.distribution.aws_s3_bucket_object.bucket_map_yaml[0] aws_s3_bucket_object.bucket_map_yaml
    5. Run terraform plan again. You may still see a few additions/modifications pending like below, but you should not see any deletion of Thin Egress App resources pending:

      # module.thin_egress_app.aws_cloudformation_stack.thin_egress_app will be updated in-place
      ~ resource "aws_cloudformation_stack" "thin_egress_app" {

      # module.thin_egress_app.aws_s3_bucket_object.cloudformation_template will be updated in-place
      ~ resource "aws_s3_bucket_object" "cloudformation_template" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive will be updated in-place
      ~ resource "aws_s3_bucket_object" "lambda_code_dependency_archive" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_source will be updated in-place
      ~ resource "aws_s3_bucket_object" "lambda_source" {

      If you still see deletion of module.cumulus.module.distribution.module.thin_egress_app.aws_cloudformation_stack.thin_egress_app pending, then something went wrong and you should restore the previously downloaded state file version and start over from step 1. Otherwise, proceed to step 6.

    6. Once you have confirmed that everything looks as expected, run terraform apply.

    7. Visit the same API gateway from step 1 and confirm that it still works.

    Your TEA deployment has now been migrated to a standalone module, which gives you the ability to upgrade the deployed version of TEA independently of Cumulus releases.

    Version: v11.1.0

    Upgrade to CMA 2.0.2

    Updating a Cumulus Deployment to CMA 2.0.2

    Background

    The Cumulus Message Adapter has been updated in release 2.0.2 to no longer utilize the AWS Step Functions API to look up the defined name of a step function task for population in meta.workflow_tasks, but to instead use an incrementing integer field.

    Additionally, a bugfix was released in the form of v2.0.1/v2.0.2 following the initial 2.0.0 release, so all users should update to release 2.0.2.

    The update is not tied to a particular version of Core; however, the update should be done across all task components in order to ensure consistent execution records.

    Changes

    Execution Record Update

    This update functionally means that Cumulus tasks/activities using the CMA will now record an entry that looks like the following in meta.workflow_tasks, and more importantly in the tasks column for an execution record:

    Original

          "DiscoverGranules": {
    "name": "jk-tf-DiscoverGranules",
    "version": "$LATEST",
    "arn": "arn:aws:lambda:us-east-1:xxxxx:function:jk-tf-DiscoverGranules"
    },
    "QueueGranules": {
    "name": "jk-tf-QueueGranules",
    "version": "$LATEST",
    "arn": "arn:aws:lambda:us-east-1:xxxx:function:jk-tf-QueueGranules"
    }

    New

          "0": {
    "name": "jk-tf-DiscoverGranules",
    "version": "$LATEST",
    "arn": "arn:aws:lambda:us-east-1:xxxxx:function:jk-tf-DiscoverGranules"
    },
    "1": {
    "name": "jk-tf-QueueGranules",
    "version": "$LATEST",
    "arn": "arn:aws:lambda:us-east-1:xxxx:function:jk-tf-QueueGranules"
    }

    Actions Required

    The following should be done as part of a Cumulus stack update to utilize cumulus message adapter > 2.0.2:

    • Python tasks that utilize cumulus-message-adapter-python should be updated to use > 2.0.0, their lambdas rebuilt and Cumulus workflows reconfigured to use the updated version.

    • Python activities that utilize cumulus-process-py should be rebuilt using > 1.0.0 with updated dependencies, and have their images deployed/Cumulus configured to use the new version.

    • The cumulus-message-adapter v2.0.2 lambda layer should be made available in the deployment account, and the Cumulus deployment should be reconfigured to use it (via the cumulus_message_adapter_lambda_layer_version_arn variable in the cumulus module; see the sketch below). This should address all Core node.js tasks that utilize the CMA, and many contributed node.js/Java components.

    Once the above have been done, redeploy Cumulus to apply the configuration and the updates should be live.
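    As a sketch for the layer configuration mentioned above (the layer name below is a placeholder, not the actual published layer name; confirm the ARN for your account and region), you could look up the layer version ARN and wire it into your terraform.tfvars like so:

    # List published versions of the CMA layer in your account to find its ARN.
    # "<your-cma-layer-name>" is a placeholder for however the layer was published.
    aws lambda list-layer-versions --layer-name <your-cma-layer-name>

    # Point the cumulus module at that ARN (the value shown is illustrative).
    cat >> cumulus-tf/terraform.tfvars <<'EOF'
    cumulus_message_adapter_lambda_layer_version_arn = "arn:aws:lambda:us-east-1:123456789012:layer:<your-cma-layer-name>:1"
    EOF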

    Version: v11.1.0

    Updates to task granule file schemas

    Background

    Most Cumulus workflow tasks expect as input a payload of granule(s) which contain the files for each granule. Most tasks also return this same granule structure as output.

    However, up to this point, there was inconsistency in the schemas for the granule files objects expected by each task. Furthermore, there was no guarantee of consistency between granule files objects as stored in the database and the expectations of any given workflow task.

    Thus, when performing bulk granule operations which pass granules from the database into a Cumulus workflow, it was possible for there to be schema validation failures depending on which task was used to start the workflow and its particular schema.

    In order to rectify this situation, CUMULUS-2388 was filed and addressed to create a common granule files schema between nearly all of the Cumulus tasks (exceptions discussed below) and the Cumulus database. The following documentation explains the manual changes you need to make to your deployment in order to be compatible with the updated files schema.

    Updated files schema

    The updated granule files schema can be found here.

    These former properties were deprecated (with notes about how to derive the same information from the updated schema, if possible):

    • filename - concatenate the bucket and key values with a directory separator (/)
    • name - use fileName property
    • etag - ETags are no longer provided as an individual file property. Instead, a separate etags object mapping S3 URIs to ETag values is provided as output from the following workflow tasks (guidance on how to integrate this output with your workflows is provided in the Upgrading your workflows section below):
      • update-granules-cmr-metadata-file-links
      • hyrax-metadata-updates
    • fileStagingDir - no longer supported
    • url_path - no longer supported
    • duplicate_found - This property is no longer supported, however sync-granule and move-granules now produce a separate granuleDuplicates object as part of their output. The granuleDuplicates object is a map of granules by granule ID which includes the files that encountered duplicates during processing. Guidance on how to integrate granuleDuplicates information into your workflow configuration is provided below.

    Exceptions

    These workflow tasks did not have their schema for granule files updated:

    • discover-granules - no updates
    • queue-granules - no updates
    • parse-pdr - no updates
    • sync-granule - input schema not updated, output schema was updated

    The reason that these task schemas were not updated is that all of these tasks start before the files have been ingested to S3, thus much of the information that is required in the updated files schema like bucket, key, or checksum is not yet known.

    Bulk granule operations

    Since the input schema for the above tasks was not updated, that means you cannot run bulk granule operations against workflows if they start with any of those tasks. Bulk granule operations work by loading the specified granules from the database and sending them as input to a specified workflow, so if the specified workflow begins with a task whose input schema does not conform to what is coming out of the database, there will be schema errors.

    Upgrading your deployment

    Upgrading your workflows

    For any workflows using the update-granules-cmr-metadata-file-links task before the hyrax-metadata-updates and/or post-to-cmr tasks, update the step definition for update-granules-cmr-metadata-file-links as follows:

        "UpdateGranulesCmrMetadataFileLinksStep": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "buckets": "{$.meta.buckets}",
    "distribution_endpoint": "{$.meta.distribution_endpoint}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.etags}",
    "destination": "{$.meta.file_etags}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    ...more configuration...

    hyrax-metadata-updates

    For any workflows using the hyrax-metadata-updates task before a post-to-cmr task, update the definition of the hyrax-metadata-updates step as follows:

        "HyraxMetadataUpdatesTask": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "bucket": "{$.meta.buckets.internal.name}",
    "stack": "{$.meta.stack}",
    "cmr": "{$.meta.cmr}",
    "launchpad": "{$.meta.launchpad}",
    "etags": "{$.meta.file_etags}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.etags}",
    "destination": "{$.meta.file_etags}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    ...more configuration...

    post-to-cmr

    For any workflows using post-to-cmr task after the update-granules-cmr-metadata-file-links or hyrax-metadata-updates tasks, update the post-to-cmr step definition as follows:

        "CmrStep": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "bucket": "{$.meta.buckets.internal.name}",
    "stack": "{$.meta.stack}",
    "cmr": "{$.meta.cmr}",
    "launchpad": "{$.meta.launchpad}",
    "etags": "{$.meta.file_etags}"
    }
    }
    },
    ...more configuration...

    Example workflow

    For an example workflow integrating all of these changes, please see our example ingest and publish workflow.

    Optional - Integrate granuleDuplicates information

    Please note that the granuleDuplicates output is purely informational and does not have any bearing on the separate configuration for how duplicates should be handled.

    You can include granuleDuplicates output from the sync-granule or move-granules tasks in your workflow messages like so:

        "SyncGranule": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    ...other config...
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.granuleDuplicates}",
    "destination": "{$.meta.sync_granule.granule_duplicates}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    }
    ...more configuration...

    The result of this configuration is that the granuleDuplicates output from sync-granule would be placed in meta.sync_granule.granule_duplicates on the workflow message and remain there throughout the rest of the workflow. The same configuration could be replicated for the move-granules task, but be sure to use a different destination in the workflow message for the granuleDuplicates output.

    Updating collection URL path templates

    Collections can specify url_path templates to dynamically generate the final location of files. As part of url_path templates, file object properties can be interpolated to generate the file path. Thus, these url_path templates need to be updated to ensure that they are compatible with the updated files schema and the properties that will actually be available on file objects.

    See the notes on the updated files schema to know which properties are available and which previously existing properties were deprecated.

    As an example, you will want to update any url_path properties in your collections to remove references to file.name and replace them with references to file.fileName like so:

    - "url_path": "{cmrMetadata.CollectionReference.ShortName}___{cmrMetadata.CollectionReference.Version}/{substring(file.name, 0, 3)}",
    + "url_path": "{cmrMetadata.CollectionReference.ShortName}___{cmrMetadata.CollectionReference.Version}/{substring(file.fileName, 0, 3)}",
    Upgrade to RDS release

    | Parameter | Type | Description | Default |
    | --- | --- | --- | --- |
    | cutoffSeconds | number | Number of seconds prior to this execution to 'cutoff' reconciliation queries. This allows in-progress/other in-flight operations time to complete and propagate to Elasticsearch/Dynamo/postgres. | 3600 |
    | dbConcurrency | number | Sets max number of parallel collections reports the script will run at a time. | 20 |
    | dbMaxPool | number | Sets the maximum number of connections the database pool has available. Modifying this may result in unexpected failures. | 20 |

    Version: v11.1.0

    Upgrade to TF version 0.13.6

    Background

    Cumulus pins its support to a specific version of Terraform (see the deployment documentation). The reason for only supporting one specific Terraform version at a time is to avoid deployment errors that can be caused by deploying to the same target with different Terraform versions.

    Cumulus is upgrading its supported version of Terraform from 0.12.12 to 0.13.6. This document contains instructions on how to perform the upgrade for your deployments.

    Prerequisites

    • Follow the Terraform guidance for what to do before upgrading, notably ensuring that you have no pending changes to your Cumulus deployments before proceeding.
      • You should do a terraform plan to see if you have any pending changes for your deployment (for both the data-persistence-tf and cumulus-tf modules), and if so, run a terraform apply before doing the upgrade to Terraform 0.13.6
    • Review the Terraform v0.13 release notes to prepare for any breaking changes that may affect your custom deployment code. Cumulus' deployment code has already been updated for compatibility with version 0.13.
    • Install Terraform version 0.13.6. We recommend using Terraform Version Manager tfenv to manage your installed versions of Terraform, but this is not required.

    Upgrade your deployment code

    Terraform 0.13 does not support some of the syntax from previous Terraform versions, so you need to upgrade your deployment code for compatibility.

    Terraform provides a 0.13upgrade command as part of version 0.13 to handle automatically upgrading your code. Make sure to check out the documentation on batch usage of 0.13upgrade, which will allow you to upgrade all of your Terraform code with one command.

    Run the 0.13upgrade command until you have no more necessary updates to your deployment code.
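    As a sketch (the exact batch invocation documented by HashiCorp may differ, and the directory layout below is an assumption), you could run the upgrade across every module directory in your deployment repository like so:

    # Run 0.13upgrade in each directory that contains .tf files; -yes skips the
    # interactive confirmation. Review the resulting diffs before committing.
    find . -name '*.tf' -exec dirname {} \; | sort -u | while read -r dir; do
      (cd "$dir" && terraform 0.13upgrade -yes)
    done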

    Upgrade your deployment

    1. Ensure that you are running Terraform 0.13.6 by running terraform --version. If you are using tfenv, you can switch versions by running tfenv use 0.13.6.

    2. For the data-persistence-tf and cumulus-tf directories, take the following steps:

      1. Run terraform init --reconfigure. The --reconfigure flag is required; otherwise, you might see an error like:

        Error: Failed to decode current backend config

        The backend configuration created by the most recent run of "terraform init"
        could not be decoded: unsupported attribute "lock_table". The configuration
        may have been initialized by an earlier version that used an incompatible
        configuration structure. Run "terraform init -reconfigure" to force
        re-initialization of the backend.
      2. Run terraform apply to perform a deployment.

        WARNING: Even if Terraform says that no resource changes are pending, running the apply using Terraform version 0.13.6 will modify your backend state from version 0.12.12 to version 0.13.6 without requiring approval. Updating the backend state is a necessary part of the version 0.13.6 upgrade, but it is not completely transparent.

    Discover Granules

    … included in a granule's file list. That is, no such filtering based on filename occurs as described above.

    When set on the task configuration, the value applies to all collections during discovery. Otherwise, this property may be set on individual collections.

    Concurrency

    A number property that determines the level of concurrency with which granule duplicate checks are performed when duplicateGranuleHandling is skip or error.

    Limiting concurrency helps to avoid throttling by the AWS Lambda API and helps to avoid encountering account Lambda concurrency limitations.

    We do not recommend increasing this value unless you are seeing Lambda.Timeout errors when discover-granules discovers a large number of granules with skip or error duplicate handling. However, as increasing the concurrency may lead to Lambda API or Lambda concurrency throttling errors, you may wish to consider converting the discover-granules task to an ECS activity, which does not face similar runtime constraints.

    The default value is 3.

    Task Outputs

    This task outputs an assembled array of Cumulus granule objects as the payload for the next task, and returns only the expected payload for the next task.

    Version: v11.1.0

    Files To Granules

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions and more can be found on the auto-generated Cumulus Tasks page.

    Summary

    This task utilizes the incoming config.inputGranules and the task input list of S3 URIs, along with the rest of the configuration objects, to take the list of incoming files and sort them into a list of granule objects.

    Please note: Files passed in without metadata defined previously for config.inputGranules will be added with the following keys:

    • size
    • bucket
    • key
    • fileName

    It is primarily intended to support compatibility with the standard output of a processing task, and convert that output into a granule object accepted as input by the majority of other Cumulus tasks.

    Task Inputs

    Input

    This task expects an incoming input that contains an array of 'staged' S3 URIs to move to their final archive location.

    For the specifics, see the Cumulus Tasks page entry for the schema.

    Configuration

    This task does expect values to be set in the workflow_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Below are expanded descriptions of selected config keys:

    inputGranules

    An array of Cumulus granule objects.

    This object will be used to define metadata values for the move granules task, and is the basis for the updated object that will be added to the output.

    Task Outputs

    This task outputs an assembled array of Cumulus granule objects as the payload for the next task, and returns only the expected payload for the next task.

    Version: v11.1.0

    LZARDS Backup

    The LZARDS backup task takes an array of granules and initiates backup requests to the LZARDS API, which will be handled asynchronously by LZARDS.

    Deployment

    The LZARDS backup task is not automatically deployed with Cumulus. To deploy the task through the Cumulus module, first you must specify a lzards_launchpad_passphrase in your terraform variables (e.g. variables.tf) like so:

    variable "lzards_launchpad_passphrase" {
    type = string
    default = ""
    }

    Then you can specify a value for your lzards_launchpad_passphrase in terraform.tfvars like so:

    lzards_launchpad_passphrase = "your-passphrase"

    Lastly, you need to make sure that the lzards_launchpad_passphrase is passed into the Cumulus module (in main.tf) like so:

    lzards_launchpad_passphrase  = var.lzards_launchpad_passphrase

    In short, deploying the LZARDS task requires configuring a passphrase variable and ensuring that your TF configuration passes that variable into the Cumulus module.

    Additional terraform configuration for the LZARDS task can be found in the cumulus module's variables.tf file, where the relevant variables are prefixed with lzards_. You can add these variables to your deployment using the same process outlined above for lzards_launchpad_passphrase.

    Task Inputs

    Input

    This task expects an array of granules as input.

    For the specifics, see the Cumulus Tasks page entry for the schema.

    Configuration

    This task does expect values to be set in the workflow_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Task Outputs

    Output

    The LZARDS task outputs a composite object containing:

    • the input granules array, and
    • a backupResults object that describes the results of LZARDS backup attempts.

    For the specifics, see the Cumulus Tasks page entry for the schema.

    Version: v11.1.0

    Move Granules

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions and more can be found on the auto-generated Cumulus Tasks page.

    Summary

    This task utilizes the incoming event.input array of Cumulus granule objects to do the following:

    • Move granules from their 'staging' location to the final location (as configured in the Sync Granules task)

    • Update the event.input object with the new file locations.

    • If the granule has an ECHO10/UMM CMR file (.cmr.xml or .cmr.json) included in the event.input:

      • Update that file's access locations

      • Add it to the appropriate access URL category for the CMR filetype as defined by granule CNM filetype.

      • Set the CMR file to 'metadata' in the output granules object and add it to the granule files if it's not already present.

        Please note: Granules without a valid CNM type set in the granule file type field in event.input will be treated as "data" in the updated CMR metadata file

    • Task then outputs an updated list of granule objects.

    Task Inputs

    Input

    This task expects an incoming input that contains a list of 'staged' S3 URIs to move to their final archive location. If CMR metadata is to be updated for a granule, it must also be included in the input.

    For the specifics, see the Cumulus Tasks page entry for the schema.

    Configuration

    This task does expect values to be set in the workflow_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Input

    This task expects event.input to provide an array of Cumulus granule objects. The files listed for each granule represent the files to be acted upon as described in summary.

    Task Outputs

    This task outputs an assembled array of Cumulus granule objects with post-move file locations as the payload for the next task, and returns only the expected payload for the next task. If a CMR file has been specified for a granule object, the CMR resources related to the granule files will be updated according to the updated granule file metadata.

    Examples

    See the SIPS workflow cookbook for an example of this task in a workflow

    Version: v11.1.0

    Parse PDR

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions and more can be found on the auto-generated Cumulus Tasks page.

    Summary

    The purpose of this task is to do the following with the incoming PDR object:

    • Stage it to an internal S3 bucket

    • Parse the PDR

    • Archive the PDR and remove the staged file if successful

    • Outputs a payload object containing metadata about the parsed PDR (e.g. total size of all files, file counts, etc.) and a granules object

    The constructed granules object is created using PDR metadata to determine values like data type and version, and collection definitions to determine a file storage location based on the extracted data type and version number.

    Granule file types are converted from the PDR spec types to CNM types according to the following translation table:

    HDF: 'data',
    HDF-EOS: 'data',
    SCIENCE: 'data',
    BROWSE: 'browse',
    METADATA: 'metadata',
    BROWSE_METADATA: 'metadata',
    QA_METADATA: 'metadata',
    PRODHIST: 'qa',
    QA: 'metadata',
    TGZ: 'data',
    LINKAGE: 'data'

    Files missing file types will have none assigned; files with invalid types will result in a PDR parse failure.

    Task Inputs

    Input

    This task expects an incoming input that contains name and path information about the PDR to be parsed. For the specifics, see the Cumulus Tasks page entry for the schema.

    Configuration

    This task does expect values to be set in the workflow_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Below are expanded descriptions of selected config keys:

    Provider

    A Cumulus provider object. Used to define connection information for retrieving the PDR.

    Bucket

    Defines the bucket where the 'pdrs' folder for parsed PDRs will be stored.

    Collection

    A Cumulus collection object. Used to define granule file groupings and granule metadata for discovered files.

    Task Outputs

    This task outputs a single payload output object containing metadata about the parsed PDR (e.g. filesCount, totalSize, etc.), a pdr object with information for later steps, and the generated array of granule objects.

    Examples

    See the SIPS workflow cookbook for an example of this task in a workflow

    Version: v11.1.0

    Queue Granules

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions, and more can be found on the auto-generated Cumulus Tasks page.

    Summary

    The purpose of this task is to schedule ingest of granules that were discovered on a remote host, whether via the DiscoverGranules task or the ParsePDR task.

    The task utilizes a defined collection in concert with a defined provider, either set on each granule or passed in via config, to queue up ingest executions for each granule or for batches of granules.

    The constructed granules object is defined by the collection passed in the configuration, and has impacts on other provided core Cumulus Tasks.

    Users of this task in a workflow are encouraged to carefully consider their configuration in context of downstream tasks and workflows.

    Task Inputs

    Each of the following sections is a high-level discussion of the intent of the various input/output/config values.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Input

    This task expects an incoming input that contains granules and information about them and their files. For the specifics, see the Cumulus Tasks page entry for the schema.

    This input is most commonly the output from a preceding DiscoverGranules or ParsePDR task.

    Cumulus Configuration

    This task does expect values to be set in the task_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Below are expanded descriptions of selected config keys:

    provider

    A Cumulus provider object for the originating provider. Will be passed along to the ingest workflow. This will be overruled by more specific provider information that may exist on a granule.

    internalBucket

    The Cumulus internal system bucket.

    granuleIngestWorkflow

    A string property that denotes the name of the ingest workflow into which granules should be queued.

    queueUrl

    A string property that denotes the URL of the queue to which scheduled execution messages are sent.

    preferredQueueBatchSize

    A number property that sets an upper bound on the size of each batch of granules queued into the payload of an ingest execution. Setting this property to a value higher than 1 allows queueing of multiple granules per ingest workflow.

    As ingest executions typically expect granules in the payload to have a common collection and common provider, this property only sets an upper bound within which batches will be created based on common collection and provider information.

    This means batches may be smaller than the preferred size if collection or provider information diverge, but never larger.

    The default value if none is specified is 1, which will queue one ingest execution per granule.

    concurrency

    A number property that determines the level of concurrency with which ingest executions are scheduled. Granules or batches of granules will be queued up into executions at this level of concurrency.

    This property is also used to limit concurrency when updating granule status to queued.

    Limiting concurrency helps to avoid throttling by the AWS Lambda API and helps to avoid encountering account Lambda concurrency limitations.

    We do not recommend increasing this value unless you are seeing Lambda.Timeout errors when queue-granules receives a large number of granules as input. However, as increasing the concurrency may lead to Lambda API or Lambda concurrency throttling errors, you may wish to consider converting the queue-granules task to an ECS activity, which does not face similar runtime constraints.

    The default value is 3.

    executionNamePrefix

    A string property that will prefix the names of scheduled executions.

    childWorkflowMeta

    An object property that will be merged into the scheduled execution input's meta field.

    Task Outputs

    This task outputs an assembled array of workflow execution ARNs for all scheduled workflow executions within the payload's running object.

    Version: v11.1.0

    Cumulus Tasks: Message Flow

    Cumulus Tasks comprise Cumulus Workflows and are either AWS Lambda tasks or AWS Elastic Container Service (ECS) activities. Cumulus Tasks permit a payload as input to the main task application code. The task payload is additionally wrapped by the Cumulus Message Adapter. The Cumulus Message Adapter supplies additional information supporting message templating and metadata management of these workflows.

    Diagram showing how incoming and outgoing Cumulus messages for workflow steps are handled by the Cumulus Message Adapter

    The steps in this flow are detailed in sections below.

    Cumulus Message Format

    A full Cumulus Message has the following keys:

    • cumulus_meta: System runtime information that should generally not be touched outside of Cumulus library code or the Cumulus Message Adapter. Stores meta information about the workflow such as the state machine name and the current workflow execution's name. This information is used to look up the current active task. The name of the current active task is used to look up the corresponding task's config in task_config.
    • meta: Runtime information captured by the workflow operators. Stores execution-agnostic variables.
    • payload: Payload is runtime information for the tasks.

    In addition to the above keys, it may contain the following keys:

    • replace: A key generated in conjunction with the Cumulus Message adapter. It contains the location on S3 for a message payload and a Target JSON path in the message to extract it to.
    • exception: A key used to track workflow exceptions, should not be modified outside of Cumulus library code.

    Here's a simple example of a Cumulus Message:

    {
      "task_config": {
        "inlinestr": "prefix{meta.foo}suffix",
        "array": "{[$.meta.foo]}",
        "object": "{$.meta}"
      },
      "cumulus_meta": {
        "message_source": "sfn",
        "state_machine": "arn:aws:states:us-east-1:1234:stateMachine:MySfn",
        "execution_name": "MyExecution__id-1234",
        "id": "id-1234"
      },
      "meta": {
        "foo": "bar"
      },
      "payload": {
        "anykey": "anyvalue"
      }
    }

    A message utilizing the Cumulus Remote message functionality must have at least the keys replace and cumulus_meta. Depending on configuration other portions of the message may be present, however the cumulus_meta, meta, and payload keys must be present once extraction is complete.

    {
      "replace": {
        "Bucket": "cumulus-bucket",
        "Key": "my-large-event.json",
        "TargetPath": "$"
      },
      "cumulus_meta": {}
    }

    Cumulus Message Preparation

    The event coming into a Cumulus Task is assumed to be a Cumulus Message and should first be handled by the functions described below before being passed to the task application code.

    Preparation Step 1: Fetch remote event

    Fetch remote event will fetch the full event from S3 if the cumulus message includes a replace key.

    Once "my-large-event.json" is fetched from S3, it's returned from the fetch remote event function. If no "replace" key is present, the event passed to the fetch remote event function is assumed to be a complete Cumulus Message and returned as-is.

    Preparation Step 2: Parse step function config from CMA configuration parameters

    This step determines which task is currently being executed. Note that this is different from which lambda or activity is being executed, because the same lambda or activity can be used for different tasks. The current task name is used to load the appropriate configuration from the Cumulus Message's 'task_config' configuration parameter.

    Preparation Step 3: Load nested event

    Using the config returned from the previous step, load nested event resolves templates for the final config and input to send to the task's application code.

    Task Application Code

    After message prep, the message passed to the task application code is of the form:

    {
      "input": {},
      "config": {}
    }

    Create Next Message functions

    Whatever comes out of the task application code is used to construct an outgoing Cumulus Message.

    Create Next Message Step 1: Assign outputs

    The config loaded from the Fetch step function config step may have a cumulus_message key. This can be used to "dispatch" fields from the task's application output to a destination in the final event output (via URL templating). Here's an example where the value of input.anykey would be dispatched as the value of payload.out in the final cumulus message:

    {
      "task_config": {
        "bar": "baz",
        "cumulus_message": {
          "input": "{$.payload.input}",
          "outputs": [
            {
              "source": "{$.input.anykey}",
              "destination": "{$.payload.out}"
            }
          ]
        }
      },
      "cumulus_meta": {
        "task": "Example",
        "message_source": "local",
        "id": "id-1234"
      },
      "meta": {
        "foo": "bar"
      },
      "payload": {
        "input": {
          "anykey": "anyvalue"
        }
      }
    }

    Create Next Message Step 2: Store remote event

    If the ReplaceConfig parameter is set, the configured key's value will be stored in S3 and the final output of the task will include a replace key that contains configuration for a future step to extract the payload on S3 back into the Cumulus Message. The replace key identifies where the large event node has been stored in S3.

    Version: v11.1.0

    Creating a Cumulus Workflow

    The Cumulus workflow module

    To facilitate adding workflows to your deployment, Cumulus provides a workflow module.

    In combination with the Cumulus message, the workflow module provides a way to easily turn a Step Function definition into a Cumulus workflow, complete with:

    Using the module also ensures that your workflows will continue to be compatible with future versions of Cumulus.

    For more on the full set of current available options for the module, please consult the module README.

    Adding a new Cumulus workflow to your deployment

    To add a new Cumulus workflow to your deployment that is using the cumulus module, add a new workflow resource to your deployment directory, either in a new .tf file, or to an existing file.

    The workflow should follow a syntax similar to:

    module "my_workflow" {
    source = "https://github.com/nasa/cumulus/releases/download/vx.x.x/terraform-aws-cumulus-workflow.zip"

    prefix = "my-prefix"
    name = "MyWorkflowName"
    system_bucket = "my-internal-bucket"

    workflow_config = module.cumulus.workflow_config

    tags = { Deployment = var.prefix }

    state_machine_definition = <<JSON
    {}
    JSON
    }

In the above example, you would replace the empty state_machine_definition with a definition written in the Amazon States Language, referencing tasks you've developed as well as Cumulus core tasks that are made available as part of the cumulus terraform module.
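
For illustration only, a minimal state_machine_definition (replacing the empty {} heredoc above) might define a single task state; the Resource value below is a placeholder for the ARN of one of your deployed task Lambdas:

{
  "Comment": "A minimal single-step workflow",
  "StartAt": "HelloWorld",
  "States": {
    "HelloWorld": {
      "Type": "Task",
      "Resource": "<your task Lambda ARN>",
      "End": true
    }
  }
}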

    Please note: Cumulus follows the convention of tagging resources with the prefix variable { Deployment = var.prefix } that you pass to the cumulus module. For resources defined outside of Core, it's recommended that you adopt this convention as it makes resources and/or deployment recovery scenarios much easier to manage.

    Examples

    For a functional example of a basic workflow, please take a look at the hello_world_workflow.

    For more complete/advanced examples, please read the following cookbook entries/topics:

    Version: v11.1.0

    Developing Workflow Tasks

    Workflow tasks can be either AWS Lambda Functions or ECS Activities.

    Lambda functions

    The full set of available core Lambda functions can be found in the deployed cumulus module zipfile at /tasks, as well as reference documentation here. These Lambdas can be referenced in workflows via the outputs from that module (see the cumulus-template-deploy repo for an example).

The tasks' source code is located in the Cumulus repository at cumulus/tasks.

    You can also develop your own Lambda function. See the Lambda Functions page to learn more.

    ECS Activities

    ECS activities are supported via the cumulus_ecs_module available from the Cumulus release page.

    Please read the module README for configuration details.

    For assistance in creating a task definition within the module read the AWS Task Definition Docs.

    For a step-by-step example of using the cumulus_ecs_module, please see the related cookbook entry.

    Cumulus Docker Image

ECS activities require a Docker image. Cumulus provides a Docker image (source) for Node.js 12.x+ Lambdas on DockerHub: cumuluss/cumulus-ecs-task.

    Alternate Docker Images

    Custom docker images/runtimes are supported as are private registries. For details on configuring a private registry/image see the AWS documentation on Private Registry Authentication for Tasks.

Version: v11.1.0

Dockerizing Data Processing

2) validate the output (in this case just check for existence)
3) use 'ncatted' to update the resulting file to be CF-compliant
4) write out metadata generated for this file

    Process Testing

It is important to have tests for data processing; however, in many cases data files can be large, so it is not practical to store the test data in the repository. Instead, test data is currently stored on AWS S3 and can be retrieved using the AWS CLI.

    aws s3 sync s3://cumulus-ghrc-logs/sample-data/collection-name data

    Where collection-name is the name of the data collection, such as 'avaps', or 'cpl'. For example, an abridged version of the data for CPL includes:

├── cpl
│   ├── input
│   │   ├── HS3_CPL_ATB_12203a_20120906.hdf5
│   │   ├── HS3_CPL_OP_12203a_20120906.hdf5
│   └── output
│       ├── HS3_CPL_ATB_12203a_20120906.nc
│       ├── HS3_CPL_ATB_12203a_20120906.nc.meta.xml
│       ├── HS3_CPL_OP_12203a_20120906.nc
│       ├── HS3_CPL_OP_12203a_20120906.nc.meta.xml

    Contained in the input directory are all possible sets of data files, while the output directory is the expected result of processing. In this case the hdf5 files are converted to NetCDF files and XML metadata files are generated.

    The docker image for a process can be used on the retrieved test data. First create a test-output directory in the newly created data directory.

    mkdir data/test-output

    Then run the docker image using docker-compose.

    docker-compose run test

This will process the data in the data/input directory and put the output into data/test-output. Repositories also include Python-based tests which will validate this newly created output against the contents of data/output. Use Python's Nose tool to run the included tests.

    nosetests

If the data/test-output directory validates against the contents of data/output, the tests will pass; otherwise an error will be reported.

    Version: v11.1.0

    Workflows

Workflows are composed of one or more AWS Lambda Functions and ECS Activities to discover, ingest, process, manage, and archive data.

Provider data ingest and GIBS have a set of common needs in getting data from a source system into the cloud, where it can be distributed to end users. These common needs are:

    • Data Discovery - Crawling, polling, or detecting changes from a variety of sources.
    • Data Transformation - Taking data files in their original format and extracting and transforming them into another desired format such as visible browse images.
    • Archival - Storage of the files in a location that's accessible to end users.

    The high level view of the architecture and many of the individual steps are the same but the details of ingesting each type of collection differs. Different collection types and different providers have different needs. The individual boxes of a workflow are not only different. The branching, error handling, and multiplicity of the arrows connecting the boxes are also different. Some need visible images rendered from component data files from multiple collections. Some need to contact the CMR with updated metadata. Some will have different retry strategies to handle availability issues with source data systems.

AWS and other cloud vendors provide ideal building blocks for parts of these problems, but a higher-level solution is needed to compose those AWS components into a full-featured system. The Ingest Workflow Architecture is designed to meet the needs of Earth Science data ingest and transformation.

    Goals

    Flexibility and Composability

The steps to ingest and process data are different for each collection within a provider. Ingest should be as flexible as possible in the rearranging of steps and configuration.

    We want to use lego-like individual steps that can be composed by an operator.

    Individual steps should ...

    • Be as ignorant as possible of the overall flow. They should not be aware of previous steps.
    • Be runnable on their own.
    • Define their input and output in simple data structures.
    • Be domain agnostic.
• Not make assumptions about the specifics of, for example, what goes into a granule.

    Scalable

The ingest architecture needs to be scalable, both to handle ingesting hundreds of millions of granules and to interpret dozens of different workflows.

    Data Provenance

    • We should have traceability for how data was produced and where it comes from.
    • Use immutable representations of data. Data once received is not overwritten. Data can be removed for cleanup.
    • All software is versioned. We can trace transformation of data by tracking the immutable source data and the versioned software applied to it.

    Operator Visibility and Control

    • Operators should be able to see and understand everything that is happening in the system.
    • It should be obvious why things are happening and straightforward to diagnose problems.
• We generally assume that the operators know best in terms of the limits on a provider's infrastructure, how often things need to be done, and details of a collection. The architecture should defer to their decisions and knowledge while providing safety nets to prevent problems.

    A Reconfigurable Workflow Architecture

    The Ingest Workflow Architecture is defined by two entity types, Workflows and Tasks. A Workflow is a set of composed Tasks to complete an objective such as ingesting a granule. Tasks are the individual steps of a Workflow that perform one job. The workflow is responsible for executing the right task based on the current state and response from the last task executed. Tasks are completely decoupled in that they don't call each other or even need to know about the presence of other tasks.

    Workflows and tasks are configured as Terraform resources, which are triggered via configured rules within Cumulus.

    Diagram showing the Step Function execution path through workflow tasks for a collection ingest

    See the Example GIBS Ingest Architecture showing how workflows and tasks are used to define the GIBS Ingest Architecture.

    Workflows

    A workflow is a provider-configured set of steps that describe the process to ingest data. Workflows are defined using AWS Step Functions.

    Benefits of AWS Step Functions

AWS Step Functions are described in detail in the AWS documentation, but they provide several benefits which are applicable to this architecture.

    • Prebuilt solution
    • Operations Visibility
      • Visual diagram
      • Every execution is recorded with both inputs and output for every step.
    • Composability
      • Allow composing AWS Lambdas and code running in other steps. Code can be run in EC2 to interface with it or even on premise if desired.
      • Step functions allow specifying when steps run in parallel or choices between steps based on data from the previous step.
    • Flexibility
  • Step Functions are designed to make it easy to build new applications and to reconfigure them. We're exposing that flexibility directly to the provider.
    • Reliability and Error Handling
      • Step functions allow configuration of retries and adding handling of error conditions.
    • Described via data
      • This makes it easy to save the step function in configuration management solutions.
      • We can build simple interfaces on top of the flexibility provided.

    Workflow Scheduler

    The scheduler is responsible for initiating a step function and passing in the relevant data for a collection. This is currently configured as an interval for each collection. The scheduler service creates the initial event by combining the collection configuration with the AWS execution context defined via the cumulus terraform module.

    Tasks

    A workflow is composed of tasks. Each task is responsible for performing a discrete step of the ingest process. These can be activities like:

    • Crawling a provider website for new data.
    • Uploading data from a provider to S3.
    • Executing a process to transform data.

    AWS Step Functions permit tasks to be code running anywhere, even on premise. We expect most tasks will be written as Lambda functions in order to take advantage of the easy deployment, scalability, and cost benefits provided by AWS Lambda.

    • Leverages Existing Work
      • The design leverages the existing work of Amazon by defining workflows using the AWS Step Function State Language. This is the language that was created for describing the state machines used in AWS Step Functions.
    • Open for Extension
  • Both meta and task_config, which are used for configuration at the collection and task levels, do not dictate the fields and structure of the configuration. Additional task-specific JSON schemas can be used for extending the validation of individual steps.
    • Data-centric Configuration
      • The use of a single JSON configuration file allows this to be added to a workflow. We build additional support on top of the configuration file for simpler domain specific configuration or interactive GUIs.

    For more details on Task Messages and Configuration, visit Cumulus configuration and message protocol documentation.

    Ingest Deploy

    To view deployment documentation, please see the Cumulus deployment documentation.

Tradeoffs and Benefits

    This section documents various tradeoffs and benefits of the Ingest Workflow Architecture.

    Tradeoffs

    Workflow execution is handled completely by AWS

This means we can't add our own code into the orchestration of the workflow. We can't add new features not supported by Step Functions. We can't do things like enforce that the responses from tasks always conform to a schema or extract the configuration for a task ahead of its execution.

If we implemented our own orchestration we'd be able to add all of these. In exchange for this tradeoff, we save significant amounts of development effort and gain all the features of Step Functions. One workaround is to provide a library of common task capabilities. These would optionally be available to tasks that are implemented with Node.js and are able to include the library.

    Workflow Configuration is specified in AWS Step Function States Language

The current design combines the states language defined by AWS with Ingest-specific configuration. This means our representation is tightly coupled to their standard. If they make backwards-incompatible changes in the future, we will have to deal with existing projects written against the current standard.

We avoid having to develop our own standard and the code to process it. The design can support new features in AWS Step Functions without requiring changes to the Ingest library code. It is unlikely they will make a backwards-incompatible change at this point. One mitigation, if that were to happen, is writing data transformations to the new format.

    Collection Configuration Flexibility vs Complexity

The Collections Configuration File is very flexible but requires more knowledge of AWS Step Functions to configure. A person modifying this file directly would need to be comfortable editing a JSON file and configuring AWS Step Functions state transitions that address AWS resources.

The configuration file itself is not necessarily meant to be edited by a human directly. Since we are developing a reconfigurable, composable architecture that is specified entirely in data, additional tools can be developed on top of it. The existing recipes.json files can be mapped to this format. Operational tools like a GUI can be built to provide a usable interface for customizing workflows, but it will take time to develop these tools.

    Benefits

    This section describes benefits of the Ingest Workflow Architecture.

    Simplicity

    The concepts of Workflows and Tasks are simple ones that should make sense to providers. Additionally, the implementation will only consist of a few components because the design leverages existing services and capabilities of AWS. The Ingest implementation will only consist of some reusable task code to make task implementation easier, Ingest deployment, and the Workflow Scheduler.

    Composability

The design aims to satisfy the need for ingest to integrate different workflows for providers. It's flexible in terms of the ability to arrange tasks to meet the needs of a collection. Providers have developed and incorporated open source tools over the years. All of these are easily integrable into the workflows as tasks.

    There is low coupling between task steps. Failures of one component don't bring the whole system down. Individual tasks can be deployed separately.

    Scalability

AWS Step Functions scale up as needed and aren't limited by a set number of servers. They also easily allow you to leverage the inherent scalability of serverless functions.

    Monitoring and Auditing

    • Every execution is captured.
    • Every task run has captured input and outputs.
• CloudWatch Metrics can be used for monitoring many of the events within Step Functions. It can also generate alarms for the whole process.
    • Visual report of the entire configuration.
      • Errors and success states are highlighted visually in the flow.

    Data Provenance

    • Monitoring and auditing ensures we know the data that was given to a task.
    • Workflows are versioned and the state machines stored in AWS Step Functions are immutable. Once created they cannot change.
    • Versioning of data in S3 or using immutable records in S3 will mean we always know what data was created as the result of a step or fed into a step.

    Appendix

    Example GIBS Ingest Architecture

    This shows the GIBS Ingest Architecture as an example of the use of the Ingest Workflow Architecture.

    • The GIBS Ingest Architecture consists of two workflows per collection type. There is one for discovery and one for ingest. The final stage of discovery triggers multiple ingest workflows for each MRF granule that needs to be generated.
    • It demonstrates both lambdas as tasks and a container used for MRF generation.

    GIBS Ingest Workflows

    Diagram showing the AWS Step Function execution path for a GIBS ingest workflow

    GIBS Ingest Granules Workflow

This shows a visualization of an execution of the ingest granules workflow in Step Functions. The steps highlighted in green are the ones that executed and completed successfully.

    Diagram showing the AWS Step Function execution path for a GIBS ingest granules workflow

    Version: v11.1.0

    Workflow Inputs & Outputs

    General Structure

    Cumulus uses a common format for all inputs and outputs to workflows. The same format is used for input and output from workflow steps. The common format consists of a JSON object which holds all necessary information about the task execution and AWS environment. Tasks return objects identical in format to their input with the exception of a task-specific payload field. Tasks may also augment their execution metadata.
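
As a rough sketch (based on the examples later on this page), the common message format is a JSON object with top-level keys along the lines of:

{
  "cumulus_meta": {},
  "meta": {},
  "payload": {},
  "exception": {}
}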

    Cumulus Message Adapter

    The Cumulus Message Adapter and Cumulus Message Adapter libraries help task developers integrate their tasks into a Cumulus workflow. These libraries adapt input and outputs from tasks into the Cumulus Message format. The Scheduler service creates the initial event message by combining the collection configuration, external resource configuration, workflow configuration, and deployment environment settings. The subsequent workflow messages between tasks must conform to the message schema. By using the Cumulus Message Adapter, individual task Lambda functions only receive the input and output specifically configured for the task, and not non-task-related message fields.

    The Cumulus Message Adapter libraries are called by the tasks with a callback function containing the business logic of the task as a parameter. They first adapt the incoming message to a format more easily consumable by Cumulus tasks, then invoke the task, and then adapt the task response back to the Cumulus message protocol to be sent to the next task.

    A task's Lambda function can be configured to include a Cumulus Message Adapter library which constructs input/output messages and resolves task configurations. The CMA can then be included in one of several ways:

    Lambda Layer

In order to make use of this configuration, a Lambda layer must be uploaded to your account. Due to platform restrictions, Core cannot currently support sharable public layers; however, you can deploy the appropriate version from the release page in two ways:

    Once you've deployed the layer, integrate the CMA layer with your Lambdas:

    • If using the cumulus module, set the cumulus_message_adapter_lambda_layer_version_arn in your .tfvars file to integrate the CMA layer with all core Cumulus lambdas.
    • If including your own Lambda or ECS task Terraform modules, specify the CMA layer ARN in the Terraform resource definitions. Also, make sure to set the CUMULUS_MESSAGE_ADAPTER_DIR environment variable for the task to /opt for the CMA integration to work properly.

    In the future if you wish to update/change the CMA version you will need to update the deployed CMA, and update the layer configuration for the impacted Lambdas as needed.

    Please Note: Updating/removing a layer does not change a deployed Lambda, so to update the CMA you should deploy a new version of the CMA layer, update the associated Lambda configuration to reference the new CMA version, and re-deploy your Lambdas.

    Manual Addition

You can include the CMA package in the Lambda code in the cumulus-message-adapter sub-directory in your lambda .zip, for any Lambda runtime that includes a Python runtime. Python 2 is included in Lambda runtimes that use Amazon Linux; however, Amazon Linux 2 will not support this directly.

    Please note: It is expected that upcoming Cumulus releases will update the CMA layer to include a python runtime.

    If you are manually adding the message adapter to your source and utilizing the CMA, you should set the Lambda's CUMULUS_MESSAGE_ADAPTER_DIR environment variable to target the installation path for the CMA.

    CMA Input/Output

Input to the task application code is a JSON object with the following keys:

    • input: By default, the incoming payload is the payload output from the previous task, or it can be a portion of the payload as configured for the task in the corresponding .tf workflow definition file.
    • config: Task-specific configuration object with URL templates resolved.

Output from the task application code is returned and placed in the payload key by default, but the task's configuration can also be used to return just a portion of the task output.

    CMA configuration

    As of Cumulus > 1.15 and CMA > v1.1.1, configuration of the CMA is expected to be driven by AWS Step Function Parameters.

    Using the CMA package with the Lambda by any of the above mentioned methods (Lambda Layers, manual) requires configuration for its various features via a specific Step Function Parameters configuration format (see sample workflows in the examples cumulus-tf source for more examples):

{
  "cma": {
    "event.$": "$",
    "ReplaceConfig": "{some config}",
    "task_config": "{some config}"
  }
}

    The "event.$": "$" parameter is required as it passes the entire incoming message to the CMA client library for parsing, and the CMA itself to convert the incoming message into a Cumulus message for use in the function.

    The following are the CMA's current configuration settings:

    ReplaceConfig (Cumulus Remote Message)

Because of the potential size of a Cumulus message, mainly the payload field, a task can be set via configuration to store a portion of its output on S3, with a Remote Message key that defines how to retrieve it and an empty JSON object {} left in its place. If the portion of the message targeted exceeds the configured MaxSize (defaults to 0 bytes), it will be written to S3.

    The CMA remote message functionality can be configured using parameters in several ways:

    Partial Message

Setting the Path/TargetPath in the ReplaceConfig parameter (and optionally a non-default MaxSize)

{
  "DiscoverGranules": {
    "Parameters": {
      "cma": {
        "event.$": "$",
        "ReplaceConfig": {
          "MaxSize": 1,
          "Path": "$.payload",
          "TargetPath": "$.payload"
        }
      }
    }
  }
}

will result in any payload output larger than the MaxSize (in bytes) being written to S3. The CMA will then mark that the key has been replaced via a replace key on the event. When the CMA picks up the replace key in future steps, it will attempt to retrieve the output from S3 and write it back to payload.

    Note that you can optionally use a different TargetPath than Path, however as the target is a JSON path there must be a key to target for replacement in the output of that step. Also note that the JSON path specified must target one node, otherwise the CMA will error, as it does not support multiple replacement targets.

    If TargetPath is omitted, it will default to the value for Path.

    Full Message

    Setting the following parameters for a lambda:

DiscoverGranules:
  Parameters:
    cma:
      event.$: '$'
      ReplaceConfig:
        FullMessage: true

    will result in the CMA assuming the entire inbound message should be stored to S3 if it exceeds the default max size.

    This is effectively the same as doing:

{
  "DiscoverGranules": {
    "Parameters": {
      "cma": {
        "event.$": "$",
        "ReplaceConfig": {
          "MaxSize": 0,
          "Path": "$",
          "TargetPath": "$"
        }
      }
    }
  }
}

    Cumulus Message example

{
  "task_config": {
    "inlinestr": "prefix{meta.foo}suffix",
    "array": "{[$.meta.foo]}",
    "object": "{$.meta}"
  },
  "cumulus_meta": {
    "message_source": "sfn",
    "state_machine": "arn:aws:states:us-east-1:1234:stateMachine:MySfn",
    "execution_name": "MyExecution__id-1234",
    "id": "id-1234"
  },
  "meta": {
    "foo": "bar"
  },
  "payload": {
    "anykey": "anyvalue"
  }
}

    Cumulus Remote Message example

    The message may contain a reference to an S3 Bucket, Key and TargetPath as follows:

{
  "replace": {
    "Bucket": "cumulus-bucket",
    "Key": "my-large-event.json",
    "TargetPath": "$"
  },
  "cumulus_meta": {}
}

    task_config

This configuration key contains the input/output configuration values for the definition of inputs/outputs via URL paths. Important: These values are all relative to the JSON object configured for event.$.

    This configuration's behavior is outlined in the CMA step description below.

    The configuration should follow the format:

{
  "FunctionName": {
    "Parameters": {
      "cma": {
        "event.$": "$",
        "other_cma_configuration": "<config object>",
        "task_config": "<task config>"
      }
    }
  }
}

    Example:

{
  "StepFunction": {
    "Parameters": {
      "cma": {
        "event.$": "$",
        "task_config": {
          "sfnEnd": true,
          "stack": "{$.meta.stack}",
          "bucket": "{$.meta.buckets.internal.name}",
          "stateMachine": "{$.cumulus_meta.state_machine}",
          "executionName": "{$.cumulus_meta.execution_name}",
          "cumulus_message": {
            "input": "{$}"
          }
        }
      }
    }
  }
}

    Cumulus Message Adapter Steps

    1. Reformat AWS Step Function message into Cumulus Message

    Due to the way AWS handles Parameterized messages, when Parameters are used the CMA takes an inbound message:

{
  "resource": "arn:aws:lambda:us-east-1:<lambda arn values>",
  "input": {
    "Other Parameter": {},
    "cma": {
      "ConfigKey": {
        "config values": "some config values"
      },
      "event": {
        "cumulus_meta": {},
        "payload": {},
        "meta": {},
        "exception": {}
      }
    }
  }
}

    and takes the following actions:

    • Takes the object at input.cma.event and makes it the full input
    • Merges all of the keys except event under input.cma into the parent input object

This results in the incoming message (presumably a Cumulus message), with any cma configuration parameters merged in, being passed to the CMA. All other parameterized values defined outside of the cma key are ignored.
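
Concretely, applying these two actions to the inbound message above means the CMA effectively receives the following (the Cumulus message with ConfigKey merged in):

{
  "ConfigKey": {
    "config values": "some config values"
  },
  "cumulus_meta": {},
  "payload": {},
  "meta": {},
  "exception": {}
}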

    2. Resolve Remote Messages

If the incoming Cumulus message has a replace key value, the CMA will attempt to pull the payload from S3.

For example, if the incoming message contains the following:

      "meta": {
    "foo": {}
    },
    "replace": {
    "TargetPath": "$.meta.foo",
    "Bucket": "some_bucket",
    "Key": "events/some-event-id"
    }

    The CMA will attempt to pull the file stored at Bucket/Key and replace the value at TargetPath, then remove the replace object entirely and continue.
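
Assuming the object stored at events/some-event-id were {"anykey": "anyvalue"} (an illustrative value), the message fragment after resolution would be:

"meta": {
  "foo": {
    "anykey": "anyvalue"
  }
}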

    3. Resolve URL templates in the task configuration

In the workflow configuration (defined under the task_config key), each task has its own configuration, and it can use a URL template as a value to achieve simplicity or to reference values only available at execution time. The Cumulus Message Adapter resolves the URL templates (relative to the event configuration key) and then passes the message to the next task. For example, given a task which has the following configuration:

{
  "Parameters": {
    "cma": {
      "event.$": "$",
      "task_config": {
        "provider": "{$.meta.provider}",
        "inlinestr": "prefix{meta.foo}suffix",
        "array": "{[$.meta.foo]}",
        "object": "{$.meta}"
      }
    }
  }
}

and an incoming message that contains:

{
  "meta": {
    "foo": "bar",
    "provider": {
      "id": "FOO_DAAC",
      "anykey": "anyvalue"
    }
  }
}

    The corresponding Cumulus Message would contain:

    "meta": {
    "foo": "bar",
    "provider": {
    "id": "FOO_DAAC",
    "anykey": "anyvalue"
    }
    },
    "task_config": {
    "provider": "{$.meta.provider}",
    "inlinestr": "prefix{meta.foo}suffix",
    "array": "{[$.meta.foo]}",
    "object": "{$.meta}"
    }

    The message sent to the task would be:

    "config" : {
    "provider": {
    "id": "FOO_DAAC",
    "anykey": "anyvalue"
    },
    "inlinestr": "prefixbarsuffix",
    "array": ["bar"],
    "object": {
    "foo": "bar",
    "provider": {
    "id": "FOO_DAAC",
    "anykey": "anyvalue"
    }
    },
    },
    "input": "{...}"

    URL template variables replace dotted paths inside curly brackets with their corresponding value. If the Cumulus Message Adapter cannot resolve a value, it will ignore the template, leaving it verbatim in the string. While seemingly complex, this allows significant decoupling of Tasks from one another and the data that drives them. Tasks are able to easily receive runtime configuration produced by previously run tasks and domain data.
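
For example, given a task_config entry that references a path not present in the event (the key name and path below are illustrative):

"task_config": {
  "maybe": "{$.meta.optional_field}"
}

the resolved config passed to the task would still contain the template string verbatim:

"config": {
  "maybe": "{$.meta.optional_field}"
}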

    4. Resolve task input

By default, the incoming payload is the payload from the previous task. The task can also be configured to use a portion of the payload as its input message. For example, given a task that specifies cma.task_config.cumulus_message.input:

ExampleTask:
  Parameters:
    cma:
      event.$: '$'
      task_config:
        cumulus_message:
          input: '{$.payload.foo}'

    The task configuration in the message would be:

{
  "task_config": {
    "cumulus_message": {
      "input": "{$.payload.foo}"
    }
  },
  "payload": {
    "foo": {
      "anykey": "anyvalue"
    }
  }
}

The Cumulus Message Adapter will resolve the task input; instead of sending the whole payload as task input, the task input would be:

{
  "input": {
    "anykey": "anyvalue"
  },
  "config": {...}
}

    5. Resolve task output

By default, the task's return value is the next payload. However, the workflow task configuration can specify a portion of the return value as the next payload, and can also use it to augment values in other fields. Based on the task configuration under cma.task_config.cumulus_message.outputs, the Message Adapter uses a task's return value to output a message as configured by the task-specific config defined under cma.task_config. The Message Adapter dispatches a "source" to a "destination" as defined by URL templates stored in the task-specific cumulus_message.outputs. The value at the "source" URL in the task's return value is used to create or replace the value at the "destination" URL in the outgoing Cumulus message. For example, given a task that specifies cumulus_message.outputs in its workflow configuration as follows:

{
  "ExampleTask": {
    "Parameters": {
      "cma": {
        "event.$": "$",
        "task_config": {
          "cumulus_message": {
            "outputs": [
              {
                "source": "{$}",
                "destination": "{$.payload}"
              },
              {
                "source": "{$.output.anykey}",
                "destination": "{$.meta.baz}"
              }
            ]
          }
        }
      }
    }
  }
}

    The corresponding Cumulus Message would be:

{
  "task_config": {
    "cumulus_message": {
      "outputs": [
        {
          "source": "{$}",
          "destination": "{$.payload}"
        },
        {
          "source": "{$.output.anykey}",
          "destination": "{$.meta.baz}"
        }
      ]
    }
  },
  "meta": {
    "foo": "bar"
  },
  "payload": {
    "anykey": "anyvalue"
  }
}

    Given the response from the task is:

{
  "output": {
    "anykey": "boo"
  }
}

    The Cumulus Message Adapter would output the following Cumulus Message:

{
  "task_config": {
    "cumulus_message": {
      "outputs": [
        {
          "source": "{$}",
          "destination": "{$.payload}"
        },
        {
          "source": "{$.output.anykey}",
          "destination": "{$.meta.baz}"
        }
      ]
    }
  },
  "meta": {
    "foo": "bar",
    "baz": "boo"
  },
  "payload": {
    "output": {
      "anykey": "boo"
    }
  }
}

    6. Apply Remote Message Configuration

    If the ReplaceConfig configuration parameter is defined, the CMA will evaluate the configuration options provided, and if required write a portion of the Cumulus Message to S3, and add a replace key to the message for future steps to utilize.

Please Note: the non-user-modifiable field cumulus_meta will always be retained, regardless of the configuration.

For example, if the output message (post output configuration) from a Cumulus task looks like:

{
  "cumulus_meta": {
    "some_key": "some_value"
  },
  "ReplaceConfig": {
    "FullMessage": true
  },
  "task_config": {
    "cumulus_message": {
      "outputs": [
        {
          "source": "{$}",
          "destination": "{$.payload}"
        },
        {
          "source": "{$.output.anykey}",
          "destination": "{$.meta.baz}"
        }
      ]
    }
  },
  "meta": {
    "foo": "bar",
    "baz": "boo"
  },
  "payload": {
    "output": {
      "anykey": "boo"
    }
  }
}

    the resultant output would look like:

{
  "cumulus_meta": {
    "some_key": "some_value"
  },
  "replace": {
    "TargetPath": "$",
    "Bucket": "some-internal-bucket",
    "Key": "events/some-event-id"
  }
}

    Additional features

    Validate task input, output and configuration messages against the schemas provided

    The Cumulus Message Adapter has the capability to validate task input, output and configuration messages against their schemas. The default location of the schemas is the schemas folder in the top level of the task and the default filenames are input.json, output.json, and config.json. The task can also configure a different schema location. If no schema can be found, the Cumulus Message Adapter will not validate the messages.
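
For instance, a minimal schemas/input.json could be an ordinary JSON Schema document along these lines (a sketch only; the property name is hypothetical):

{
  "title": "ExampleTask Input",
  "type": "object",
  "properties": {
    "granules": {
      "type": "array"
    }
  },
  "required": ["granules"]
}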

    Version: v11.1.0

    Develop Lambda Functions

    Develop a new Cumulus Lambda

AWS provides a great getting started guide for building Lambdas in the developer guide.

    Cumulus currently supports the following environments for Cumulus Message Adapter enabled functions:

Additionally, you may choose to include any of the other languages AWS supports as a resource, with reduced feature support.

    Deploy a Lambda

    Node.js Lambda

For a new Node.js Lambda, create a new function and add an aws_lambda_function resource to your Cumulus deployment (for examples, see example/lambdas.tf and ingest/lambda-functions.tf in the source), either in a new .tf file or added to an existing .tf file:

    resource "aws_lambda_function" "myfunction" {
    function_name = "${var.prefix}-function"
    filename = "/path/to/zip/lambda.zip"
    source_code_hash = filebase64sha256("/path/to/zip/lambda.zip")
    handler = "index.handler"
    role = module.cumulus.lambda_processing_role_arn
    runtime = "nodejs10.x"

    vpc_config {
    subnet_ids = var.subnet_ids
    security_group_ids = var.security_group_ids
    }
    }

    Please note: This example contains the minimum set of required configuration.

Make sure to include a vpc_config that matches the information you've provided to the cumulus module if you intend to integrate the Lambda with a Cumulus deployment.

    Java Lambda

    Java Lambdas are created in much the same way as the Node.js example above.

    The source points to a folder with the compiled .class files and dependency libraries in the Lambda Java zip folder structure (details here), not an uber-jar.

    The deploy folder referenced here would contain a folder 'test_task/task/' which contains Task.class and TaskLogic.class as well as a lib folder containing dependency jars.

    Python Lambda

    Python Lambdas are created the same way as the Node.js example above.

    Cumulus Message Adapter

For Lambdas that will utilize the Cumulus Message Adapter (CMA), you should define a layers key on your Lambda resource with the CMA layer you wish to include. See the input_output docs for more on how to create/use the CMA.

    Other Lambda Options

    Cumulus supports all of the options available to you via the aws_lambda_function Terraform resource. For more information on what's available, check out the Terraform resource docs.

    Cloudwatch log groups

If you want to enable Cloudwatch logging for your Lambda resource, you'll need to add an aws_cloudwatch_log_group resource to your Lambda definition:

    resource "aws_cloudwatch_log_group" "myfunction_log_group" {
    name = "/aws/lambda/${aws_lambda_function.myfunction.function_name}"
    retention_in_days = 30
    tags = { Deployment = var.prefix }
    }
    Version: v11.1.0

    Workflow Protocol

    Configuration and Message Use Diagram

    A diagram showing at which point in a workflow the Cumulus message is checked for conformity with the message schema and where the configuration is checked for conformity with the configuration schema

    • Configuration - The Cumulus workflow configuration defines everything needed to describe an instance of Cumulus.
    • Scheduler - This starts ingest of a collection on configured intervals.
    • Input to Step Functions - The Scheduler uses the Configuration as source data to construct the input to the Workflow.
    • AWS Step Functions - Run the workflows as kicked off by the scheduler or other processes.
    • Input to Task - The input for each task is a JSON document that conforms to the message schema.
    • Output from Task - The output of each task must conform to the message schemas as well and is used as the input for the subsequent task.
Version: v11.1.0

Workflow Configuration How To's

To take a subset of any given metadata, use the option substring.

    "url_path": "{cmrMetadata.Granule.Collection.ShortName}/{substring(file.fileName, 0, 3)}"

    This example will populate to "MOD09GQ/MOD"

    In addition to substring, several datetime-specific functions are available, which can parse a datetime string in the metadata and extract a certain part of it:

    "url_path": "{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}"

    or

     "url_path": "{dateFormat(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime, YYYY-MM-DD[T]HH[:]mm[:]ss)}"

    The following functions are implemented:

    • extractYear - returns the year, formatted as YYYY
    • extractMonth - returns the month, formatted as MM
    • extractDate - returns the day of the month, formatted as DD
    • extractHour - returns the hour in 24-hour format, with no leading zero
    • dateFormat - takes a second argument describing how to format the date, and passes the metadata date string and the format argument to moment().format()

Note: the move-granules step needs to be in the workflow for this template to be populated and the file moved. The cmrMetadata, or CMR granule XML, needs to have been generated and stored on S3. From there, any field can be retrieved and used for a url_path.

    Adding Metadata dates and times to the URL Path

    There are a number of options to pull dates from the CMR file metadata. With this metadata:

<Granule>
  <Temporal>
    <RangeDateTime>
      <BeginningDateTime>2003-02-19T00:00:00Z</BeginningDateTime>
      <EndingDateTime>2003-02-19T23:59:59Z</EndingDateTime>
    </RangeDateTime>
  </Temporal>
</Granule>

    The following examples of url_path could be used.

    {extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the year from the full date: 2003.

    {extractMonth(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the month: 2.

    {extractDate(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the day: 19.

    {extractHour(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the hour: 0.

Different values can be combined to create the url_path. For example:

{
  "bucket": "sample-protected-bucket",
  "name": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
  "url_path": "{cmrMetadata.Granule.Collection.ShortName}/{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}/{extractDate(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}"
}

    The final file location for the above would be s3://sample-protected-bucket/MOD09GQ/2003/19/MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.

    Version: v11.1.0

    Workflow Triggers

For a workflow to run, it needs to be associated with a rule (see rule configuration). The rule configuration determines how and when a workflow execution is triggered. Rules can be triggered one time, on a schedule, by new data written to a kinesis stream, or by a message published to an SNS topic.
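
For reference, a scheduled rule is defined with a JSON object roughly like the following (a sketch only; the field values are placeholders, and the authoritative field list is in the rule configuration documentation):

{
  "name": "my_scheduled_rule",
  "workflow": "MyWorkflowName",
  "collection": {
    "name": "MOD09GQ",
    "version": "006"
  },
  "provider": "my-provider-id",
  "rule": {
    "type": "scheduled",
    "value": "rate(1 hour)"
  },
  "state": "ENABLED"
}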

    There are three lambda functions in the API package responsible for scheduling and starting workflows: SF scheduler, message consumer, and SF starter. Each Cumulus instance comes with a Start SF SQS queue.

The SF scheduler lambda puts a message onto the Start SF queue. This message is picked up by the Start SF lambda, and an execution is started with the body of the message as the input.

When a one-time rule is created, the schedule SF lambda is triggered. Rules that are not one-time are associated with a CloudWatch event, which manages triggering the lambdas that in turn trigger the workflows.

For a scheduled rule, the CloudWatch event is triggered on the given schedule and calls the schedule SF lambda directly.

    For a kinesis rule, when data is added to the kinesis stream, the Cloudwatch event is triggered, which calls the message consumer lambda. The message consumer lambda parses the kinesis message and finds all of the rules associated with that message. For each rule (which corresponds to one workflow), the schedule SF lambda is triggered to queue a message to start the workflow.

    For an sns rule, when a message is published to the SNS topic, the message consumer receives the SNS message (JSON expected), parses it into an object, starts a new execution of the workflow associated with the rule and passes the object in the payload field of the Cumulus message.
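
For example (the notification body here is hypothetical), if the message published to the SNS topic were {"granuleId": "abc123"}, the workflow execution would start with a Cumulus message along the lines of:

{
  "cumulus_meta": {},
  "meta": {},
  "payload": {
    "granuleId": "abc123"
  }
}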

    Diagram showing how workflows are scheduled via rules

    Version: v12.0.0

    Contributing a Task

    We're tracking reusable Cumulus tasks in this list and, if you've got one you'd like to share with others, you can add it!

    Right now we're focused on tasks distributed via npm, but are open to including others. For now the script that pulls all the data for each package only supports npm.

    The tasks.md file is generated in the build process

    The tasks list in docs/tasks.md is generated from the list of task package names from the tasks folder.

    Do not edit the docs/tasks.md file directly.

    Version: v12.0.0

    Architecture

    Architecture

    Below, find a diagram with the components that comprise an instance of Cumulus.

    Architecture diagram of a Cumulus deployment

    This diagram details all of the major architectural components of a Cumulus deployment.

While the diagram can feel complex, it can easily be broken down into several major components:

    Data Distribution

End Users can access data via Cumulus's distribution submodule, which includes ASF's Thin Egress Application; this provides authenticated data egress, temporary S3 links, and other statistics features.

    End user exposure of Cumulus's holdings is expected to be provided by an external service.

    For NASA use, this is assumed to be CMR in this diagram.

    Data ingest

    Workflows

The core of the ingest and processing capabilities in Cumulus is built into the deployed AWS Step Function workflows. Cumulus rules trigger workflows via CloudWatch rules, Kinesis streams, SNS topics, or SQS queues. The workflows then run with a configured Cumulus message, utilizing built-in processes to report the status of granules, PDRs, executions, etc. to the Data Persistence components.

    Workflows can optionally report granule metadata to CMR, and workflow steps can report metrics information to a shared SNS topic, which could be subscribed to for near real time granule, execution, and PDR status. This could be used for metrics reporting using an external ELK stack, for example.

    Data persistence

Cumulus entity state data is stored in a PostgreSQL-compatible database and is exported to an Elasticsearch instance to provide non-authoritative querying/state data for the API and other applications that require more complex queries. Currently the entity state data is replicated in DynamoDB; this will be removed in a future release.

    Data discovery

    Discovering data for ingest is handled via workflow step components using Cumulus provider and collection configurations and various triggers. Data can be ingested from AWS S3, FTP, HTTPS and more.

    Database

    Cumulus utilizes a user-provided PostgreSQL database backend. For improved API search query efficiency Cumulus provides data replication to an Elasticsearch instance. For legacy reasons, Cumulus is currently also deploying a DynamoDB datastore, and writes are replicated in parallel with the PostgreSQL database writes. The DynamoDB replicated tables and parallel writes will be removed in future releases.

    PostgreSQL Database Schema Diagram

    ERD of the Cumulus Database

    Maintenance

    System maintenance personnel have access to manage ingest and various portions of Cumulus via an AWS API gateway, as well as the operator dashboard.

    Deployment Structure

    Cumulus is deployed via Terraform and is organized internally into two separate top-level modules, as well as several external modules.

    Cumulus

    The Cumulus module, which contains multiple internal submodules, deploys all of the Cumulus components that are not part of the Data Persistence portion of this diagram.

    Data persistence

    The data persistence module provides the Data Persistence portion of the diagram.

    Other modules

Other modules are provided as artifacts on the release page for use by users configuring their own deployment, and contain extracted subcomponents of the cumulus module. For more on these components see the components documentation.

For more on the specific structure, examples of use, how to deploy, and more, please see the deployment docs as well as the cumulus-template-deploy repo.

    Version: v12.0.0

    Cloudwatch Retention

    Our lambdas dump logs to AWS CloudWatch. By default, these logs exist indefinitely. However, there are ways to specify a duration for log retention.

    aws-cli

    In addition to getting your aws-cli set-up, there are two values you'll need to acquire.

1. log-group-name: the name of the log group whose retention policy (retention time) you'd like to change. We'll use /aws/lambda/KinesisInboundLogger in our examples.
    2. retention-in-days: the number of days you'd like to retain the logs in the specified log group for. There is a list of possible values available in the aws logs documentation.

    For example, if we wanted to set log retention to 30 days on our KinesisInboundLogger lambda, we would write:

    aws logs put-retention-policy --log-group-name "/aws/lambda/KinesisInboundLogger" --retention-in-days 30

    Note: The aws-cli log command that we're using is explained in detail here.

    AWS Management Console

    Changing the log retention policy in the AWS Management Console is a fairly simple process:

    1. Navigate to the CloudWatch service in the AWS Management Console.
    2. Click on the Logs entry on the sidebar.
3. Find the Log Group whose retention policy you're interested in changing.
    4. Click on the value in the Expire Events After column.
    5. Enter/Select the number of days you'd like to retain logs in that log group for.

    Screenshot of AWS console showing how to configure the retention period for Cloudwatch logs

    Version: v12.0.0

    Collection Cost Tracking and Storage Best Practices

    Organizing your data is important for metrics you may want to collect. AWS S3 storage and cost metrics are calculated at the bucket level, so it is easy to get metrics by bucket. You can get storage metrics at the key prefix level, but that is done through the CLI, which can be very slow for large buckets. It is very difficult to estimate costs at the prefix level.

    Calculating Storage By Collection

    By bucket

    Usage by bucket can be obtained in your AWS Billing Dashboard via an S3 Usage Report. You can download your usage report for a period of time and review your storage and requests at the bucket level.

    Bucket metrics can also be found in the AWS CloudWatch Metrics Console (also see Using Amazon CloudWatch Metrics).

    Navigate to Storage Metrics and select the BucketName for all buckets you are interested in. The available metrics are BucketSizeInBytes and NumberOfObjects.

    In the Graphed metrics tab, you can select the type of statistic (i.e. average, minimum, maximum) and the period for the stats. At the top, it's useful to select from the dropdown to view the metrics as a number. You can also select the time period for which you want to see stats.

    Alternatively you can query CloudWatch using the CLI.

    This command will return the average number of bytes in the bucket test-bucket for 7/31/2019:

    aws cloudwatch get-metric-statistics --namespace AWS/S3 --start-time 2019-07-31T00:00:00 --end-time 2019-08-01T00:00:00 --period 86400 --statistics Average --region us-east-1 --metric-name BucketSizeBytes --dimensions Name=BucketName,Value=test-bucket Name=StorageType,Value=StandardStorage

    The result looks like:

    {
    "Datapoints": [
    {
    "Timestamp": "2019-07-31T00:00:00Z",
    "Average": 150996467959.0,
    "Unit": "Bytes"
    }
    ],
    "Label": "BucketSizeBytes"
    }

    By key prefix

    AWS does not offer storage and usage statistics at a key prefix level. Via the AWS CLI, you can get the total storage for a bucket or folder. The following command would get the storage for folder example-folder in bucket sample-bucket:

    aws s3 ls --summarize --human-readable --recursive s3://sample-bucket/example-folder | grep 'Total'

    Note that this can be a long-running operation for large buckets.

    Calculating Cost By Collection

    NASA NGAP Environment

    If using an NGAP account, the cost per bucket can be found in your CloudTamer console, in the Financials section of your account information. This is calculated on a monthly basis.

    There is no easy way to get the cost by folder in the buckets. You could calculate an estimate using the storage per prefix vs. the storage of the bucket.
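
For example (illustrative numbers only): if a bucket cost $500 for the month and a given prefix holds 2 TB of the bucket's 10 TB of storage, the estimated cost attributable to that prefix would be 2/10 × $500 = $100.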

    Outside of NGAP

You can enable S3 Cost Allocation Tags and tag your buckets. From there, you can view the cost breakdown in your AWS Billing Dashboard via the Cost Explorer. Cost Allocation Tagging is available at the bucket level.

    There is no easy way to get the cost by folder in the buckets. You could calculate an estimate using the storage per prefix vs. the storage of the bucket.

    Storage Configuration

    Cumulus allows for the configuration of many buckets for your files. Buckets are created and added to your deployment as part of the deployment process.

    In your Cumulus collection configuration, you specify where you want the files to be stored post-processing. This is done by matching a regular expression on the file with the configured bucket.

    Note that in the collection configuration, the bucket field is the key to the buckets variable in the deployment's .tfvars file.

    Organizing By Bucket

    You can specify separate groups of buckets for each collection, which could look like the example below.

    {
      "name": "MOD09GQ",
      "version": "006",
      "granuleId": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$",
      "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
      "files": [
        {
          "bucket": "MOD09GQ-006-protected",
          "regex": "^.*\\.hdf$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf"
        },
        {
          "bucket": "MOD09GQ-006-private",
          "regex": "^.*\\.hdf\\.met$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.met"
        },
        {
          "bucket": "MOD09GQ-006-protected",
          "regex": "^.*\\.cmr\\.xml$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.cmr.xml"
        },
        {
          "bucket": "MOD09GQ-006-public",
          "regex": "^.*\\.jpg$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_ndvi.jpg"
        }
      ]
    }

    Additional collections would go to different buckets.

    Organizing by Key Prefix

    Different collections can be organized into different folders in the same bucket, using the key prefix, which is specified as the url_path in the collection configuration. In this simplified collection configuration example, the url_path field is set at the top level so that all files go to a path prefixed with the collection name and version.

    {
      "name": "MOD09GQ",
      "version": "006",
      "granuleId": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$",
      "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}",
      "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
      "files": [
        {
          "bucket": "protected",
          "regex": "^.*\\.hdf$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf"
        },
        {
          "bucket": "private",
          "regex": "^.*\\.hdf\\.met$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.met"
        },
        {
          "bucket": "protected",
          "regex": "^.*\\.cmr\\.xml$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.cmr.xml"
        },
        {
          "bucket": "public",
          "regex": "^.*\\.jpg$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_ndvi.jpg"
        }
      ]
    }

    In this case, the path to all the files would be: MOD09GQ___006/<filename> in their respective buckets.
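
    With that layout, the per-collection storage described earlier can be estimated by summarizing the collection's prefix, for example (sample-bucket is a placeholder for one of your configured buckets):

    aws s3 ls --summarize --human-readable --recursive s3://sample-bucket/MOD09GQ___006/ | grep 'Total'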

    The url_path can be overridden directly on the file configuration. The example below produces the same result.

    {
      "name": "MOD09GQ",
      "version": "006",
      "granuleId": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$",
      "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
      "files": [
        {
          "bucket": "protected",
          "regex": "^.*\\.hdf$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
          "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
        },
        {
          "bucket": "private",
          "regex": "^.*\\.hdf\\.met$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.met",
          "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
        },
        {
          "bucket": "protected-2",
          "regex": "^.*\\.cmr\\.xml$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.cmr.xml",
          "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
        },
        {
          "bucket": "public",
          "regex": "^.*\\.jpg$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_ndvi.jpg",
          "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
        }
      ]
    }
    - + \ No newline at end of file diff --git a/docs/v12.0.0/configuration/data-management-types/index.html b/docs/v12.0.0/configuration/data-management-types/index.html index a1125c1a11d..77dc9d771ca 100644 --- a/docs/v12.0.0/configuration/data-management-types/index.html +++ b/docs/v12.0.0/configuration/data-management-types/index.html @@ -5,13 +5,13 @@ Cumulus Data Management Types | Cumulus Documentation - +
    Version: v12.0.0

    Cumulus Data Management Types

    What Are The Cumulus Data Management Types

    • Collections: Collections are logical sets of data objects of the same data type and version. They provide contextual information used by Cumulus ingest.
    • Granules: Granules are the smallest aggregation of data that can be independently managed. They are always associated with a collection, which is a grouping of granules.
    • Providers: Providers generate and distribute input data that Cumulus obtains and sends to workflows.
    • Rules: Rules tell Cumulus how to associate providers and collections and when/how to start processing a workflow.
    • Workflows: Workflows are composed of one or more AWS Lambda Functions and ECS Activities to discover, ingest, process, manage, and archive data.
    • Executions: Executions are records of a workflow.
    • Reconciliation Reports: Reports are a comparison of data sets to check to see if they are in agreement and to help Cumulus users detect conflicts.

    Interaction

    • Providers tell Cumulus where to get new data (e.g. S3, HTTPS)
    • Collections tell Cumulus where to store the data files
    • Rules tell Cumulus when to trigger a workflow execution and tie providers and collections together

    Managing Data Management Types

    The following are created via the dashboard or API:

    • Providers
    • Collections
    • Rules
    • Reconciliation reports

    Granules are created by workflow executions and then can be managed via the dashboard or API.

    An execution record is created for each workflow execution triggered and can be viewed in the dashboard or data can be retrieved via the API.

    Workflows are created and managed via the Cumulus deployment.

    Configuration Fields

    Schemas

    Looking at our API schema definitions can provide us with some insight into collections, providers, rules, and their attributes (and whether those are required or not). The schemas for the different concepts will be referenced throughout this document.

    The schemas are extremely useful for understanding which attributes are configurable and which of those are required. Cumulus uses these schemas for validation.

    Providers

    Please note:

    • While connection configuration is defined here, items specific to a particular ingest setup (e.g. 'What target directory should we be pulling from' or 'How is duplicate handling configured?') are generally defined in a Rule or Collection, not the Provider.
    • There is some provider behavior which is controlled by task-specific configuration and not the provider definition. This configuration has to be set on a per-workflow basis. For example, see the httpListTimeout configuration on the discover-granules task.

    Provider Configuration

    The Provider configuration is defined by a JSON object that takes different configuration keys depending on the provider type. The following are definitions of typical configuration values relevant for the various providers:

    Configuration by provider type

    S3

    • id (string, required): Unique identifier for the provider
    • globalConnectionLimit (integer, optional): Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited
    • protocol (string, required): The protocol for this provider. Must be s3 for this provider type
    • host (string, required): S3 Bucket to pull data from

    http

    • id (string, required): Unique identifier for the provider
    • globalConnectionLimit (integer, optional): Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited
    • protocol (string, required): The protocol for this provider. Must be http for this provider type
    • host (string, required): The host to pull data from (e.g. nasa.gov)
    • username (string, optional): Configured username for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication
    • password (string, required only if username is specified): Configured password for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication
    • port (integer, optional): Port to connect to the provider on. Defaults to 80
    • allowedRedirects (string[], optional): Only hosts in this list will have the provider username/password forwarded for authentication. Entries should be specified as host.com or host.com:7000 if the redirect port is different than the provider port
    • certificateUri (string, optional): SSL Certificate S3 URI for custom or self-signed SSL (TLS) certificate

    https

    • id (string, required): Unique identifier for the provider
    • globalConnectionLimit (integer, optional): Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited
    • protocol (string, required): The protocol for this provider. Must be https for this provider type
    • host (string, required): The host to pull data from (e.g. nasa.gov)
    • username (string, optional): Configured username for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication
    • password (string, required only if username is specified): Configured password for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication
    • port (integer, optional): Port to connect to the provider on. Defaults to 443
    • allowedRedirects (string[], optional): Only hosts in this list will have the provider username/password forwarded for authentication. Entries should be specified as host.com or host.com:7000 if the redirect port is different than the provider port
    • certificateUri (string, optional): SSL Certificate S3 URI for custom or self-signed SSL (TLS) certificate

    ftp

    • id (string, required): Unique identifier for the provider
    • globalConnectionLimit (integer, optional): Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited
    • protocol (string, required): The protocol for this provider. Must be ftp for this provider type
    • host (string, required): The ftp host to pull data from (e.g. nasa.gov)
    • username (string, optional): Username to use to connect to the ftp server. Cumulus encrypts this using KMS. Defaults to anonymous if not defined
    • password (string, optional): Password to use to connect to the ftp server. Cumulus encrypts this using KMS. Defaults to password if not defined
    • port (integer, optional): Port to connect to the provider on. Defaults to 21

    sftp

    • id (string, required): Unique identifier for the provider
    • globalConnectionLimit (integer, optional): Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited
    • protocol (string, required): The protocol for this provider. Must be sftp for this provider type
    • host (string, required): The sftp host to pull data from (e.g. nasa.gov)
    • username (string, optional): Username to use to connect to the sftp server
    • password (string, optional): Password to use to connect to the sftp server
    • port (integer, optional): Port to connect to the provider on. Defaults to 22
    • privateKey (string, optional): filename assumed to be in s3://bucketInternal/stackName/crypto
    • cmKeyId (string, optional): AWS KMS Customer Master Key arn or alias

    Collections

    Breakdown of s3_MOD09GQ_006.json (https://github.com/nasa/cumulus/blob/master/example/data/collections/s3_MOD09GQ_006/s3_MOD09GQ_006.json):

    • name ("MOD09GQ", required): The name attribute designates the name of the collection. This is the name under which the collection will be displayed on the dashboard
    • version ("006", required): A version tag for the collection
    • granuleId ("^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$", required): The regular expression used to validate the granule ID extracted from filenames according to the granuleIdExtraction
    • granuleIdExtraction ("(MOD09GQ\..*)(\.hdf|\.cmr|_ndvi\.jpg)", required): The regular expression used to extract the granule ID from filenames. The first capturing group extracted from the filename by the regex will be used as the granule ID
    • sampleFileName ("MOD09GQ.A2017025.h21v00.006.2017034065104.hdf", required): An example filename belonging to this collection
    • files (<JSON Object> of files, defined below, required): Describe the individual files that will exist for each granule in this collection (size, browse, meta, etc.)
    • dataType ("MOD09GQ", optional): Can be specified, but this value will default to the collection_name if not
    • duplicateHandling ("replace", optional): ("replace"|"version"|"skip") determines the granule duplicate handling scheme
    • ignoreFilesConfigForDiscovery (false by default, optional): By default, during discovery only files that match one of the regular expressions in this collection's files attribute (see above) are ingested. Setting this to true will ignore the files attribute during discovery, meaning that all files for a granule (i.e., all files with filenames matching granuleIdExtraction) will be ingested even when they don't match a regular expression in the files attribute at discovery time. (NOTE: this attribute does not appear in the example file, but is listed here for completeness.)
    • process ("modis", optional): Example options for this are found in the ChooseProcess step definition in the IngestAndPublish workflow definition
    • meta (<JSON Object> of MetaData for the collection, optional): MetaData for the collection. This metadata will be available to workflows for this collection via the Cumulus Message Adapter
    • url_path ("{cmrMetadata.Granule.Collection.ShortName}/{substring(file.fileName, 0, 3)}", optional): Filename without extension

    files-object

    • regex ("^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\.hdf$", required): Regular expression used to identify the file
    • sampleFileName ("MOD09GQ.A2017025.h21v00.006.2017034065104.hdf", required): Filename used to validate the provided regex
    • type ("data", optional): Value to be assigned to the Granule File Type. CNM types are used by Cumulus CMR steps, non-CNM values will be treated as 'data' type. Currently only utilized in DiscoverGranules task
    • bucket ("internal", required): Name of the bucket where the file will be stored
    • url_path ("${collectionShortName}/{substring(file.fileName, 0, 3)}", optional): Folder used to save the granule in the bucket. Defaults to the collection url_path
    • checksumFor ("^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\.hdf$", optional): If this is a checksum file, set checksumFor to the regex of the target file

    Rules

    Rules are used to start processing workflows and the transformation process. Rules can be invoked manually, run on a schedule, or be triggered by Kinesis events, SNS messages, or SQS messages.

    Rule configuration
    • name ("L2_HR_PIXC_kinesisRule", required): Name of the rule. This is the name under which the rule will be listed on the dashboard
    • workflow ("CNMExampleWorkflow", required): Name of the workflow to be run. A list of available workflows can be found on the Workflows page
    • provider ("PODAAC_SWOT", optional): Configured provider's ID. This can be found on the Providers dashboard page
    • collection (<JSON Object> collection object shown below, required): Name and version of the collection this rule will moderate. Relates to a collection configured and found in the Collections page
    • payload (<JSON Object or Array>, optional): The payload to be passed to the workflow
    • meta (<JSON Object> of MetaData for the rule, optional): MetaData for the rule. This metadata will be available to workflows for this rule via the Cumulus Message Adapter
    • rule (<JSON Object> rule type and associated values, discussed below, required): Object defining the type and subsequent attributes of the rule
    • state ("ENABLED", optional): ("ENABLED"|"DISABLED") whether or not the rule will be active. Defaults to "ENABLED"
    • queueUrl (https://sqs.us-east-1.amazonaws.com/1234567890/queue-name, optional): URL for SQS queue that will be used to schedule workflows for this rule
    • tags (["kinesis", "podaac"], optional): An array of strings that can be used to simplify search

    collection-object

    • name ("L2_HR_PIXC", required): Name of a collection defined/configured in the Collections dashboard page
    • version ("000", required): Version number of a collection defined/configured in the Collections dashboard page

    meta-object

    • retries (3, optional): Number of retries on errors, for sqs-type rule only. Defaults to 3
    • visibilityTimeout (900, optional): VisibilityTimeout in seconds for the inflight messages, for sqs-type rule only. Defaults to the visibility timeout of the SQS queue when the rule is created

    rule-object

    • type ("kinesis", required): ("onetime"|"scheduled"|"kinesis"|"sns"|"sqs") type of scheduling/workflow kick-off desired
    • value (<String> Object, required depending on type): Discussion of valid values is below

    rule-value

    The rule.value entry depends on the rule type:

    • If this is a onetime rule this can be left blank. Example
    • If this is a scheduled rule this field must hold a valid cron-type expression or rate expression.
    • If this is a kinesis rule, this must be a configured ${Kinesis_stream_ARN}. Example
    • If this is an sns rule, this must be an existing ${SNS_Topic_Arn}. Example
    • If this is an sqs rule, this must be an existing ${SQS_QueueUrl} that your account has permissions to access, and also you must configure a dead-letter queue for this SQS queue. Example

    sqs-type rule features

    • When an SQS rule is triggered, the SQS message remains on the queue.
    • The SQS message is not processed multiple times in parallel when visibility timeout is properly set. You should set the visibility timeout to the maximum expected length of the workflow with padding. Longer is better to avoid parallel processing.
    • The SQS message visibility timeout can be overridden by the rule.
    • Upon successful workflow execution, the SQS message is removed from the queue.
    • Upon failed execution(s), the workflow is run 3 times by default, or the configured number of times.
    • Upon failed execution(s), the visibility timeout will be set to 5s to allow retries.
    • After the configured number of failed retries, the SQS message is moved to the dead-letter queue configured for the SQS queue (see the sketch after this list for creating a queue with an associated dead-letter queue).
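
    A minimal sketch of creating such a queue pair with the AWS CLI, assuming hypothetical queue names, a us-east-1 region, and the placeholder account ID 123456789012 (the RedrivePolicy is supplied via a small JSON file because its value is itself a JSON string):

    aws sqs create-queue --queue-name example-rule-dead-letter-queue
    aws sqs create-queue --queue-name example-rule-queue

    With a file named redrive.json containing:

    {
      "RedrivePolicy": "{\"deadLetterTargetArn\":\"arn:aws:sqs:us-east-1:123456789012:example-rule-dead-letter-queue\",\"maxReceiveCount\":\"3\"}",
      "VisibilityTimeout": "900"
    }

    attach it to the rule queue:

    aws sqs set-queue-attributes --queue-url https://sqs.us-east-1.amazonaws.com/123456789012/example-rule-queue --attributes file://redrive.json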

    Configuration Via Cumulus Dashboard

    Create A Provider

    • In the Cumulus dashboard, go to the Provider page.

    Screenshot of Create Provider form

    • Click on Add Provider.
    • Fill in the form and then submit it.

    Screenshot of Create Provider form

    Create A Collection

    • Go to the Collections page.

    Screenshot of the Collections page

    • Click on Add Collection.
    • Copy and paste or fill in the collection JSON object form.

    Screenshot of Add Collection form

    • Once you submit the form, you should be able to verify that your new collection is in the list.

    Create A Rule

    1. Go To Rules Page
    • Go to the Cumulus dashboard, click on Rules in the navigation.
    • Click Add Rule.

    Screenshot of Rules page

    2. Complete Form
    • Fill out the template form.

    Screenshot of a Rules template for adding a new rule

    For more details regarding the field definitions and required information go to Data Cookbooks.

    Note: If the state field is left blank, it defaults to false.

    Rule Examples

    • A rule form with completed required fields:

    Screenshot of a completed rule form

    • A successfully added Rule:

    Screenshot of created rule

    - + \ No newline at end of file diff --git a/docs/v12.0.0/configuration/lifecycle-policies/index.html b/docs/v12.0.0/configuration/lifecycle-policies/index.html index 324efe5a292..059852aa322 100644 --- a/docs/v12.0.0/configuration/lifecycle-policies/index.html +++ b/docs/v12.0.0/configuration/lifecycle-policies/index.html @@ -5,13 +5,13 @@ Setting S3 Lifecycle Policies | Cumulus Documentation - +
    Version: v12.0.0

    Setting S3 Lifecycle Policies

    This document will outline, in brief, how to set data lifecycle policies so that you are more easily able to control data storage costs while keeping your data accessible. For more information on why you might want to do this, see the 'Additional Information' section at the end of the document.

    Requirements

    • The AWS CLI installed and configured (if you wish to run the CLI example). See AWS's guide to setting up the AWS CLI for more on this. Please ensure the AWS CLI is in your shell path.
    • You will need an S3 bucket on AWS. You are strongly encouraged to use a bucket without voluminous amounts of data in it for experimenting/learning.
    • An AWS user with the appropriate roles to access the target bucket as well as modify bucket policies.

    Examples

    Walk-through on setting time-based S3 Infrequent Access (S3IA) bucket policy

    This example will give step-by-step instructions on updating a bucket's lifecycle policy to move all objects in the bucket from the default storage to S3 Infrequent Access (S3IA) after a period of 90 days. Below are instructions for walking through configuration via the command line and the management console.

    Command Line

    Please ensure you have the AWS CLI installed and configured for access prior to attempting this example.

    Create policy

    From any directory you choose, open an editor and add the following to a file named exampleRule.json:

    {
      "Rules": [
        {
          "Status": "Enabled",
          "Filter": {
            "Prefix": ""
          },
          "Transitions": [
            {
              "Days": 90,
              "StorageClass": "STANDARD_IA"
            }
          ],
          "NoncurrentVersionTransitions": [
            {
              "NoncurrentDays": 90,
              "StorageClass": "STANDARD_IA"
            }
          ],
          "ID": "90DayS3IAExample"
        }
      ]
    }

    Set policy

    On the command line run the following command (with the bucket you're working with substituted in place of yourBucketNameHere).

    aws s3api put-bucket-lifecycle-configuration --bucket yourBucketNameHere --lifecycle-configuration file://exampleRule.json

    Verify policy has been set

    To obtain all of the existing policies for a bucket, run the following command (again substituting the correct bucket name):

     $ aws s3api get-bucket-lifecycle-configuration --bucket yourBucketNameHere
    {
        "Rules": [
            {
                "Status": "Enabled",
                "Filter": {
                    "Prefix": ""
                },
                "Transitions": [
                    {
                        "Days": 90,
                        "StorageClass": "STANDARD_IA"
                    }
                ],
                "NoncurrentVersionTransitions": [
                    {
                        "NoncurrentDays": 90,
                        "StorageClass": "STANDARD_IA"
                    }
                ],
                "ID": "90DayS3IAExample"
            }
        ]
    }

    You have set a policy that transitions any version of an object in the bucket to S3IA after each object version has not been modified for 90 days.
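
    To spot-check an individual object after the transition window has passed, you can inspect its storage class (the key below is a hypothetical placeholder); the StorageClass field is only returned for objects that are no longer in the STANDARD class:

    aws s3api head-object --bucket yourBucketNameHere --key example/path/to/object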

    Management Console

    Create Policy

    To create the example policy on a bucket via the management console, go to the following URL (replacing 'yourBucketHere' with the bucket you intend to update):

    https://s3.console.aws.amazon.com/s3/buckets/yourBucketHere/?tab=overview

    You should see a screen similar to:

    Screenshot of AWS console for an S3 bucket

    Click the "Management" Tab, then lifecycle button and press + Add lifecycle rule:

    Screenshot of "Management" tab of AWS console for an S3 bucket

    Give the rule a name (e.g. '90DayRule'), leaving the filter blank:

    Screenshot of window for configuring the name and scope of a lifecycle rule on an S3 bucket in the AWS console

    Click next, and mark Current Version and Previous Versions.

    Then for each, click + Add transition and select Transition to Standard-IA after for the Object creation field, and set 90 for the Days after creation/Days after objects become noncurrent field. Your screen should look similar to:

    Screenshot of window for configuring the storage class transitions of a lifecycle rule on an S3 bucket in the AWS console

    Click next, then next past the Configure expiration screen (we won't be setting this), and on the fourth page, click Save:

    Screenshot of window for reviewing the configuration of a lifecycle rule on an S3 bucket in the AWS console

    You should now see you have a rule configured for your bucket:

    Screenshot of lifecycle rule appearing in the "Management" tab of AWS console for an S3 bucket

    You have now set a policy that transitions any version of an object in the bucket to S3IA after each object has not been modified for 90 days.

    Additional Information

    This section lists information you may want prior to enacting lifecycle policies. It is not required content for working through the examples.

    Strategy Overview

    For a discussion of overall recommended strategy, please review the Methodology for Data Lifecycle Management on the EarthData wiki.

    AWS Documentation

    The examples shown in this document are fairly basic cases. By using object tags, filters and other configuration options you can enact far more complicated policies for various scenarios. For more reading on the topics presented on this page see:

    - + \ No newline at end of file diff --git a/docs/v12.0.0/configuration/monitoring-readme/index.html b/docs/v12.0.0/configuration/monitoring-readme/index.html index c22978a835c..27bf4a48cc2 100644 --- a/docs/v12.0.0/configuration/monitoring-readme/index.html +++ b/docs/v12.0.0/configuration/monitoring-readme/index.html @@ -5,14 +5,14 @@ Monitoring Best Practices | Cumulus Documentation - +
    Version: v12.0.0

    Monitoring Best Practices

    This document intends to provide a set of recommendations and best practices for monitoring the state of a deployed Cumulus and diagnosing any issues.

    Cumulus-provided resources and integrations for monitoring

    Cumulus provides a number of resources that are useful for monitoring the system and its operation.

    Cumulus Dashboard

    The primary tool for monitoring the Cumulus system is the Cumulus Dashboard. The dashboard is hosted on Github and includes instructions on how to deploy and link it into your core Cumulus deployment.

    The dashboard displays workflow executions, their status, inputs, outputs, and some diagnostic information such as logs. For further information on the dashboard, its usage, and the information it provides, see the documentation.

    Cumulus-provided AWS resources

    Cumulus sets up CloudWatch log groups for all Core-provided tasks.

    Monitoring Lambda Functions

    Logging for each Lambda Function is available in Lambda-specific CloudWatch log groups.

    Monitoring ECS services

    Each deployed cumulus_ecs_service module also includes a CloudWatch log group for the processes running on ECS.

    Monitoring workflows

    For advanced debugging, we also configure dead letter queues on critical system functions. These will allow you to monitor and debug invalid inputs to the functions we use to start workflows, which can be helpful if you find that you are not seeing workflows being started as expected. More information on these can be found in the dead letter queue documentation.

    AWS recommendations

    AWS has a number of recommendations on system monitoring. Rather than reproduce those here and risk providing outdated guidance, we've documented the following links which will take you to available AWS docs on monitoring recommendations and best practices for the services used in Cumulus:

    Example: Setting up email notifications for CloudWatch logs

    Cumulus does not provide out-of-the-box support for email notifications at this time. However, setting up email notifications on AWS is fairly straightforward in that the operative components are an AWS SNS topic and a subscribed email address.

    In terms of Cumulus integration, forwarding CloudWatch logs requires creating a mechanism, most likely a Lambda Function subscribed to the log group that will receive, filter and forward these messages to the SNS topic.

    As a very simple example, we could create a function that filters CloudWatch logs created by the @cumulus/logger package and sends email notifications for error and fatal log levels, adapting the example linked above:

    const zlib = require('zlib');
    const aws = require('aws-sdk');
    const { promisify } = require('util');

    const gunzip = promisify(zlib.gunzip);
    const sns = new aws.SNS();

    exports.handler = async (event) => {
      const payload = Buffer.from(event.awslogs.data, 'base64');
      const decompressedData = await gunzip(payload);
      const logData = JSON.parse(decompressedData.toString('ascii'));
      return await Promise.all(logData.logEvents.map(async (logEvent) => {
        const logMessage = JSON.parse(logEvent.message);
        if (['error', 'fatal'].includes(logMessage.level)) {
          return sns.publish({
            TopicArn: process.env.EmailReportingTopicArn,
            Message: logEvent.message
          }).promise();
        }
        return Promise.resolve();
      }));
    };

    After creating the SNS topic, we can deploy this code as a Lambda function, following the setup steps from Amazon. Make sure to include your SNS topic ARN as an environment variable on the Lambda function by using the --environment option on aws lambda create-function.
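
    As a hedged sketch of that call, assuming a hypothetical function name, placeholder role and topic ARNs, and a Node.js runtime of your choice (the execution role must allow sns:Publish to your topic):

    aws lambda create-function \
      --function-name cloudwatch-log-email-forwarder \
      --runtime nodejs16.x \
      --handler index.handler \
      --zip-file fileb://function.zip \
      --role arn:aws:iam::123456789012:role/example-lambda-execution-role \
      --environment "Variables={EmailReportingTopicArn=arn:aws:sns:us-east-1:123456789012:example-email-reporting-topic}"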

    You will need to create subscription filters for each log group you want to receive emails for. We recommend automating this as much as possible, and you could very well handle this via Terraform, such as using a module to deploy filters alongside log groups, or exporting the log group names to an all-in-one email notification module.
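
    If you want to wire up a single log group by hand before automating it, a minimal CLI sketch follows; the function name, log group name, ARNs, and account ID are placeholders:

    # Allow CloudWatch Logs to invoke the forwarding Lambda
    aws lambda add-permission \
      --function-name cloudwatch-log-email-forwarder \
      --statement-id example-log-subscription \
      --principal logs.amazonaws.com \
      --action lambda:InvokeFunction \
      --source-arn "arn:aws:logs:us-east-1:123456789012:log-group:/aws/lambda/example-task:*"

    # Subscribe the Lambda to the log group (an empty pattern forwards every log event)
    aws logs put-subscription-filter \
      --log-group-name /aws/lambda/example-task \
      --filter-name email-error-notifications \
      --filter-pattern "" \
      --destination-arn arn:aws:lambda:us-east-1:123456789012:function:cloudwatch-log-email-forwarder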

    - + \ No newline at end of file diff --git a/docs/v12.0.0/configuration/server_access_logging/index.html b/docs/v12.0.0/configuration/server_access_logging/index.html index 6cfb6b5c895..09f96a49462 100644 --- a/docs/v12.0.0/configuration/server_access_logging/index.html +++ b/docs/v12.0.0/configuration/server_access_logging/index.html @@ -5,13 +5,13 @@ S3 Server Access Logging | Cumulus Documentation - +
    Version: v12.0.0

    S3 Server Access Logging

    Via AWS Console

    Enable server access logging for an S3 bucket

    Via AWS Command Line Interface

    1. Create a logging.json file with these contents, replacing <stack-internal-bucket> with your stack's internal bucket name, and <stack> with the name of your cumulus stack.

      {
        "LoggingEnabled": {
          "TargetBucket": "<stack-internal-bucket>",
          "TargetPrefix": "<stack>/ems-distribution/s3-server-access-logs/"
        }
      }
    2. Add the logging policy to each of your protected and public buckets by calling this command on each bucket (a loop sketch for applying it to several buckets follows this list).

      aws s3api put-bucket-logging --bucket <protected/public-bucket-name> --bucket-logging-status file://logging.json
    3. Verify the logging policy exists on your buckets.

      aws s3api get-bucket-logging --bucket <protected/public-bucket-name>
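
    If you have several protected and public buckets, a small shell loop applies the same policy to each (the bucket names below are placeholders for your own):

    for bucket in example-stack-protected example-stack-public; do
      aws s3api put-bucket-logging --bucket "$bucket" --bucket-logging-status file://logging.json
    done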
    - + \ No newline at end of file diff --git a/docs/v12.0.0/configuration/task-configuration/index.html b/docs/v12.0.0/configuration/task-configuration/index.html index a9f25647ca1..bfa03df0146 100644 --- a/docs/v12.0.0/configuration/task-configuration/index.html +++ b/docs/v12.0.0/configuration/task-configuration/index.html @@ -5,13 +5,13 @@ Configuration of Tasks | Cumulus Documentation - +
    Version: v12.0.0

    Configuration of Tasks

    The cumulus module exposes values for configuration for some of the provided archive and ingest tasks. Currently the following are available as configurable variables:

    cmr_search_client_config

    Configuration parameters for CMR search client for cumulus archive module tasks in the form:

    <lambda_identifier>_report_cmr_limit = <maximum number of records that can be returned from a cmr-client search; this should be greater than cmr_page_size>
    <lambda_identifier>_report_cmr_page_size = <number of records for each page returned from CMR>
    type = map(string)

    More information about the CMR limit and CMR page_size parameters can be found in @cumulus/cmr-client and the CMR Search API documentation.

    Currently the following values are supported:

    • create_reconciliation_report_cmr_limit
    • create_reconciliation_report_cmr_page_size

    Example

    cmr_search_client_config = {
      create_reconciliation_report_cmr_limit = 2500
      create_reconciliation_report_cmr_page_size = 250
    }

    elasticsearch_client_config

    Configuration parameters for Elasticsearch client for cumulus archive module tasks in the form:

    <lambda_identifier>_es_scroll_duration = <duration>
    <lambda_identifier>_es_scroll_size = <size>
    type = map(string)

    Currently the following values are supported:

    • create_reconciliation_report_es_scroll_duration
    • create_reconciliation_report_es_scroll_size

    Example

    elasticsearch_client_config = {
      create_reconciliation_report_es_scroll_duration = "15m"
      create_reconciliation_report_es_scroll_size = 2000
    }

    lambda_timeouts

    A configurable map of timeouts (in seconds) for cumulus ingest module task lambdas in the form:

    <lambda_identifier>_timeout: <timeout>
    type = map(string)

    Currently the following values are supported:

    • discover_granules_task_timeout
    • discover_pdrs_task_timeout
    • hyrax_metadata_update_tasks_timeout
    • lzards_backup_task_timeout
    • move_granules_task_timeout
    • parse_pdr_task_timeout
    • pdr_status_check_task_timeout
    • post_to_cmr_task_timeout
    • queue_granules_task_timeout
    • queue_pdrs_task_timeout
    • queue_workflow_task_timeout
    • sync_granule_task_timeout
    • update_granules_cmr_metadata_file_links_task_timeout

    Example

    lambda_timeouts = {
      discover_granules_task_timeout = 300
    }
    - + \ No newline at end of file diff --git a/docs/v12.0.0/data-cookbooks/about-cookbooks/index.html b/docs/v12.0.0/data-cookbooks/about-cookbooks/index.html index f3fe98e506b..ca061309436 100644 --- a/docs/v12.0.0/data-cookbooks/about-cookbooks/index.html +++ b/docs/v12.0.0/data-cookbooks/about-cookbooks/index.html @@ -5,13 +5,13 @@ About Cookbooks | Cumulus Documentation - +
    Version: v12.0.0

    About Cookbooks

    Introduction

    The data cookbooks are documents containing examples and explanations of workflows in the Cumulus framework. They should also help unify an institution/user group on a set of terms.

    Setup

    The data cookbooks assume you can configure providers, collections, and rules to run workflows. Visit Cumulus data management types for information on how to configure Cumulus data management types.

    Adding a page

    As shown in detail in the "Add a New Page and Sidebars" section in Cumulus Docs: How To's, you can add a new page to the data cookbook by creating a markdown (.md) file in the docs/data-cookbooks directory. The new page can then be linked to the sidebar by adding it to the Data-Cookbooks object in the website/sidebar.json file as data-cookbooks/${id}.

    More about workflows

    Workflow general information

    Input & Output

    Developing Workflow Tasks

    Workflow Configuration How-to's

    - + \ No newline at end of file diff --git a/docs/v12.0.0/data-cookbooks/browse-generation/index.html b/docs/v12.0.0/data-cookbooks/browse-generation/index.html index 4608829daf8..1b49396b816 100644 --- a/docs/v12.0.0/data-cookbooks/browse-generation/index.html +++ b/docs/v12.0.0/data-cookbooks/browse-generation/index.html @@ -5,7 +5,7 @@ Ingest Browse Generation | Cumulus Documentation - + @@ -15,7 +15,7 @@ provider keys with the previously entered values) Note that you need to set the "provider_path" to the path on your bucket (e.g. "/data") that you've staged your mock/test data.:

    {
      "name": "TestBrowseGeneration",
      "workflow": "DiscoverGranulesBrowseExample",
      "provider": "{{provider_from_previous_step}}",
      "collection": {
        "name": "MOD09GQ",
        "version": "006"
      },
      "meta": {
        "provider_path": "{{path_to_data}}"
      },
      "rule": {
        "type": "onetime"
      },
      "state": "ENABLED",
      "updatedAt": 1553053438767
    }

    Run Workflows

    Once you've configured the Collection and Provider and added a onetime rule, you're ready to trigger your rule, and watch the ingest workflows process.

    Go to the Rules tab, click the rule you just created:

    Screenshot of the Rules overview page with a list of rules in the Cumulus dashboard

    Then click the gear in the upper right corner and click "Rerun":

    Screenshot of clicking the button to rerun a workflow rule from the rule edit page in the Cumulus dashboard

    Tab over to executions and you should see the DiscoverGranulesBrowseExample workflow run, succeed, and then moments later the CookbookBrowseExample should run and succeed.

    Screenshot of page listing executions in the Cumulus dashboard

    Results

    You can verify your data has ingested by clicking the successful workflow entry:

    Screenshot of individual entry from table listing executions in the Cumulus dashboard

    Select "Show Output" on the next page

    Screenshot of "Show output" button from individual execution page in the Cumulus dashboard

    and you should see in the payload from the workflow something similar to:

    "payload": {
      "process": "modis",
      "granules": [
        {
          "files": [
            {
              "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
              "key": "MOD09GQ___006/2017/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
              "type": "data",
              "bucket": "cumulus-test-sandbox-protected",
              "path": "data",
              "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}/{substring(file.fileName, 0, 3)}",
              "size": 1908635
            },
            {
              "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
              "key": "MOD09GQ___006/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
              "type": "metadata",
              "bucket": "cumulus-test-sandbox-private",
              "path": "data",
              "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{substring(file.fileName, 0, 3)}",
              "size": 21708
            },
            {
              "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
              "key": "MOD09GQ___006/2017/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
              "type": "browse",
              "bucket": "cumulus-test-sandbox-protected",
              "path": "data",
              "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}/{substring(file.fileName, 0, 3)}",
              "size": 1908635
            },
            {
              "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.cmr.xml",
              "key": "MOD09GQ___006/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.cmr.xml",
              "type": "metadata",
              "bucket": "cumulus-test-sandbox-protected-2",
              "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{substring(file.fileName, 0, 3)}"
            }
          ],
          "cmrLink": "https://cmr.uat.earthdata.nasa.gov/search/granules.json?concept_id=G1222231611-CUMULUS",
          "cmrConceptId": "G1222231611-CUMULUS",
          "granuleId": "MOD09GQ.A2016358.h13v04.006.2016360104606",
          "cmrMetadataFormat": "echo10",
          "dataType": "MOD09GQ",
          "version": "006",
          "published": true
        }
      ]
    }

    You can verify the granules exist within your Cumulus instance (search using the Granules interface, check the S3 buckets, etc.) and validate that the above CMR entry exists.
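
    As a quick command-line check, you could list the destination prefixes using the bucket names from the example output above (your bucket names will differ):

    aws s3 ls s3://cumulus-test-sandbox-protected/MOD09GQ___006/ --recursive
    aws s3 ls s3://cumulus-test-sandbox-private/MOD09GQ___006/ --recursive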


    Build Processing Lambda

    This section discusses the construction of a custom processing lambda to replace the contrived example from this entry for a real dataset processing task.

    To ingest your own data using this example, you will need to construct your own lambda to replace the source in ProcessingStep that will generate browse imagery and provide or update a CMR metadata export file.

    You will then need to add the lambda to your Cumulus deployment as an aws_lambda_function Terraform resource.

    The discussion below outlines requirements for this lambda.

    Inputs

    The incoming message to the task defined in the ProcessingStep as configured will have the following configuration values (accessible inside event.config courtesy of the message adapter):

    Configuration

    • event.config.bucket -- the name of the bucket configured in terraform.tfvars as your internal bucket.

    • event.config.collection -- The full collection object we will configure in the Configure Ingest section. You can view the expected collection schema in the docs here or in the source code on github. You need this as available input and output so you can update as needed.

    event.config.additionalUrls, generateFakeBrowse and event.config.cmrMetadataFormat from the example can be ignored as they're configuration flags for the provided example script.

    Payload

    The 'payload' from the previous task is accessible via event.input. The expected payload output schema from SyncGranules can be viewed here.

    In our example, the payload would look like the following. Note: The types are set per-file based on what we configured in our collection, and were initially added as part of the DiscoverGranules step in the DiscoverGranulesBrowseExample workflow.

    "payload": {
      "process": "modis",
      "granules": [
        {
          "granuleId": "MOD09GQ.A2016358.h13v04.006.2016360104606",
          "dataType": "MOD09GQ",
          "version": "006",
          "files": [
            {
              "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
              "bucket": "cumulus-test-sandbox-internal",
              "key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
              "size": 1908635
            },
            {
              "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
              "bucket": "cumulus-test-sandbox-internal",
              "key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
              "size": 21708
            }
          ]
        }
      ]
    }

    Generating Browse Imagery

    The provided example script used in the example goes through all granules and adds a 'fake' .jpg browse file to the same staging location as the data staged by prior ingest tasks.

    The processing lambda you construct will need to do the following:

    • Create a browse image file based on the input data, and stage it to a location accessible to both this task and the FilesToGranules and MoveGranules tasks in an S3 bucket.
    • Add the browse file to the input granule files, making sure to set the granule file's type to browse.
    • Update meta.input_granules with the updated granules list, as well as provide the files to be integrated by FilesToGranules as output from the task.

    Generating/updating CMR metadata

    If you do not already have a CMR file in the granules list, you will need to generate one for valid export. This example's processing script generates and adds it to the FilesToGranules file list via the payload but it can be present in the InputGranules from the DiscoverGranules task as well if you'd prefer to pre-generate it.

    The downstream tasks MoveGranules, UpdateGranulesCmrMetadataFileLinks, and PostToCmr all expect a valid CMR file to be available if you want to export to CMR.

    Expected Outputs for processing task/tasks

    In the above example, the critical portion of the output to FilesToGranules is the payload and meta.input_granules.

    In the example provided, the processing task is set up to return an object with the keys "files" and "granules". In the cumulus_message configuration, the outputs are mapped so that files goes to the payload and granules to meta.input_granules:

    "task_config": {
      "inputGranules": "{$.meta.input_granules}",
      "granuleIdExtraction": "{$.meta.collection.granuleIdExtraction}"
    }

    Their expected values from the example above may be useful in constructing a processing task:

    payload

    The payload includes a full list of files to be 'moved' into the cumulus archive. The FilesToGranules task will take this list, merge it with the information from InputGranules, then pass that list to the MoveGranules task. The MoveGranules task will then move the files to their targets. The UpdateGranulesCmrMetadataFileLinks task will update the CMR metadata file if it exists with the updated granule locations and update the CMR file etags.

    In the provided example, a payload being passed to the FilesToGranules task should be expected to look like:

    "payload": [
      "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
      "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
      "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
      "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.cmr.xml"
    ]

    This is the list of staged files that FilesToGranules will act upon to add/merge with the input_granules object.

    The pathing is generated from sync-granules, but in principle the files can be staged wherever you like so long as the processing/MoveGranules task's roles have access and the filename matches the collection configuration.

    input_granules

    The FilesToGranules task utilizes the incoming payload to choose which files to move, but pulls all other metadata from meta.input_granules. As such, the meta.input_granules output in the example would look like the following:

    "input_granules": [
      {
        "granuleId": "MOD09GQ.A2016358.h13v04.006.2016360104606",
        "dataType": "MOD09GQ",
        "version": "006",
        "files": [
          {
            "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
            "bucket": "cumulus-test-sandbox-internal",
            "key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
            "size": 1908635
          },
          {
            "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
            "bucket": "cumulus-test-sandbox-internal",
            "key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
            "size": 21708
          },
          {
            "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
            "bucket": "cumulus-test-sandbox-internal",
            "key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.jpg"
          }
        ]
      }
    ],
    - + \ No newline at end of file diff --git a/docs/v12.0.0/data-cookbooks/choice-states/index.html b/docs/v12.0.0/data-cookbooks/choice-states/index.html index 298a01519a1..599cf02e6d0 100644 --- a/docs/v12.0.0/data-cookbooks/choice-states/index.html +++ b/docs/v12.0.0/data-cookbooks/choice-states/index.html @@ -5,13 +5,13 @@ Choice States | Cumulus Documentation - +
    Version: v12.0.0

    Choice States

    Cumulus supports AWS Step Function Choice states. A Choice state enables branching logic in Cumulus workflows.

    Choice state definitions include a list of Choice Rules. Each Choice Rule defines a logical operation which compares an input value against a value using a comparison operator. For available comparison operators, review the AWS docs.

    If the comparison evaluates to true, the Next state is followed.

    Example

    In examples/cumulus-tf/parse_pdr_workflow.tf the ParsePdr workflow uses a Choice state, CheckAgainChoice, to terminate the workflow once meta.isPdrFinished: true is returned by the CheckStatus state.

    The CheckAgainChoice state definition requires an input object of the following structure:

    {
      "meta": {
        "isPdrFinished": false
      }
    }

    Given the above input to the CheckAgainChoice state, the workflow would transition to the PdrStatusReport state.

    "CheckAgainChoice": {
      "Type": "Choice",
      "Choices": [
        {
          "Variable": "$.meta.isPdrFinished",
          "BooleanEquals": false,
          "Next": "PdrStatusReport"
        },
        {
          "Variable": "$.meta.isPdrFinished",
          "BooleanEquals": true,
          "Next": "WorkflowSucceeded"
        }
      ],
      "Default": "WorkflowSucceeded"
    }

    Advanced: Loops in Cumulus Workflows

    Understanding the complete ParsePdr workflow is not necessary to understanding how Choice states work, but ParsePdr provides an example of how Choice states can be used to create a loop in a Cumulus workflow.

    In the complete ParsePdr workflow definition, the state QueueGranules is followed by CheckStatus. From CheckStatus a loop starts: while CheckStatus returns meta.isPdrFinished: false, the workflow moves from CheckStatus to CheckAgainChoice, then to PdrStatusReport, then to WaitForSomeTime, which returns to CheckStatus. Once CheckStatus returns meta.isPdrFinished: true, CheckAgainChoice proceeds to WorkflowSucceeded.

    Execution graph of SIPS ParsePdr workflow in AWS Step Functions console

    Further documentation

    For complete details on Choice state configuration options, see the Choice state documentation.

    - + \ No newline at end of file diff --git a/docs/v12.0.0/data-cookbooks/cnm-workflow/index.html b/docs/v12.0.0/data-cookbooks/cnm-workflow/index.html index d41f1a23ec1..7d936a2e81e 100644 --- a/docs/v12.0.0/data-cookbooks/cnm-workflow/index.html +++ b/docs/v12.0.0/data-cookbooks/cnm-workflow/index.html @@ -5,7 +5,7 @@ CNM Workflow | Cumulus Documentation - + @@ -13,7 +13,7 @@
    Version: v12.0.0

    CNM Workflow

    This entry documents how to setup a workflow that utilizes the built-in CNM/Kinesis functionality in Cumulus.

    Prior to working through this entry you should be familiar with the Cloud Notification Mechanism.

    Sections


    Prerequisites

    Cumulus

    This entry assumes you have a deployed instance of Cumulus (version >= 1.16.0). The entry assumes you are deploying Cumulus via the cumulus terraform module sourced from the release page.

    AWS CLI

    This entry assumes you have the AWS CLI installed and configured. If you do not, please take a moment to review the documentation - particularly the examples relevant to Kinesis - and install it now.

    Kinesis

    This entry assumes you already have two Kinesis data streams created for use as CNM notification and response data streams.

    If you do not have two streams setup, please take a moment to review the Kinesis documentation and setup two basic single-shard streams for this example:

    Using the "Create Data Stream" button on the Kinesis Dashboard, work through the dialogue.

    You should be able to quickly use the "Create Data Stream" button on the Kinesis Dashboard, and setup streams that are similar to the following example:

    Screenshot of AWS console page for creating a Kinesis stream

    Please bear in mind that your {{prefix}}-lambda-processing IAM role will need permissions to write to the response stream for this workflow to succeed if you create the Kinesis stream with a dashboard user. If you are using the cumulus top-level module for your deployment this should be set properly.

    If not, the most straightforward approach is to attach the AmazonKinesisFullAccess policy for the stream resource to whatever role your Lambdas are using, however your environment/security policies may require an approach specific to your deployment environment.

    In operational environments it's likely science data providers would typically be responsible for providing a Kinesis stream with the appropriate permissions.

    For more information on how this process works and how to develop a process that will add records to a stream, read the Kinesis documentation and the developer guide.
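
    If you prefer the AWS CLI to the console, a minimal sketch for creating the two single-shard streams (the stream names below are placeholders):

    aws kinesis create-stream --stream-name example-cnm-notification-stream --shard-count 1
    aws kinesis create-stream --stream-name example-cnm-response-stream --shard-count 1

    # Wait for both streams to reach ACTIVE before using them
    aws kinesis describe-stream-summary --stream-name example-cnm-notification-stream --query 'StreamDescriptionSummary.StreamStatus'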

    Source Data

    This entry will run the SyncGranule task against a single target data file. To that end it will require a single data file to be present in an S3 bucket matching the Provider configured in the next section.
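
    For example, you could stage a single test file by copying it into a data/ prefix of the bucket your provider will point at; the bucket name and filename below are placeholders:

    aws s3 cp ./MOD09GQ.A2016358.h13v04.006.2016360104606.hdf s3://example-provider-bucket/data/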

    Collection and Provider

    Cumulus will need to be configured with a Collection and Provider entry of your choosing. The provider should match the location of the source data from the Ingest Source Data section.

    This can be done via the Cumulus Dashboard if installed or the API. It is strongly recommended to use the dashboard if possible.


    Configure the Workflow

    Provided the prerequisites have been fulfilled, you can begin adding the needed values to your Cumulus configuration to configure the example workflow.

    The following are steps that are required to set up your Cumulus instance to run the example workflow:

    Example CNM Workflow

    In this example, we're going to trigger a workflow by creating a Kinesis rule and sending a record to a Kinesis stream.

    The following workflow definition should be added to a new .tf workflow resource (e.g. cnm_workflow.tf) in your deployment directory. For the complete CNM workflow example, see examples/cumulus-tf/kinesis_trigger_test_workflow.tf.

    Add the following to the new terraform file in your deployment directory, updating the following:

    • Set the response-endpoint key in the CnmResponse task in the workflow JSON to match the name of the Kinesis response stream you configured in the prerequisites section
    • Update the source key to the workflow module to match the Cumulus release associated with your deployment.
    module "cnm_workflow" {
      source = "https://github.com/nasa/cumulus/releases/download/{version}/terraform-aws-cumulus-workflow.zip"

      prefix          = var.prefix
      name            = "CNMExampleWorkflow"
      workflow_config = module.cumulus.workflow_config
      system_bucket   = var.system_bucket

      state_machine_definition = <<JSON
    {
      "Comment": "CNMExampleWorkflow",
      "StartAt": "TranslateMessage",
      "States": {
        "TranslateMessage": {
          "Type": "Task",
          "Resource": "${aws_lambda_function.cnm_to_cma_task.arn}",
          "Parameters": {
            "cma": {
              "event.$": "$",
              "task_config": {
                "collection": "{$.meta.collection}",
                "cumulus_message": {
                  "outputs": [
                    {
                      "source": "{$.cnm}",
                      "destination": "{$.meta.cnm}"
                    },
                    {
                      "source": "{$}",
                      "destination": "{$.payload}"
                    }
                  ]
                }
              }
            }
          },
          "Catch": [
            {
              "ErrorEquals": [
                "States.ALL"
              ],
              "ResultPath": "$.exception",
              "Next": "CnmResponse"
            }
          ],
          "Next": "SyncGranule"
        },
        "SyncGranule": {
          "Parameters": {
            "cma": {
              "event.$": "$",
              "task_config": {
                "provider": "{$.meta.provider}",
                "buckets": "{$.meta.buckets}",
                "collection": "{$.meta.collection}",
                "downloadBucket": "{$.meta.buckets.private.name}",
                "stack": "{$.meta.stack}",
                "cumulus_message": {
                  "outputs": [
                    {
                      "source": "{$.granules}",
                      "destination": "{$.meta.input_granules}"
                    },
                    {
                      "source": "{$}",
                      "destination": "{$.payload}"
                    }
                  ]
                }
              }
            }
          },
          "Type": "Task",
          "Resource": "${module.cumulus.sync_granule_task.task_arn}",
          "Retry": [
            {
              "ErrorEquals": [
                "States.ALL"
              ],
              "IntervalSeconds": 10,
              "MaxAttempts": 3
            }
          ],
          "Catch": [
            {
              "ErrorEquals": [
                "States.ALL"
              ],
              "ResultPath": "$.exception",
              "Next": "CnmResponse"
            }
          ],
          "Next": "CnmResponse"
        },
        "CnmResponse": {
          "Parameters": {
            "cma": {
              "event.$": "$",
              "task_config": {
                "OriginalCNM": "{$.meta.cnm}",
                "distribution_endpoint": "{$.meta.distribution_endpoint}",
                "response-endpoint": "ADD YOUR RESPONSE STREAM NAME HERE",
                "region": "us-east-1",
                "type": "kinesis",
                "WorkflowException": "{$.exception}",
                "cumulus_message": {
                  "outputs": [
                    {
                      "source": "{$.cnm}",
                      "destination": "{$.meta.cnmResponse}"
                    },
                    {
                      "source": "{$.input.input}",
                      "destination": "{$.payload}"
                    }
                  ]
                }
              }
            }
          },
          "Type": "Task",
          "Resource": "${aws_lambda_function.cnm_response_task.arn}",
          "Retry": [
            {
              "ErrorEquals": [
                "States.ALL"
              ],
              "IntervalSeconds": 5,
              "MaxAttempts": 3
            }
          ],
          "End": true
        }
      }
    }
    JSON
    }

    Again, please make sure to modify the value response-endpoint to match the stream name (not ARN) for your Kinesis response stream.

    Lambda Configuration

To execute this workflow, you must include several Lambda resources in your deployment. To do this, add Lambda (task) definitions for the tasks described below (CNMToCMA and CnmResponse) to your deployment along with the workflow you created above.

    Please note: To utilize these tasks you need to ensure you have a compatible CMA layer. See the deployment instructions for more details on how to deploy a CMA layer.

    Below is a description of each of these tasks:

    CNMToCMA

    CNMToCMA is meant for the beginning of a workflow: it maps CNM granule information to a payload for downstream tasks. For other CNM workflows, you would need to ensure that downstream tasks in your workflow either understand the CNM message or include a translation task like this one.

    You can also manipulate the data sent to downstream tasks using task_config for various states in your workflow resource configuration. Read more about how to configure data on the Workflow Input & Output page.

    CnmResponse

    The CnmResponse Lambda generates a CNM response message and puts it on the response-endpoint Kinesis stream.

    You can read more about the expected schema of a CnmResponse record in the Cloud Notification Mechanism schema repository.

    Additional Tasks

    Lastly, this entry also makes use of the SyncGranule task from the cumulus module.

    Redeploy

    Once the above configuration changes have been made, redeploy your stack.

    Please refer to Update Cumulus resources in the deployment documentation if you are unfamiliar with redeployment.

    Rule Configuration

    Cumulus includes a messageConsumer Lambda function (message-consumer). Cumulus kinesis-type rules create the event source mappings between Kinesis streams and the messageConsumer Lambda. The messageConsumer Lambda consumes records from one or more Kinesis streams, as defined by enabled kinesis-type rules. When new records are pushed to one of these streams, the messageConsumer triggers workflows associated with the enabled kinesis-type rules.

To add a rule via the dashboard (if you'd like to use the API, see the docs here), navigate to the Rules page and click Add a rule, then configure the new rule using the following template (substituting correct values for the parameters denoted by {{ }}):

    {
    "collection": {
    "name": "L2_HR_PIXC",
    "version": "000"
    },
    "name": "L2_HR_PIXC_kinesisRule",
    "provider": "PODAAC_SWOT",
    "rule": {
    "type": "kinesis",
    "value": "arn:aws:kinesis:{{awsRegion}}:{{awsAccountId}}:stream/{{streamName}}"
    },
    "state": "ENABLED",
    "workflow": "CNMExampleWorkflow"
    }

    Please Note:

• The rule's value attribute must match the Amazon Resource Name (ARN) of the Kinesis data stream you preconfigured. You should be able to obtain this ARN from the Kinesis Dashboard entry for the selected stream, or via the CLI command shown below this list.
• The collection and provider should match the collection and provider you set up in the Prerequisites section.
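
If you prefer the AWS CLI to the Kinesis Dashboard, a command along the following lines should return the stream ARN (the stream name below is a placeholder for the notification stream you created in the prerequisites):

# Look up the ARN of the preconfigured notification stream
aws kinesis describe-stream-summary \
--stream-name YOUR_KINESIS_NOTIFICATION_STREAM_NAME_HERE \
--query 'StreamDescriptionSummary.StreamARN' \
--output text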

Once you've clicked 'Submit', a new rule should appear on the dashboard's Rule Overview page.


    Execute the Workflow

    Once Cumulus has been redeployed and a rule has been added, we're ready to trigger the workflow and watch it execute.

    How to Trigger the Workflow

    To trigger matching workflows, you will need to put a record on the Kinesis stream that the message-consumer Lambda will recognize as a matching event. Most importantly, it should include a collection name that matches a valid collection.

    For the purpose of this example, the easiest way to accomplish this is using the AWS CLI.

    Create Record JSON

    Construct a JSON file containing an object that matches the values that have been previously setup. This JSON object should be a valid Cloud Notification Mechanism message.

    Please note: this example is somewhat contrived, as the downstream tasks don't care about most of these fields. A 'real' data ingest workflow would.

    The following values (denoted by ${} in the sample below) should be replaced to match values we've previously configured:

    • TEST_DATA_FILE_NAME: The filename of the test data that is available in the S3 (or other) provider we created earlier.
    • TEST_DATA_URI: The full S3 path to the test data (e.g. s3://bucket-name/path/granule)
    • COLLECTION: The collection name defined in the prerequisites for this product
    {
    "product": {
    "files": [
    {
    "checksumType": "md5",
    "name": "${TEST_DATA_FILE_NAME}",
    "checksum": "bogus_checksum_value",
    "uri": "${TEST_DATA_URI}",
    "type": "data",
    "size": 12345678
    }
    ],
    "name": "${TEST_DATA_FILE_NAME}",
    "dataVersion": "006"
    },
    "identifier ": "testIdentifier123456",
    "collection": "${COLLECTION}",
    "provider": "TestProvider",
    "version": "001",
    "submissionTime": "2017-09-30T03:42:29.791198"
    }

    Add Record to Kinesis Data Stream

    Using the JSON file you created, push it to the Kinesis notification stream:

    aws kinesis put-record --stream-name YOUR_KINESIS_NOTIFICATION_STREAM_NAME_HERE --partition-key 1 --data file:///path/to/file.json

    Please note: The above command uses the stream name, not the ARN.

    The command should return output similar to:

    {
    "ShardId": "shardId-000000000000",
    "SequenceNumber": "42356659532578640215890215117033555573986830588739321858"
    }

    This command will put a record containing the JSON from the --data flag onto the Kinesis data stream. The messageConsumer Lambda will consume the record and construct a valid CMA payload to trigger workflows. For this example, the record will trigger the CNMExampleWorkflow workflow as defined by the rule previously configured.

    You can view the current running executions on the Executions dashboard page which presents a list of all executions, their status (running, failed, or completed), to which workflow the execution belongs, along with other information.

    Verify Workflow Execution

As detailed above, once the record is added to the Kinesis data stream, the messageConsumer Lambda will trigger the CNMExampleWorkflow.

    TranslateMessage

    TranslateMessage (which corresponds to the CNMToCMA Lambda) will take the CNM object payload and add a granules object to the CMA payload that's consistent with other Cumulus ingest tasks, and add a meta.cnm key (as well as the payload) to store the original message.

    For more on the Message Adapter, please see the Message Flow documentation.

    An example of what is happening in the CNMToCMA Lambda is as follows:

    Example Input Payload:

    "payload": {
    "identifier ": "testIdentifier123456",
    "product": {
    "files": [
    {
    "checksumType": "md5",
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "checksum": "bogus_checksum_value",
    "uri": "s3://some_bucket/cumulus-test-data/pdrs/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "type": "data",
    "size": 12345678
    }
    ],
    "name": "TestGranuleUR",
    "dataVersion": "006"
    },
    "version": "123456",
    "collection": "MOD09GQ",
    "provider": "TestProvider",
    "submissionTime": "2017-09-30T03:42:29.791198"
    }

    Example Output Payload:

      "payload": {
    "cnm": {
    "identifier ": "testIdentifier123456",
    "product": {
    "files": [
    {
    "checksumType": "md5",
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "checksum": "bogus_checksum_value",
    "uri": "s3://some-bucket/cumulus-test-data/data/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "type": "data",
    "size": 12345678
    }
    ],
    "name": "TestGranuleUR",
    "dataVersion": "006"
    },
    "version": "123456",
    "collection": "MOD09GQ",
    "provider": "TestProvider",
    "submissionTime": "2017-09-30T03:42:29.791198",
    "receivedTime": "2017-09-30T03:42:31.634552"
    },
    "output": {
    "granules": [
    {
    "granuleId": "TestGranuleUR",
    "files": [
    {
    "path": "some-bucket/data",
    "url_path": "s3://some-bucket/cumulus-test-data/data/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "bucket": "some-bucket",
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "size": 12345678
    }
    ]
    }
    ]
    }
    }

    SyncGranules

    This Lambda will take the files listed in the payload and move them to s3://{deployment-private-bucket}/file-staging/{deployment-name}/{COLLECTION}/{file_name}.

    CnmResponse

Assuming a successful execution of the workflow, this task will recover the meta.cnm key from the CMA output, and add a "SUCCESS" record to the response-endpoint Kinesis stream.

    If a prior step in the workflow has failed, this will add a "FAILURE" record to the stream instead.

    The data written to the response-endpoint should adhere to the Response Message Fields schema.

    Example CNM Success Response:

    {
    "provider": "PODAAC_SWOT",
    "collection": "SWOT_Prod_l2:1",
    "processCompleteTime": "2017-09-30T03:45:29.791198",
    "submissionTime": "2017-09-30T03:42:29.791198",
    "receivedTime": "2017-09-30T03:42:31.634552",
    "identifier": "1234-abcd-efg0-9876",
    "response": {
    "status": "SUCCESS"
    }
    }

    Example CNM Error Response:

    {
    "provider": "PODAAC_SWOT",
    "collection": "SWOT_Prod_l2:1",
    "processCompleteTime": "2017-09-30T03:45:29.791198",
    "submissionTime": "2017-09-30T03:42:29.791198",
    "receivedTime": "2017-09-30T03:42:31.634552",
    "identifier": "1234-abcd-efg0-9876",
    "response": {
    "status": "FAILURE",
    "errorCode": "PROCESSING_ERROR",
    "errorMessage": "File [cumulus-dev-a4d38f59-5e57-590c-a2be-58640db02d91/prod_20170926T11:30:36/production_file.nc] did not match gve checksum value."
    }
    }

    Note the CnmResponse state defined in the .tf workflow definition above configures $.exception to be passed to the CnmResponse Lambda keyed under config.WorkflowException. This is required for the CnmResponse code to deliver a failure response.

    To test the failure scenario, send a record missing the product.name key.
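
One way to do this (a sketch, assuming you have jq available and are reusing the record JSON file from above) is to strip the key and push the modified record:

# Remove product.name from the test record and put it on the notification stream
jq 'del(.product.name)' /path/to/file.json > /tmp/bad-cnm-record.json
aws kinesis put-record --stream-name YOUR_KINESIS_NOTIFICATION_STREAM_NAME_HERE --partition-key 1 --data file:///tmp/bad-cnm-record.json

A "FAILURE" record should then appear on the response stream.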


    Verify results

    Check for successful execution on the dashboard

    Following the successful execution of this workflow, you should expect to see the workflow complete successfully on the dashboard:

    Screenshot of a successful CNM workflow appearing on the executions page of the Cumulus dashboard

    Check the test granule has been delivered to S3 staging

    The test granule identified in the Kinesis record should be moved to the deployment's private staging area.
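
One way to verify this (a sketch; the bucket, deployment name, and collection below are placeholders for your deployment's values, following the destination pattern described in the SyncGranules section above) is to list the staging prefix with the AWS CLI:

# List the staged files for the collection
aws s3 ls "s3://DEPLOYMENT_PRIVATE_BUCKET/file-staging/DEPLOYMENT_NAME/COLLECTION/"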

    Check for Kinesis records

    A SUCCESS notification should be present on the response-endpoint Kinesis stream.

You should be able to validate that the notification and response streams have the expected records with the following steps (the AWS CLI Kinesis Basic Stream Operations documentation is useful to review before proceeding):

    Get a shard iterator (substituting your stream name as appropriate):

    aws kinesis get-shard-iterator \
    --shard-id shardId-000000000000 \
    --shard-iterator-type LATEST \
    --stream-name NOTIFICATION_OR_RESPONSE_STREAM_NAME

which should return output similar to:

    {
    "ShardIterator": "VeryLongString=="
    }
• Re-trigger the workflow by using the put-record command from above
    • As the workflow completes, use the output from the get-shard-iterator command to request data from the stream:
    aws kinesis get-records --shard-iterator SHARD_ITERATOR_VALUE

    This should result in output similar to:

    {
    "Records": [
    {
    "SequenceNumber": "49586720336541656798369548102057798835250389930873978882",
    "ApproximateArrivalTimestamp": 1532664689.128,
    "Data": "eyJpZGVudGlmaWVyICI6InRlc3RJZGVudGlmaWVyMTIzNDU2IiwidmVyc2lvbiI6IjAwNiIsImNvbGxlY3Rpb24iOiJNT0QwOUdRIiwicHJvdmlkZXIiOiJUZXN0UHJvdmlkZXIiLCJwcm9kdWN0U2l6ZSI6MTkwODYzNS4wLCJyZXNwb25zZSI6eyJzdGF0dXMiOiJTVUNDRVNTIn0sInByb2Nlc3NDb21wbGV0ZVRpbWUiOiIyMDE4LTA3LTI3VDA0OjExOjI4LjkxOSJ9",
    "PartitionKey": "1"
    },
    {
    "SequenceNumber": "49586720336541656798369548102059007761070005796999266306",
    "ApproximateArrivalTimestamp": 1532664707.149,
    "Data": "eyJpZGVudGlmaWVyICI6InRlc3RJZGVudGlmaWVyMTIzNDU2IiwidmVyc2lvbiI6IjAwNiIsImNvbGxlY3Rpb24iOiJNT0QwOUdRIiwicHJvdmlkZXIiOiJUZXN0UHJvdmlkZXIiLCJwcm9kdWN0U2l6ZSI6MTkwODYzNS4wLCJyZXNwb25zZSI6eyJzdGF0dXMiOiJTVUNDRVNTIn0sInByb2Nlc3NDb21wbGV0ZVRpbWUiOiIyMDE4LTA3LTI3VDA0OjExOjQ2Ljk1OCJ9",
    "PartitionKey": "1"
    }
    ],
    "NextShardIterator": "AAAAAAAAAAFo9SkF8RzVYIEmIsTN+1PYuyRRdlj4Gmy3dBzsLEBxLo4OU+2Xj1AFYr8DVBodtAiXbs3KD7tGkOFsilD9R5tA+5w9SkGJZ+DRRXWWCywh+yDPVE0KtzeI0andAXDh9yTvs7fLfHH6R4MN9Gutb82k3lD8ugFUCeBVo0xwJULVqFZEFh3KXWruo6KOG79cz2EF7vFApx+skanQPveIMz/80V72KQvb6XNmg6WBhdjqAA==",
    "MillisBehindLatest": 0
    }

Note the data encoding is not human readable and would need to be parsed/converted to be interpretable. There are many options for building a Kinesis consumer, such as the Kinesis Client Library (KCL).
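
For a quick spot check without building a consumer, you can decode a single record's Data field with the AWS CLI and base64 (a sketch; on some platforms the decode flag is -D rather than --decode):

# Extract the first record's Data field and decode it
aws kinesis get-records --shard-iterator SHARD_ITERATOR_VALUE --query 'Records[0].Data' --output text | base64 --decode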

For purposes of validating the workflow, it may be simpler to locate the workflow in the Step Functions Management Console and assert that the expected output is similar to the examples below.

    Successful CNM Response Object Example:

    {
    "cnmResponse": {
    "provider": "TestProvider",
    "collection": "MOD09GQ",
    "version": "123456",
    "processCompleteTime": "2017-09-30T03:45:29.791198",
    "submissionTime": "2017-09-30T03:42:29.791198",
    "receivedTime": "2017-09-30T03:42:31.634552",
    "identifier ": "testIdentifier123456",
    "response": {
    "status": "SUCCESS"
    }
    }
    }

    Kinesis Record Error Handling

    messageConsumer

    The default Kinesis stream processing in the Cumulus system is configured for record error tolerance.

    When the messageConsumer fails to process a record, the failure is captured and the record is published to the kinesisFallback SNS Topic. The kinesisFallback SNS topic broadcasts the record and a subscribed copy of the messageConsumer Lambda named kinesisFallback consumes these failures.

At this point, the normal Lambda asynchronous invocation retry behavior will attempt to process the record 3 more times. After this, if the record cannot successfully be processed, it is written to a dead letter queue. Cumulus' dead letter queue is an SQS Queue named kinesisFailure. Operators can use this queue to inspect failed records.

This system ensures that when the messageConsumer fails to process a record and trigger a workflow, the record is retried 3 times. This retry behavior improves system reliability in case of any external service failure outside of Cumulus control.

The Kinesis error handling system - the kinesisFallback SNS topic, messageConsumer Lambda, and kinesisFailure SQS queue - comes with the API package and does not need to be configured by the operator.

To examine records that could not be processed at any step, inspect the dead letter queue {{prefix}}-kinesisFailure in the Simple Queue Service (SQS) console. Select your queue and, under the Queue Actions menu, choose View/Delete Messages. Start polling for messages and you will see records that failed to process through the messageConsumer.
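
You can also inspect the queue from the AWS CLI (a sketch; substitute your deployment's prefix in the queue name):

# Look up the queue URL, then pull up to 10 failed records
QUEUE_URL=$(aws sqs get-queue-url --queue-name <prefix>-kinesisFailure --query 'QueueUrl' --output text)
aws sqs receive-message --queue-url "$QUEUE_URL" --max-number-of-messages 10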

Note: these are only failures that occurred while processing records from Kinesis streams. Workflow failures are handled differently.

    Kinesis Stream logging

    Notification Stream messages

    Cumulus includes two Lambdas (KinesisInboundEventLogger and KinesisOutboundEventLogger) that utilize the same code to take a Kinesis record event as input, deserialize the data field and output the modified event to the logs.

    When a kinesis rule is created, in addition to the messageConsumer event mapping, an event mapping is created to trigger KinesisInboundEventLogger to record a log of the inbound record, to allow for analysis in case of unexpected failure.

    Response Stream messages

    Cumulus also supports this feature for all outbound messages. To take advantage of this feature, you will need to set an event mapping on the KinesisOutboundEventLogger Lambda that targets your response-endpoint. You can do this in the Lambda management page for KinesisOutboundEventLogger. Add a Kinesis trigger, and configure it to target the cnmResponseStream for your workflow:

    Screenshot of the AWS console showing configuration for Kinesis stream trigger on KinesisOutboundEventLogger Lambda

    Once this is done, all records sent to the response-endpoint will also be logged in CloudWatch. For more on configuring Lambdas to trigger on Kinesis events, please see creating an event source mapping.

Error Handling in Workflows

See this documentation on configuring your workflow to handle transient Lambda errors (such as a Lambda Service Exception).

    Example state machine definition:

    {
    "Comment": "Tests Workflow from Kinesis Stream",
    "StartAt": "TranslateMessage",
    "States": {
    "TranslateMessage": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.cnm}",
    "destination": "{$.meta.cnm}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_to_cma_task.arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": ["States.ALL"],
    "ResultPath": "$.exception",
    "Next": "CnmResponseFail"
    }
    ],
    "Next": "SyncGranule"
    },
    "SyncGranule": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "Path": "$.payload",
    "TargetPath": "$.payload"
    },
    "task_config": {
    "provider": "{$.meta.provider}",
    "buckets": "{$.meta.buckets}",
    "collection": "{$.meta.collection}",
    "downloadBucket": "{$.meta.buckets.private.name}",
    "stack": "{$.meta.stack}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.granules}",
    "destination": "{$.meta.input_granules}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${module.cumulus.sync_granule_task.task_arn}",
    "Retry": [
    {
    "ErrorEquals": ["States.ALL"],
    "IntervalSeconds": 10,
    "MaxAttempts": 3
    }
    ],
    "Catch": [
    {
    "ErrorEquals": ["States.ALL"],
    "ResultPath": "$.exception",
    "Next": "CnmResponseFail"
    }
    ],
    "Next": "CnmResponse"
    },
    "CnmResponse": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "OriginalCNM": "{$.meta.cnm}",
    "CNMResponseStream": "{$.meta.cnmResponseStream}",
    "region": "us-east-1",
    "WorkflowException": "{$.exception}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$}",
    "destination": "{$.meta.cnmResponse}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_response_task.arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": ["States.ALL"],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "Next": "WorkflowSucceeded"
    },
    "CnmResponseFail": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "OriginalCNM": "{$.meta.cnm}",
    "CNMResponseStream": "{$.meta.cnmResponseStream}",
    "region": "us-east-1",
    "WorkflowException": "{$.exception}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$}",
    "destination": "{$.meta.cnmResponse}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_response_task.arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": ["States.ALL"],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "Next": "WorkflowFailed"
    },
    "WorkflowSucceeded": {
    "Type": "Succeed"
    },
    "WorkflowFailed": {
    "Type": "Fail",
    "Cause": "Workflow failed"
    }
    }
    }

    The above results in a workflow which is visualized in the diagram below:

    Screenshot of a visualization of an AWS Step Function workflow definition with branching logic for failures

    Summary

    Error handling should (mostly) be the domain of workflow configuration.

    Version: v12.0.0

    HelloWorld Workflow

    Example task meant to be a sanity check/introduction to the Cumulus workflows.

    Pre-Deployment Configuration

    Workflow Configuration

    A workflow definition can be found in the template repository hello_world_workflow module.

    {
    "Comment": "Returns Hello World",
    "StartAt": "HelloWorld",
    "States": {
    "HelloWorld": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "buckets": "{$.meta.buckets}",
    "provider": "{$.meta.provider}",
    "collection": "{$.meta.collection}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${module.cumulus.hello_world_task.task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "End": true
    }
    }
    }

    Workflow error-handling can be configured as discussed in the Error-Handling cookbook.

    Task Configuration

The HelloWorld task is provided for you as part of the cumulus terraform module; no configuration is needed.

    If you want to manually deploy your own version of this Lambda for testing, you can copy the Lambda resource definition located in the Cumulus source code at cumulus/tf-modules/ingest/hello-world-task.tf. The Lambda source code is located in the Cumulus source code at 'cumulus/tasks/hello-world'.

    Execution

    We will focus on using the Cumulus dashboard to schedule the execution of a HelloWorld workflow.

    Our goal here is to create a rule through the Cumulus dashboard that will define the scheduling and execution of our HelloWorld workflow. Let's navigate to the Rules page and click Add a rule.

    {
    "collection": { # collection values can be configured and found on the Collections page
    "name": "${collection_name}",
    "version": "${collection_version}"
    },
    "name": "helloworld_rule",
    "provider": "${provider}", # found on the Providers page
    "rule": {
    "type": "onetime"
    },
    "state": "ENABLED",
    "workflow": "HelloWorldWorkflow" # This can be found on the Workflows page
    }

Screenshot of AWS Step Function execution graph for the HelloWorld workflow (executed workflow as seen in the AWS Console)

    Output/Results

    The Executions page presents a list of all executions, their status (running, failed, or completed), to which workflow the execution belongs, along with other information. The rule defined in the previous section should start an execution of its own accord, and the status of that execution can be tracked here.

    To get some deeper information on the execution, click on the value in the Name column of your execution of interest. This should bring up a visual representation of the workflow similar to that shown above, execution details, and a list of events.

    Summary

    Setting up the HelloWorld workflow on the Cumulus dashboard is the tip of the iceberg, so to speak. The task and step-function need to be configured before Cumulus deployment. A compatible collection and provider must be configured and applied to the rule. Finally, workflow execution status can be viewed via the workflows tab on the dashboard.

    Version: v12.0.0

    Ingest Notification in Workflows

On deployment, an SQS queue and three SNS topics, one each for executions, granules, and PDRs, are created and used for handling notification messages related to the workflow.

    The ingest notification reporting SQS queue is populated via a Cloudwatch rule for any Step Function execution state transitions. The sfEventSqsToDbRecords Lambda consumes this queue. The queue and Lambda are included in the cumulus module and the Cloudwatch rule in the workflow module and are included by default in a Cumulus deployment.

    The sfEventSqsToDbRecords Lambda function reads from the sfEventSqsToDbRecordsInputQueue queue and updates the RDS database records for granules, executions, and PDRs. When the records are updated, messages are posted to the three SNS topics. This Lambda is invoked both when the workflow starts and when it reaches a terminal state (completion or failure).

    Diagram of architecture for reporting workflow ingest notifications from AWS Step Functions

    Sending SQS messages to report status

    Publishing granule/PDR reports directly to the SQS queue

If you have a non-Cumulus workflow or process ingesting data and would like to update the status of your granules or PDRs, you can publish directly to the reporting SQS queue. Publishing messages to this queue will result in those messages being stored as granule/PDR records in the Cumulus database and the status of those granules/PDRs being visible on the Cumulus dashboard. The queue does have certain expectations of the message format: it expects a Cumulus Message nested within a CloudWatch Step Function Event object.

Posting directly to the queue will require knowing the queue URL. Assuming that you are using the cumulus module for your deployment, you can get the queue URL and topic ARNs by adding them to outputs.tf for your Terraform deployment, as in our example deployment:

    output "stepfunction_event_reporter_queue_url" {
    value = module.cumulus.stepfunction_event_reporter_queue_url
    }

    output "report_executions_sns_topic_arn" {
    value = module.cumulus.report_executions_sns_topic_arn
    }
    output "report_granules_sns_topic_arn" {
value = module.cumulus.report_granules_sns_topic_arn
    }
    output "report_pdrs_sns_topic_arn" {
    value = module.cumulus.report_pdrs_sns_topic_arn
    }

Then, when you run terraform apply, you should see the queue URL and topic ARNs printed to your console:

    Outputs:
    ...
    stepfunction_event_reporter_queue_url = https://sqs.us-east-1.amazonaws.com/xxxxxxxxx/<prefix>-sfEventSqsToDbRecordsInputQueue
    report_executions_sns_topic_arn = arn:aws:sns:us-east-1:xxxxxxxxx:<prefix>-report-executions-topic
report_granules_sns_topic_arn = arn:aws:sns:us-east-1:xxxxxxxxx:<prefix>-report-granules-topic
    report_pdrs_sns_topic_arn = arn:aws:sns:us-east-1:xxxxxxxxx:<prefix>-report-pdrs-topic

Once you have the queue URL, you can use the AWS SDK for your language of choice to send messages to the queue. The expected format of these messages is that of a CloudWatch Step Function event containing a Cumulus message. For SUCCEEDED events, the Cumulus message is expected to be in detail.output. For all other event statuses, a Cumulus Message is expected in detail.input. The Cumulus Message populating these fields MUST be a JSON string, not an object. Messages that do not conform to the schemas will fail to be created as records.
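
As a minimal sketch of posting such a message with the AWS CLI (report.json is a hypothetical file containing a CloudWatch Step Function event whose detail.input or detail.output holds the stringified Cumulus Message; terraform output -raw requires Terraform 0.14+ and assumes the output defined above):

# Send a hand-built reporting message to the sfEventSqsToDbRecords input queue
aws sqs send-message \
--queue-url "$(terraform output -raw stepfunction_event_reporter_queue_url)" \
--message-body file://report.json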

    If you are not seeing records persist to the database or show up in the Cumulus dashboard, you can investigate the Cloudwatch logs of the SQS consumer Lambda:

    • /aws/lambda/<prefix>-sfEventSqsToDbRecords

    In a workflow

    As described above, ingest notifications will automatically be published to the SNS topics on workflow start and completion/failure, so you should not include a workflow step to publish the initial or final status of your workflows.

    However, if you want to report your ingest status at any point during a workflow execution, you can add a workflow step using the SfSqsReport Lambda. In the following example from cumulus-tf/parse_pdr_workflow.tf, the ParsePdr workflow is configured to use the SfSqsReport Lambda, primarily to update the PDR ingestion status.

    Note: ${sf_sqs_report_task_arn} is an interpolated value referring to a Terraform resource. See the example deployment code for the ParsePdr workflow.

      "PdrStatusReport": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "cumulus_message": {
    "input": "{$}"
    }
    }
    }
    },
    "ResultPath": null,
    "Type": "Task",
    "Resource": "${sf_sqs_report_task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "Next": "WaitForSomeTime"
    },

    Subscribing additional listeners to SNS topics

    Additional listeners to SNS topics can be configured in a .tf file for your Cumulus deployment. Shown below is configuration that subscribes an additional Lambda function (test_lambda) to receive messages from the report_executions SNS topic. To subscribe to the report_granules or report_pdrs SNS topics instead, simply replace report_executions in the code block below with either of those values.

    resource "aws_lambda_function" "test_lambda" {
    function_name = "${var.prefix}-testLambda"
    filename = "./testLambda.zip"
    source_code_hash = filebase64sha256("./testLambda.zip")
    handler = "index.handler"
    role = module.cumulus.lambda_processing_role_arn
    runtime = "nodejs10.x"
    }

    resource "aws_sns_topic_subscription" "test_lambda" {
    topic_arn = module.cumulus.report_executions_sns_topic_arn
    protocol = "lambda"
    endpoint = aws_lambda_function.test_lambda.arn
    }

    resource "aws_lambda_permission" "test_lambda" {
    action = "lambda:InvokeFunction"
    function_name = aws_lambda_function.test_lambda.arn
    principal = "sns.amazonaws.com"
    source_arn = module.cumulus.report_executions_sns_topic_arn
    }

    SNS message format

    Subscribers to the SNS topics can expect to find the published message in the SNS event at Records[0].Sns.Message. The message will be a JSON stringified version of the ingest notification record for an execution or a PDR. For granules, the message will be a JSON stringified object with ingest notification record in the record property and the event type as the event property.

    The ingest notification record of the execution, granule, or PDR should conform to the data model schema for the given record type.
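
If you need to inspect one of these notifications outside of a subscribed Lambda (a sketch, assuming the raw SNS event has been saved to event.json and jq is available), you can extract and pretty-print the message:

# Pull the stringified record out of the SNS envelope and re-parse it
jq -r '.Records[0].Sns.Message' event.json | jq '.'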

    Summary

    Workflows can be configured to send SQS messages at any point using the sf-sqs-report task.

    Additional listeners can be easily configured to trigger when messages are sent to the SNS topics.

    Version: v12.0.0

    Queue PostToCmr

In this document, we walk through handling CMR errors in workflows by queueing PostToCmr. We assume that the user already has an ingest workflow set up.

    Overview

    The general concept is that the last task of the ingest workflow will be QueueWorkflow, which queues the publish workflow. The publish workflow contains the PostToCmr task and if a CMR error occurs during PostToCmr, the publish workflow will add itself back onto the queue so that it can be executed when CMR is back online. This is achieved by leveraging the QueueWorkflow task again in the publish workflow. The following diagram demonstrates this queueing process.

    Diagram of workflow queueing

    Ingest Workflow

    The last step should be the QueuePublishWorkflow step. It should be configured with a queueUrl and workflow. In this case, the queueUrl is a throttled queue. Any queueUrl can be specified here which is useful if you would like to use a lower priority queue. The workflow is the unprefixed workflow name that you would like to queue (e.g. PublishWorkflow).

      "QueuePublishWorkflowStep": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "workflow": "{$.meta.workflow}",
    "queueUrl": "${start_sf_queue_url}",
    "provider": "{$.meta.provider}",
    "collection": "{$.meta.collection}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${queue_workflow_task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "End": true
    },

    Publish Workflow

Configure the Catch section of your PostToCmr task to proceed to the RequeueWorkflow step (which uses the QueueWorkflow task) if a CMRInternalError is caught. Any other error will cause the workflow to fail.

      "Catch": [
    {
    "ErrorEquals": [
    "CMRInternalError"
    ],
    "Next": "RequeueWorkflow"
    },
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "Next": "WorkflowFailed",
    "ResultPath": "$.exception"
    }
    ],

    Then, configure the QueueWorkflow task similarly to its configuration in the ingest workflow. This time, pass the current publish workflow to the task config. This allows for the publish workflow to be requeued when there is a CMR error.

    {
    "RequeueWorkflow": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "buckets": "{$.meta.buckets}",
    "distribution_endpoint": "{$.meta.distribution_endpoint}",
    "workflow": "PublishGranuleQueue",
    "queueUrl": "${start_sf_queue_url}",
    "provider": "{$.meta.provider}",
    "collection": "{$.meta.collection}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${queue_workflow_task_arn}",
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "Next": "WorkflowFailed",
    "ResultPath": "$.exception"
    }
    ],
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "End": true
    }
    }
    Version: v12.0.0

    Run Step Function Tasks in AWS Lambda or Docker

    Overview

    AWS Step Function Tasks can run tasks on AWS Lambda or on AWS Elastic Container Service (ECS) as a Docker container.

Lambda provides a serverless architecture and is the best option for minimizing cost and server management. ECS provides the fullest extent of AWS EC2 resources via the flexibility to execute arbitrary code on any AWS EC2 instance type.

    When to use Lambda

    You should use AWS Lambda whenever all of the following are true:

• The task runs on one of the supported Lambda Runtimes. At the time of this writing, supported runtimes include versions of Python, Java, Ruby, Node.js, Go, and .NET.
    • The lambda package is less than 50 MB in size, zipped.
    • The task consumes less than each of the following resources:
      • 3008 MB memory allocation
      • 512 MB disk storage (must be written to /tmp)
      • 15 minutes of execution time

    See this page for a complete and up-to-date list of AWS Lambda limits.

    If your task requires more than any of these resources or an unsupported runtime, creating a Docker image which can be run on ECS is the way to go. Cumulus supports running any lambda package (and its configured layers) as a Docker container with cumulus-ecs-task.

    Step Function Activities and cumulus-ecs-task

    Step Function Activities enable a state machine task to "publish" an activity task which can be picked up by any activity worker. Activity workers can run pretty much anywhere, but Cumulus workflows support the cumulus-ecs-task activity worker. The cumulus-ecs-task worker runs as a Docker container on the Cumulus ECS cluster.

    The cumulus-ecs-task container takes an AWS Lambda Amazon Resource Name (ARN) as an argument (see --lambdaArn in the example below). This ARN argument is defined at deployment time. The cumulus-ecs-task worker polls for new Step Function Activity Tasks. When a Step Function executes, the worker (container) picks up the activity task and runs the code contained in the lambda package defined on deployment.

    Example: Replacing AWS Lambda with a Docker container run on ECS

    This example will use an already-defined workflow from the cumulus module that includes the QueueGranules task in its configuration.

    The following example is an excerpt from the Discover Granules workflow containing the step definition for the QueueGranules step:

    Note: ${ingest_granule_workflow_name} and ${queue_granules_task_arn} are interpolated values that refer to Terraform resources. See the example deployment code for the Discover Granules workflow.

      "QueueGranules": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "provider": "{$.meta.provider}",
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "granuleIngestWorkflow": "${ingest_granule_workflow_name}",
    "queueUrl": "{$.meta.queues.startSF}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${queue_granules_task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "End": true
    },

If it has been discovered that this task can no longer run in AWS Lambda, you can instead run it on the Cumulus ECS cluster by adding the following resources to your Terraform deployment (either by adding a new .tf file or updating an existing one):

• An aws_sfn_activity resource:
    resource "aws_sfn_activity" "queue_granules" {
    name = "${var.prefix}-QueueGranules"
    }
• An instance of the cumulus_ecs_service module (found on the Cumulus releases page), configured to provide the QueueGranules task:

    module "queue_granules_service" {
    source = "https://github.com/nasa/cumulus/releases/download/{version}/terraform-aws-cumulus-ecs-service.zip"

    prefix = var.prefix
    name = "QueueGranules"

    cluster_arn = module.cumulus.ecs_cluster_arn
    desired_count = 1
    image = "cumuluss/cumulus-ecs-task:1.7.0"

    cpu = 400
    memory_reservation = 700

    environment = {
    AWS_DEFAULT_REGION = data.aws_region.current.name
    }
    command = [
    "cumulus-ecs-task",
    "--activityArn",
    aws_sfn_activity.queue_granules.id,
    "--lambdaArn",
    module.cumulus.queue_granules_task.task_arn,
    "--lastModified",
    module.cumulus.queue_granules_task.last_modified_date
    ]
    alarms = {
    MemoryUtilizationHigh = {
    comparison_operator = "GreaterThanThreshold"
    evaluation_periods = 1
    metric_name = "MemoryUtilization"
    statistic = "SampleCount"
    threshold = 75
    }
    }
    }

    Please note: If you have updated the code for the Lambda specified by --lambdaArn, you will have to manually restart the tasks in your ECS service before invocation of the Step Function activity will use the updated Lambda code.

• An updated Discover Granules workflow to utilize the new resource (the Resource key in the QueueGranules step has been updated to:

"Resource": "${aws_sfn_activity.queue_granules.id}")

If you then run this workflow in place of the DiscoverGranules workflow, the QueueGranules step would run as an ECS task instead of a Lambda function.
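
If you later update the code for the Lambda referenced by --lambdaArn, one way to restart the ECS tasks so they pick up the new code (a sketch; the cluster and service names are placeholders you would look up for your deployment) is:

# Find the service name on your Cumulus ECS cluster, then force a new deployment
aws ecs list-services --cluster YOUR_CUMULUS_ECS_CLUSTER
aws ecs update-service --cluster YOUR_CUMULUS_ECS_CLUSTER --service YOUR_QUEUE_GRANULES_SERVICE --force-new-deployment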

    Final note

    Step Function Activities and AWS Lambda are not the only ways to run tasks in an AWS Step Function. Learn more about other service integrations, including direct ECS integration via the AWS Service Integrations page.

Science Investigator-led Processing Systems (SIPS)

For this example, we're just going to create a onetime throw-away rule that will be easy to test with. This rule will kick off the DiscoverAndQueuePdrs workflow, which is the beginning of a Cumulus SIPS workflow:

    Screenshot of a Cumulus rule configuration

    Note: A list of configured workflows exists under the "Workflows" in the navigation bar on the Cumulus dashboard. Additionally, one can find a list of executions and their respective status in the "Executions" tab in the navigation bar.

    DiscoverAndQueuePdrs Workflow

    This workflow will discover PDRs and queue them to be processed. Duplicate PDRs will be dealt with according to the configured duplicate handling setting in the collection. The lambdas below are included in the cumulus terraform module for use in your workflows:

    1. DiscoverPdrs - source
    2. QueuePdrs - source

    Screenshot of execution graph for discover and queue PDRs workflow in the AWS Step Functions console

    An example workflow module configuration can be viewed in the Cumulus source for the discover_and_queue_pdrs_workflow.

    Please note: To use this example workflow module as a template for a new workflow in your deployment the source key for the workflow module would need to point to a release of the cumulus-workflow (terraform-aws-cumulus-workflow.zip) module on our release page, as all of the provided Cumulus workflows are internally self-referential.

    ParsePdr Workflow

    The ParsePdr workflow will parse a PDR, queue the specified granules (duplicates are handled according to the duplicate handling setting) and periodically check the status of those queued granules. This workflow will not succeed until all the granules included in the PDR are successfully ingested. If one of those fails, the ParsePdr workflow will fail. NOTE that ParsePdr may spin up multiple IngestGranule workflows in parallel, depending on the granules included in the PDR.

    The lambdas below are included in the cumulus terraform module for use in your workflows:

    1. ParsePdr - source
    2. QueueGranules - source
    3. CheckStatus - source

    Screenshot of execution graph for SIPS Parse PDR workflow in AWS Step Functions console

    An example workflow module configuration can be viewed in the Cumulus source for the parse_pdr_workflow.

    Please note: To use this example workflow module as a template for a new workflow in your deployment the source key for the workflow module would need to point to a release of the cumulus-workflow (terraform-aws-cumulus-workflow.zip) module on our release page, as all of the provided Cumulus workflows are internally self-referential.

    IngestGranule Workflow

    The IngestGranule workflow processes and ingests a granule and posts the granule metadata to CMR.

    The lambdas below are included in the cumulus terraform module for use in your workflows:

    1. SyncGranule - source.
    2. CmrStep - source

Additionally, this workflow requires a processing step that you must provide. The ProcessingStep step in the workflow picture below is an example of a custom processing step.

    Note: Using the CmrStep is not required and can be left out of the processing trajectory if desired (for example, in testing situations).

    Screenshot of execution graph for SIPS IngestGranule workflow in AWS Step Functions console

    An example workflow module configuration can be viewed in the Cumulus source for the ingest_and_publish_granule_workflow.

    Please note: To use this example workflow module as a template for a new workflow in your deployment the source key for the workflow module would need to point to a release of the cumulus-workflow (terraform-aws-cumulus-workflow.zip) module on our release page, as all of the provided Cumulus workflows are internally self-referential.

    Summary

    In this cookbook we went over setting up a collection, rule, and provider for a SIPS workflow. Once we had the setup completed, we looked over the Cumulus workflows that participate in parsing PDRs, ingesting and processing granules, and updating CMR.

    Version: v12.0.0

    Throttling queued executions

In this entry, we will walk through how to create an SQS queue for scheduling executions, which will be used to limit those executions to a maximum concurrency, and we will see how to configure our Cumulus workflows/rules to use this queue.

    We will also review the architecture of this feature and highlight some implementation notes.

    Limiting the number of executions that can be running from a given queue is useful for controlling the cloud resource usage of workflows that may be lower priority, such as granule reingestion or reprocessing campaigns. It could also be useful for preventing workflows from exceeding known resource limits, such as a maximum number of open connections to a data provider.

    Implementing the queue

    Create and deploy the queue

    Add a new queue

    In a .tf file for your Cumulus deployment, add a new SQS queue:

    resource "aws_sqs_queue" "background_job_queue" {
    name = "${var.prefix}-backgroundJobQueue"
    receive_wait_time_seconds = 20
    visibility_timeout_seconds = 60
    }

    Set maximum executions for the queue

    Define the throttled_queues variable for the cumulus module in your Cumulus deployment to specify the maximum concurrent executions for the queue.

    module "cumulus" {
    # ... other variables

    throttled_queues = [{
    url = aws_sqs_queue.background_job_queue.id,
    execution_limit = 5
    }]
    }

    Setup consumer for the queue

    Add the sqs2sfThrottle Lambda as the consumer for the queue and add a Cloudwatch event rule/target to read from the queue on a scheduled basis.

    Please note: You must use the sqs2sfThrottle Lambda as the consumer for any queue with a queue execution limit or else the execution throttling will not work correctly. Additionally, please allow at least 60 seconds after creation before using the queue while associated infrastructure and triggers are set up and made ready.

    aws_sqs_queue.background_job_queue.id refers to the queue resource defined above.

    resource "aws_cloudwatch_event_rule" "background_job_queue_watcher" {
    schedule_expression = "rate(1 minute)"
    }

    resource "aws_cloudwatch_event_target" "background_job_queue_watcher" {
    rule = aws_cloudwatch_event_rule.background_job_queue_watcher.name
    arn = module.cumulus.sqs2sfThrottle_lambda_function_arn
    input = jsonencode({
    messageLimit = 500
    queueUrl = aws_sqs_queue.background_job_queue.id
    timeLimit = 60
    })
    }

    resource "aws_lambda_permission" "background_job_queue_watcher" {
    action = "lambda:InvokeFunction"
    function_name = module.cumulus.sqs2sfThrottle_lambda_function_arn
    principal = "events.amazonaws.com"
    source_arn = aws_cloudwatch_event_rule.background_job_queue_watcher.arn
    }

    Re-deploy your Cumulus application

Follow the instructions to re-deploy your Cumulus application. After you have re-deployed, your workflow template will be updated to include information about the queue (the output below is partial output from an expected workflow template):

    {
    "cumulus_meta": {
    "queueExecutionLimits": {
    "<backgroundJobQueue_SQS_URL>": 5
    }
    }
    }

    Integrate your queue with workflows and/or rules

    Integrate queue with queuing steps in workflows

    For any workflows using QueueGranules or QueuePdrs that you want to use your new queue, update the Cumulus configuration of those steps in your workflows.

    As seen in this partial configuration for a QueueGranules step, update the queueUrl to reference the new throttled queue:

    Note: ${ingest_granule_workflow_name} is an interpolated value referring to a Terraform resource. See the example deployment code for the DiscoverGranules workflow.

    {
    "QueueGranules": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "queueUrl": "${aws_sqs_queue.background_job_queue.id}",
    "provider": "{$.meta.provider}",
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "granuleIngestWorkflow": "${ingest_granule_workflow_name}"
    }
    }
    }
    }
    }

    Similarly, for a QueuePdrs step:

    Note: ${parse_pdr_workflow_name} is an interpolated value referring to a Terraform resource. See the example deployment code for the DiscoverPdrs workflow.

    {
    "QueuePdrs": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "queueUrl": "${aws_sqs_queue.background_job_queue.id}",
    "provider": "{$.meta.provider}",
    "collection": "{$.meta.collection}",
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "parsePdrWorkflow": "${parse_pdr_workflow_name}"
    }
    }
    }
    }
    }

    After making these changes, re-deploy your Cumulus application for the execution throttling to take effect on workflow executions queued by these workflows.

    Create/update a rule to use your new queue

    Create or update a rule definition to include a queueUrl property that refers to your new queue:

    {
    "name": "s3_provider_rule",
    "workflow": "DiscoverAndQueuePdrs",
    "provider": "s3_provider",
    "collection": {
    "name": "MOD09GQ",
    "version": "006"
    },
    "rule": {
    "type": "onetime"
    },
    "state": "ENABLED",
    "queueUrl": "<backgroundJobQueue_SQS_URL>" // configure rule to use your queue URL
    }

    After creating/updating the rule, any subsequent invocations of the rule should respect the maximum number of executions when starting workflows from the queue.

    Architecture

    Architecture diagram showing how executions started from a queue are throttled to a maximum concurrent limit

    Execution throttling based on the queue works by manually keeping a count (semaphore) of how many executions are running for the queue at a time. The key operation that prevents the number of executions from exceeding the maximum for the queue is that before starting new executions, the sqs2sfThrottle Lambda attempts to increment the semaphore and responds as follows:

    • If the increment operation is successful, then the count was not at the maximum and an execution is started
    • If the increment operation fails, then the count was already at the maximum so no execution is started

    Final notes

    Limiting the number of concurrent executions for work scheduled via a queue has several consequences worth noting:

    • The number of executions that are running for a given queue will be limited to the maximum for that queue regardless of which workflow(s) are started.
    • If you use the same queue to schedule executions across multiple workflows/rules, then the limit on the total number of executions running concurrently will be applied to all of the executions scheduled across all of those workflows/rules.
    • If you are scheduling the same workflow both via a queue with a maxExecutions value and a queue without a maxExecutions value, only the executions scheduled via the queue with the maxExecutions value will be limited to the maximum.
Tracking Ancillary Files

The UMM-G column reflects the RelatedURL's Type derived from the CNM type, whereas the ECHO10 column shows how the CNM type affects the destination element.

CNM Type    UMM-G RelatedUrl.Type                                              ECHO10 Location
ancillary   'VIEW RELATED INFORMATION'                                         OnlineResource
data        'GET DATA' (HTTPS URL) or 'GET DATA VIA DIRECT ACCESS' (S3 URI)    OnlineAccessURL
browse      'GET RELATED VISUALIZATION'                                        AssociatedBrowseImage
linkage     'EXTENDED METADATA'                                                OnlineResource
metadata    'EXTENDED METADATA'                                                OnlineResource
qa          'EXTENDED METADATA'                                                OnlineResource

    Common Use Cases

    This section briefly documents some common use cases and the recommended configuration for the file. The examples shown here are for the DiscoverGranules use case, which allows configuration at the Cumulus dashboard level. The other two cases covered in the ancillary metadata documentation require configuration at the provider notification level (either CNM message or PDR) and are not covered here.

    Configuring browse imagery:

    {
    "bucket": "public",
    "regex": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\_[\\d]{1}.jpg$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_1.jpg",
    "type": "browse"
    }

    Configuring a documentation entry:

    {
    "bucket": "protected",
    "regex": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\_README.pdf$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_README.pdf",
    "type": "metadata"
    }

    Configuring other associated files (use types metadata or qa as appropriate):

    {
    "bucket": "protected",
    "regex": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\_QA.txt$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_QA.txt",
    "type": "qa"
    }
    Version: v12.0.0

    API Gateway Logging

    Enabling API Gateway logging

    In order to enable distribution API Access and execution logging, configure the TEA deployment by setting log_api_gateway_to_cloudwatch on the thin_egress_app module:

    log_api_gateway_to_cloudwatch = true

    This enables the distribution API to send its logs to the default CloudWatch location: API-Gateway-Execution-Logs_<RESTAPI_ID>/<STAGE>
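As a quick sanity check (not part of the required setup), you can watch these logs with the AWS CLI v2, which provides a logs tail subcommand; substitute your own REST API ID and stage:

aws logs tail "API-Gateway-Execution-Logs_<RESTAPI_ID>/<STAGE>" --follow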

    Configure Permissions for API Gateway Logging to CloudWatch

    Instructions for enabling account level logging from API Gateway to CloudWatch

This is a one-time operation that must be performed on each AWS account to allow API Gateway to push logs to CloudWatch.

    Create a policy document

    The AmazonAPIGatewayPushToCloudWatchLogs managed policy, with an ARN of arn:aws:iam::aws:policy/service-role/AmazonAPIGatewayPushToCloudWatchLogs, has all the required permissions to enable API Gateway logging to CloudWatch. To grant these permissions to your account, first create an IAM role with apigateway.amazonaws.com as its trusted entity.

    Save this snippet as apigateway-policy.json.

    {
    "Version": "2012-10-17",
    "Statement": [
    {
    "Sid": "",
    "Effect": "Allow",
    "Principal": {
    "Service": "apigateway.amazonaws.com"
    },
    "Action": "sts:AssumeRole"
    }
    ]
    }

    Create an account role to act as ApiGateway and write to CloudWatchLogs

    NASA users in NGAP: be sure to use your account's permission boundary.

    aws iam create-role \
    --role-name ApiGatewayToCloudWatchLogs \
    [--permissions-boundary <permissionBoundaryArn>] \
    --assume-role-policy-document file://apigateway-policy.json

    Note the ARN of the returned role for the last step.

    Attach correct permissions to role

    Next attach the AmazonAPIGatewayPushToCloudWatchLogs policy to the IAM role.

    aws iam attach-role-policy \
    --role-name ApiGatewayToCloudWatchLogs \
    --policy-arn "arn:aws:iam::aws:policy/service-role/AmazonAPIGatewayPushToCloudWatchLogs"

    Update Account API Gateway settings with correct permissions

    Finally, set the IAM role ARN on the cloudWatchRoleArn property on your API Gateway Account settings.

    aws apigateway update-account \
    --patch-operations op='replace',path='/cloudwatchRoleArn',value='<ApiGatewayToCloudWatchLogs ARN>'

    Configure API Gateway CloudWatch Logs Delivery

    See Configure Cloudwatch Logs Delivery

Choosing and Configuring your RDS Database

When using this module to create your RDS cluster, you can configure the autoscaling timeout action, the cluster minimum and maximum capacity, and more as seen in the supported variables for the module.

    Unfortunately, Terraform currently doesn't allow specifying the autoscaling timeout itself, so that value will have to be manually configured in the AWS console or CLI.

    Version: v12.0.0

    Configure Cloudwatch Logs Delivery

    As an optional configuration step, it is possible to deliver CloudWatch logs to a cross-account shared AWS::Logs::Destination. An operator does this by configuring the cumulus module for your deployment as shown below. The value of the log_destination_arn variable is the ARN of a writeable log destination.

    The value can be either an AWS::Logs::Destination or a Kinesis Stream ARN to which your account can write.

    log_destination_arn           = arn:aws:[kinesis|logs]:us-east-1:123456789012:[streamName|destination:logDestinationName]

    Logs Sent

By default, the following logs will be sent to the destination when one is given.

    • Ingest logs
    • Async Operation logs
    • Thin Egress App API Gateway logs (if configured)

    Additional Logs

If additional logs are needed, you can configure additional_log_groups_to_elk with the CloudWatch log groups you want to send to the destination. additional_log_groups_to_elk is a map where each key is a descriptor and each value is the CloudWatch log group name.

    additional_log_groups_to_elk = {
    "HelloWorldTask" = "/aws/lambda/cumulus-example-HelloWorld"
    "MyCustomTask" = "my-custom-task-log-group"
    }
Component-based Cumulus Deployment

    With remote state, Terraform writes the state data to a remote data store, which can then be shared between all members of a team.

    The recommended approach for handling remote state with Cumulus is to use the S3 backend. This backend stores state in S3 and uses a DynamoDB table for locking.

    See the deployment documentation for a walk-through of creating resources for your remote state using an S3 backend.

    Version: v12.0.0

    Creating an S3 Bucket

    Buckets can be created on the command line with AWS CLI or via the web interface on the AWS console.

    When creating a protected bucket (a bucket containing data which will be served through the distribution API), make sure to enable S3 server access logging. See S3 Server Access Logging for more details.
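For example, server access logging can be enabled from the AWS CLI roughly as follows; the bucket names and prefix are placeholders for your own logging destination, which must already be configured to accept S3 log delivery:

aws s3api put-bucket-logging \
  --bucket <protected-bucket-name> \
  --bucket-logging-status '{
    "LoggingEnabled": {
      "TargetBucket": "<your-logging-bucket>",
      "TargetPrefix": "s3_access_logs/"
    }
  }'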

    Command line

Using the AWS CLI s3api create-bucket subcommand:

    $ aws s3api create-bucket \
    --bucket foobar-internal \
    --region us-west-2 \
    --create-bucket-configuration LocationConstraint=us-west-2
    {
    "Location": "/foobar-internal"
    }

    Note: The region and create-bucket-configuration arguments are only necessary if you are creating a bucket outside of the us-east-1 region.

    Please note security settings and other bucket options can be set via the options listed in the s3api documentation.

    Repeat the above step for each bucket to be created.

    Web interface

    See: AWS "Creating a Bucket" documentation

    Version: v12.0.0

    Using the Cumulus Distribution API

    The Cumulus Distribution API is a set of endpoints that can be used to enable AWS Cognito authentication when downloading data from S3.

    Configuring a Cumulus Distribution deployment

    The Cumulus Distribution API is included in the main Cumulus repo. It is available as part of the terraform-aws-cumulus.zip archive in the latest release.

    These steps assume you're using the Cumulus Deployment Template but can also be used for custom deployments.

    To configure a deployment to use Cumulus Distribution:

    1. Remove or comment the "Thin Egress App Settings" in the Cumulus Template Deploy and enable the Cumulus Distribution settings.
    2. Delete or comment the contents of thin_egress_app.tf and the corresponding Thin Egress App outputs in outputs.tf. These are not necessary for a Cumulus Distribution deployment.
    3. Uncomment the Cumulus Distribution outputs in outputs.tf.
    4. Rename cumulus-template-deploy/cumulus-tf/cumulus_distribution.tf.example to cumulus-template-deploy/cumulus-tf/cumulus_distribution.tf.
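For example, step 4 is just a file rename, which from the root of your cloned template repository might look like this (adjust the path if your checkout differs):

mv cumulus-template-deploy/cumulus-tf/cumulus_distribution.tf.example \
   cumulus-template-deploy/cumulus-tf/cumulus_distribution.tf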

    Cognito Application and User Credentials

    The major prerequisite for using the Cumulus Distribution API is to set up Cognito. If operating within NGAP, this should already be done for you. If operating outside of NGAP, you must set up Cognito yourself, which is beyond the scope of this documentation.

    Given that Cognito is set up, in order to be able to download granule files via the Cumulus Distribution API, you must obtain Cognito user credentials, because any attempt to download such files (that will be, or have been, published to the CMR via your Cumulus deployment) will result in a prompt for you to supply Cognito user credentials. To obtain your own user credentials, talk to your product owner or scrum master for additional information. They should either know how to create the credentials, know who can create them for the team, or be the liaison to the Cognito team.

    Further, whoever helps to obtain your Cognito user credentials should also be able to supply you with the values for the following new variables that you must add to your cumulus-tf/terraform.tfvars file:

    • csdap_host_url: The URL of the Cognito service to which your Cumulus deployment will make Cognito API calls during a distribution (download) event
    • csdap_client_id: The client ID for the Cumulus application registered within the Cognito service
    • csdap_client_password: The client password for the Cumulus application registered within the Cognito service
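Once you have the values, the additions to cumulus-tf/terraform.tfvars might look like the following sketch, where every value is a placeholder to be replaced with what your Cognito contact provides:

csdap_host_url        = "<cognito service url>"
csdap_client_id       = "<cumulus application client id>"
csdap_client_password = "<cumulus application client password>"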

    Although you might have to wait a bit for your Cognito user credentials, the remaining instructions do not depend upon having them, so you may continue with these instructions while waiting for your credentials.

    Cumulus Distribution URL

    Your Cumulus Distribution URL is used by Cumulus to generate download URLs as part of the granule metadata generated and published to the CMR. For example, a granule download URL will be of the form <distribution url>/<protected bucket>/<key> (or <distribution url>/path/to/file, if using a custom bucket map, as explained further below).

    By default, the value of your distribution URL is the URL of your private Cumulus Distribution API Gateway (the API Gateway named <prefix>-distribution, once you deploy the Cumulus Distribution module). Therefore, by default, the generated download URLs are private, and thus inaccessible directly, but there are 2 ways to address this issue (both of which are detailed below): (a) use tunneling (typically in development) or (b) put a CloudFront URL in front of your API Gateway (typically in production, and perhaps UAT and/or SIT).

    In either case, you must first know the default URL (i.e., the URL for the private Cumulus Distribution API Gateway). In order to obtain this default URL, you must first deploy your cumulus-tf module with the new Cumulus Distribution module, and once your initial deployment is complete, one of the Terraform outputs will be cumulus_distribution_api_uri, which is the URL for the private API Gateway.

    You may override this default URL by adding a cumulus_distribution_url variable to your cumulus-tf/terraform.tfvars file, and setting it to one of the following values (both of which are explained below):

    1. The default URL, but with a port added to it, in order to allow you to configure tunneling (typically only in development)
    2. A CloudFront URL placed in front of your Cumulus Distribution API Gateway (typically only for Production, but perhaps also for a UAT or SIT environment)

    The following subsections explain these approaches, in turn.

    Using your Cumulus Distribution API Gateway URL as your distribution URL

    Since your Cumulus Distribution API Gateway URL is private, the only way you can use it to confirm that your integration with Cognito is working is by using tunneling (again, generally for development), as described here. Here is an outline of the required steps, with details provided further below:

    1. Create/import a key pair into your AWS EC2 service (if you haven't already done so)
    2. Add a reference to the name of the key pair to your Terraform variables (we'll set the key_name Terraform variable)
    3. Choose an open local port on your machine (we'll use 9000 in the following details)
    4. Add a reference to the value of your cumulus_distribution_api_uri (mentioned earlier), including your chosen port (we'll set the cumulus_distribution_url Terraform variable)
    5. Redeploy Cumulus
    6. Add an entry to your /etc/hosts file
    7. Add a redirect URI to Cognito, via the Cognito API
    8. Install the Session Manager Plugin for the AWS CLI (if you haven't already done so; assuming you have already installed the AWS CLI)
    9. Add a sample file to S3 to test downloading via Cognito

To create a new key pair or import an existing one, you can use the AWS CLI (see aws ec2 import-key-pair) or the AWS Console (see Amazon EC2 key pairs and Linux instances).

    Once your key pair is added to AWS, add the following to your cumulus-tf/terraform.tfvars file:

    key_name = "<name>"
    cumulus_distribution_url = "https://<id>.execute-api.<region>.amazonaws.com:<port>/dev/"

    where:

    • <name> is the name of the key pair you just added to AWS
    • <id> and <region> are the corresponding parts from your cumulus_distribution_api_uri output variable
    • <port> is your open local port of choice (9000 is typically a good choice)

    Once you save your variable changes, redeploy your cumulus-tf module.

    While your deployment runs, add the following entry to your /etc/hosts file, replacing <hostname> with the host name of the cumulus_distribution_url Terraform variable you just added above:

127.0.0.1 <hostname>

    Next, you'll need to use the Cognito API to add the value of your cumulus_distribution_url Terraform variable as a Cognito redirect URI. To do so, use your favorite tool (e.g., curl, wget, Postman, etc.) to make a BasicAuth request to the Cognito API, using the following details:

    • method: POST
    • base URL: the value of your csdap_host_url Terraform variable
    • path: /authclient/updateRedirectUri
    • username: the value of your csdap_client_id Terraform variable
    • password: the value of your csdap_client_password Terraform variable
    • headers: Content-Type='application/x-www-form-urlencoded'
    • body: redirect_uri=<cumulus_distribution_url>/login

    where <cumulus_distribution_url> is the value of your cumulus_distribution_url Terraform variable. Note the /login path at the end of the redirect_uri value.
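Putting those details together, a request made with curl might look like the following sketch; all bracketed values are placeholders for your own Terraform variable values:

curl -X POST "<csdap_host_url>/authclient/updateRedirectUri" \
  --user "<csdap_client_id>:<csdap_client_password>" \
  --header "Content-Type: application/x-www-form-urlencoded" \
  --data-urlencode "redirect_uri=<cumulus_distribution_url>/login"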

    For reference, see the Cognito Authentication Service API.

    Next, install the Session Manager Plugin for the AWS CLI. If running on macOS, and you use Homebrew, you can install it simply as follows:

    brew install --cask session-manager-plugin --no-quarantine

    As your final setup step, add a sample file to one of the protected buckets listed in your buckets Terraform variable in your cumulus-tf/terraform.tfvars file. The key for the S3 object doesn't matter, nor does it matter what file you use. All that matters is that the file is an S3 object in one of your protected buckets, because Cognito is triggered when attempting to download from one of those buckets.

    At this point, you should be ready to open a tunnel and attempt to download your sample file via your browser, summarized as follows:

    1. Determine your ec2 instance ID
    2. Connect to the NASA VPN
    3. Start an AWS SSM session
    4. Open an ssh tunnel
    5. Use a browser to navigate to your file

To determine the EC2 instance ID for your Cumulus deployment, run the following command, where <profile> is the name of the appropriate AWS profile to use, and <prefix> is the value of your prefix Terraform variable:

    aws --profile <profile> ec2 describe-instances --filters Name=tag:Deployment,Values=<prefix> Name=instance-state-name,Values=running --query "Reservations[0].Instances[].InstanceId" --output text

    IMPORTANT: Before proceeding with the remaining steps, make sure you're connected to the NASA VPN.

    Use the value output from the command above in place of <id> in the following command, which will start an SSM session:

    aws ssm start-session --target <id> --document-name AWS-StartPortForwardingSession --parameters portNumber=22,localPortNumber=6000

    If successful, you should see output similar to the following:

    Starting session with SessionId: NGAPShApplicationDeveloper-***
    Port 6000 opened for sessionId NGAPShApplicationDeveloper-***.
    Waiting for connections...

    Open another terminal window, and open a tunnel with port forwarding, using your chosen port from above (e.g., 9000):

    ssh -4 -p 6000 -N -L <port>:<api-gateway-host>:443 ec2-user@127.0.0.1

    where:

    • <port> is the open local port you chose earlier (e.g., 9000)
    • <api-gateway-host> is the hostname of your private API Gateway (i.e., the host portion of the URL you used as the value of your cumulus_distribution_url Terraform variable above)

    Finally, use your chosen browser to navigate to <cumulus_distribution_url>/<bucket>/<key>, where <bucket> and <key> reference the sample file you added to S3 above.

    If all goes well, you should be prompted for your Cognito username and password. If you have obtained your Cognito user credentials, enter them, followed by entering a code generated by the authenticator application you registered at the time you completed your Cognito registration process. Once your credentials and auth code are correctly supplied, after a few moments, the download process will begin.

    Once you're finished testing, clean up as follows:

    1. Kill your ssh tunnel (Ctrl-C)
    2. Kill your AWS SSM session (Ctrl-C)
3. If you like, disconnect from the NASA VPN

    While this is a relatively lengthy process, things are much easier when using CloudFront, such as in Production (OPS), SIT, or UAT, as explained next.

    Using a CloudFront URL as your distribution URL

    In Production (OPS), and perhaps in other environments, such as UAT and SIT, you'll need to provide a publicly accessible URL for users to use for downloading (distributing) granule files.

    This is generally done by placing a CloudFront URL in front of your private Cumulus Distribution API Gateway. In order to create such a CloudFront URL, contact the person who helped you obtain your Cognito credentials, and request a CloudFront URL with the following details:

    • The private, backing URL, which is the value of your cumulus_distribution_api_uri Terraform output value
    • A request to add the AWS account's VPC to the whitelist

    Once this request is completed, and you obtain the new CloudFront URL, override your default distribution URL with the CloudFront URL by adding the following to your cumulus-tf/terraform.tfvars file:

cumulus_distribution_url = "<cloudfront_url>"

    In addition, add a Cognito redirect URI, as detailed in the previous section. Note that in this case, the value you'll use for redirect_uri is <cloudfront_url>/login since the value of your cumulus_distribution_url is now your CloudFront URL.

    At this point, it is assumed that you have added the appropriate values for this environment for the variables described at the top (csdap_host_url, csdap_client_id, and csdap_client_password).

    Redeploy Cumulus with your new/updated Terraform variables.

    As your final setup step, add a sample file to one of the protected buckets listed in your buckets Terraform variable in your cumulus-tf/terraform.tfvars file. The key for the S3 object doesn't matter, nor does it matter what file you use. All that matters is that the file is an S3 object in one of your protected buckets, because Cognito is triggered when attempting to download from one of those buckets.

    Finally, use your chosen browser to navigate to <cumulus_distribution_url>/<bucket>/<key>, where <bucket> and <key> reference the sample file you added to S3.

    If all goes well, you should be prompted for your Cognito username and password. If you have obtained your Cognito user credentials, enter them, followed by entering a code generated by the authenticator application you registered at the time you completed your Cognito registration process. Once your credentials and auth code are correctly supplied, after a few moments, the download process will begin.

    S3 Bucket Mapping

    An S3 Bucket map allows users to abstract bucket names. If the bucket names change at any point, only the bucket map would need to be updated instead of every S3 link.

    The Cumulus Distribution API uses a bucket_map.yaml or bucket_map.yaml.tmpl file to determine which buckets to serve. See the examples.

    The default Cumulus module generates a file at s3://${system_bucket}/distribution_bucket_map.json.

    The configuration file is a simple json mapping of the form:

    {
    "daac-public-data-bucket": "/path/to/this/kind/of/data"
    }

    Note: Cumulus only supports a one-to-one mapping of bucket -> Cumulus Distribution path for 'distribution' buckets. Also, the bucket map must include mappings for all of the protected and public buckets specified in the buckets variable in cumulus-tf/terraform.tfvars, otherwise Cumulus may not be able to determine the correct distribution URL for ingested files and you may encounter errors.

    Switching from the Thin Egress App to Cumulus Distribution

    If you have previously deployed the Thin Egress App (TEA) as your distribution app, you can switch to Cumulus Distribution by following the steps above.

    Note, however, that the cumulus_distribution module will generate a bucket map cache and overwrite any existing bucket map caches created by TEA.

    There will also be downtime while your API gateway is updated.

How to Deploy Cumulus

    Consider the sizing of your Cumulus instance when configuring your variables.

    Choose a distribution API

    Cumulus can be configured to use either the Thin Egress App (TEA) or the Cumulus Distribution API. The default selection is the Thin Egress App if you're using the Deployment Template.

    IMPORTANT! If you already have a deployment using the TEA distribution and want to switch to Cumulus Distribution, there will be an API Gateway change. This means that there will be downtime while you update your CloudFront endpoint to use the new API gateway.

    Configure the Thin Egress App

    The Thin Egress App can be used for Cumulus distribution and is the default selection. It allows authentication using Earthdata Login. Follow the steps in the documentation to configure distribution in your cumulus-tf deployment.

    Configure the Cumulus Distribution API (optional)

    If you would prefer to use the Cumulus Distribution API, which supports AWS Cognito authentication, follow these steps to configure distribution in your cumulus-tf deployment.

    Initialize Terraform

Follow the above instructions to initialize Terraform using terraform init.[1]

    Deploy

    Run terraform apply to deploy the resources. Type yes when prompted to confirm that you want to create the resources. Assuming the operation is successful, you should see output like this:

    Apply complete! Resources: 292 added, 0 changed, 0 destroyed.

    Outputs:

    archive_api_redirect_uri = https://abc123.execute-api.us-east-1.amazonaws.com/dev/token
    archive_api_uri = https://abc123.execute-api.us-east-1.amazonaws.com/dev/
    distribution_redirect_uri = https://abc123.execute-api.us-east-1.amazonaws.com/DEV/login
    distribution_url = https://abc123.execute-api.us-east-1.amazonaws.com/DEV/

    Note: Be sure to copy the redirect URLs, as you will use them to update your Earthdata application.

    Update Earthdata Application

    You will need to add two redirect URLs to your EarthData login application.

    1. Login to URS.
    2. Under My Applications -> Application Administration -> use the edit icon of your application.
    3. Under Manage -> redirect URIs, add the Archive API url returned from the stack deployment
      • e.g. archive_api_redirect_uri = https://<czbbkscuy6>.execute-api.us-east-1.amazonaws.com/dev/token.
    4. Also add the Distribution url
  • e.g. distribution_redirect_uri = https://<kido2r7kji>.execute-api.us-east-1.amazonaws.com/dev/login.[2]
    5. You may delete the placeholder url you used to create the application.

If you've lost track of the needed redirect URIs, they can be located on the API Gateway. Once there, select <prefix>-archive and/or <prefix>-thin-egress-app-EgressGateway, then Dashboard, and use the base URL at the top of the page that is accompanied by the text Invoke this API at:. Make sure to append /token for the archive URL and /login for the thin egress app URL.
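If you prefer the CLI, one way (among others) to look up the REST API ID that forms the base of those URLs is sketched below; the profile, prefix, region, and stage are assumptions to adjust for your deployment:

aws --profile <profile> apigateway get-rest-apis \
  --query "items[?name=='<prefix>-archive'].id" \
  --output text
# The invoke URL is then https://<id>.execute-api.<region>.amazonaws.com/dev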


    Deploy Cumulus dashboard

    Dashboard Requirements

    Please note that the requirements are similar to the Cumulus stack deployment requirements. The installation instructions below include a step that will install/use the required node version referenced in the .nvmrc file in the dashboard repository.

    Prepare AWS

    Create S3 bucket for dashboard:

    • Create it, e.g. <prefix>-dashboard. Use the command line or console as you did when preparing AWS configuration.
    • Configure the bucket to host a website:
      • AWS S3 console: Select <prefix>-dashboard bucket then, "Properties" -> "Static Website Hosting", point to index.html
      • CLI: aws s3 website s3://<prefix>-dashboard --index-document index.html
    • The bucket's url will be http://<prefix>-dashboard.s3-website-<region>.amazonaws.com or you can find it on the AWS console via "Properties" -> "Static website hosting" -> "Endpoint"
    • Ensure the bucket's access permissions allow your deployment user access to write to the bucket

    Install dashboard

    To install the dashboard, clone the Cumulus dashboard repository into the root deploy directory and install dependencies with npm install:

      git clone https://github.com/nasa/cumulus-dashboard
    cd cumulus-dashboard
    nvm use
    npm install

    If you do not have the correct version of node installed, replace nvm use with nvm install $(cat .nvmrc) in the above example.

    Dashboard versioning

    By default, the master branch will be used for dashboard deployments. The master branch of the dashboard repo contains the most recent stable release of the dashboard.

    If you want to test unreleased changes to the dashboard, use the develop branch.

    Each release/version of the dashboard will have a tag in the dashboard repo. Release/version numbers will use semantic versioning (major/minor/patch).

    To checkout and install a specific version of the dashboard:

      git fetch --tags
    git checkout <version-number> # e.g. v1.2.0
    nvm use
    npm install

    If you do not have the correct version of node installed, replace nvm use with nvm install $(cat .nvmrc) in the above example.

    Building the dashboard

    Note: These environment variables are available during the build: APIROOT, DAAC_NAME, STAGE, HIDE_PDR. Any of these can be set on the command line to override the values contained in config.js when running the build below.

To configure your dashboard for deployment, set the APIROOT environment variable to your app's API root.[3]

    Build the dashboard from the dashboard repository root directory, cumulus-dashboard:

      APIROOT=<your_api_root> npm run build

    Dashboard deployment

    Deploy dashboard to s3 bucket from the cumulus-dashboard directory:

    Using AWS CLI:

      aws s3 sync dist s3://<prefix>-dashboard --acl public-read

    From the S3 Console:

    • Open the <prefix>-dashboard bucket, click 'upload'. Add the contents of the 'dist' subdirectory to the upload. Then select 'Next'. On the permissions window allow the public to view. Select 'Upload'.

    You should be able to visit the dashboard website at http://<prefix>-dashboard.s3-website-<region>.amazonaws.com or find the url <prefix>-dashboard -> "Properties" -> "Static website hosting" -> "Endpoint" and login with a user that you configured for access in the Configure and Deploy the Cumulus Stack step.


    Cumulus Instance Sizing

The Cumulus deployment default sizing for Elasticsearch instances, EC2 instances, and Autoscaling Groups is small and designed for testing and cost savings. The default settings are likely not suitable for production workloads. Sizing is highly individual and dependent on expected load and archive size.

    Please be cognizant of costs as any change in size will affect your AWS bill. AWS provides a pricing calculator for estimating costs.

    Elasticsearch

    The mappings file contains all of the data types that will be indexed into Elasticsearch. Elasticsearch sizing is tied to your archive size, including your collections, granules, and workflow executions that will be stored.

    AWS provides documentation on calculating and configuring for sizing.

In addition to size, you'll want to consider the number of nodes, which determines how the system reacts in the event of a failure.

    Configuration can be done in the data persistence module in elasticsearch_config and the cumulus module in es_index_shards.

    If you make changes to your Elasticsearch configuration you will need to reindex for those changes to take effect.

    EC2 instances and autoscaling groups

EC2 instances are used for long-running operations (e.g. generating a reconciliation report) and long-running workflow tasks. Configuration for your ECS cluster is achieved via Cumulus deployment variables.

    When configuring your ECS cluster consider:

    • The EC2 instance type and EBS volume size needed to accommodate your workloads. Configured as ecs_cluster_instance_type and ecs_cluster_instance_docker_volume_size.
    • The minimum and desired number of instances on hand to accommodate your workloads. Configured as ecs_cluster_min_size and ecs_cluster_desired_size.
    • The maximum number of instances you will need and are willing to pay for to accommodate your heaviest workloads. Configured as ecs_cluster_max_size.
    • Your autoscaling parameters: ecs_cluster_scale_in_adjustment_percent, ecs_cluster_scale_out_adjustment_percent, ecs_cluster_scale_in_threshold_percent, and ecs_cluster_scale_out_threshold_percent.
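As a rough illustration only, an ECS-related block in cumulus-tf/terraform.tfvars might look like the sketch below; the instance type and numbers are placeholders, not recommendations:

ecs_cluster_instance_type                = "t3.large"
ecs_cluster_instance_docker_volume_size  = 100
ecs_cluster_min_size                     = 2
ecs_cluster_desired_size                 = 2
ecs_cluster_max_size                     = 6
ecs_cluster_scale_in_threshold_percent   = 25
ecs_cluster_scale_in_adjustment_percent  = -5
ecs_cluster_scale_out_threshold_percent  = 75
ecs_cluster_scale_out_adjustment_percent = 10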

    Footnotes


    1. Run terraform init if:

      • This is the first time deploying the module
      • You have added any additional child modules, including Cumulus components
      • You have updated the source for any of the child modules

2. To add another redirect URI to your application: on the Earthdata home page, select "My Applications". Scroll down to "Application Administration" and use the edit icon for your application. Then Manage -> Redirect URIs.

3. The API root can be found in a number of ways. The easiest is to note it in the output of the app deployment step. But you can also find it from the AWS console -> Amazon API Gateway -> APIs -> <prefix>-archive -> Dashboard, reading the URL at the top after "Invoke this API at".

PostgreSQL Database Deployment

Cumulus provides a Terraform module, cumulus-rds-tf, that will deploy an AWS RDS Aurora Serverless PostgreSQL 10.2 compatible database cluster, and optionally provision a single deployment database with credentialed secrets for use with Cumulus.

    We have provided an example terraform deployment using this module in the Cumulus template-deploy repository on github.

    Use of this example involves:

    • Creating/configuring a Terraform module directory
    • Using Terraform to deploy resources to AWS

    Requirements

    Configuration/installation of this module requires the following:

    • Terraform
    • git
    • A VPC configured for use with Cumulus Core. This should match the subnets you provide when Deploying Cumulus to allow Core's lambdas to properly access the database.
    • At least two subnets across multiple AZs. These should match the subnets you provide as configuration when Deploying Cumulus, and should be within the same VPC.

    Needed Git Repositories

    Assumptions

    OS/Environment

    The instructions in this module require Linux/MacOS. While deployment via Windows is possible, it is unsupported.

    Terraform

    This document assumes knowledge of Terraform. If you are not comfortable working with Terraform, the following links should bring you up to speed:

    For Cumulus specific instructions on installation of Terraform, refer to the main Cumulus Installation Documentation

    Aurora/RDS

    This document also assumes some basic familiarity with PostgreSQL databases, and Amazon Aurora/RDS. If you're unfamiliar consider perusing the AWS docs, and the Aurora Serverless V1 docs.

    Prepare deployment repository

    If you already are working with an existing repository that has a configured rds-cluster-tf deployment for the version of Cumulus you intend to deploy or update, or just need to configure this module for your repository, skip to Prepare AWS configuration.

    Clone the cumulus-template-deploy repo and name appropriately for your organization:

      git clone https://github.com/nasa/cumulus-template-deploy <repository-name>

    We will return to configuring this repo and using it for deployment below.

    Optional: Create a new repository

    Create a new repository on Github so that you can add your workflows and other modules to source control:

      git remote set-url origin https://github.com/<org>/<repository-name>
    git push origin master

    You can then add/commit changes as needed.

    Note: If you are pushing your deployment code to a git repo, make sure to add terraform.tf and terraform.tfvars to .gitignore, as these files will contain sensitive data related to your AWS account.


    Prepare AWS configuration

To deploy this module, make sure that you have completed the equivalent of the following steps from the Cumulus deployment instructions for this module:

    --

    Configure and deploy the module

When configuring this module, please keep in mind that unlike the Cumulus deployment, this module should be deployed once to create the database cluster, and re-applied thereafter only to make configuration changes or upgrades. This module does not need to be re-deployed for each Core update.

    These steps should be executed in the rds-cluster-tf directory of the template deploy repo that you previously cloned. Run the following to copy the example files:

    cd rds-cluster-tf/
    cp terraform.tf.example terraform.tf
    cp terraform.tfvars.example terraform.tfvars

    In terraform.tf, configure the remote state settings by substituting the appropriate values for:

    • bucket
    • dynamodb_table
    • PREFIX (whatever prefix you've chosen for your deployment)

    Fill in the appropriate values in terraform.tfvars. See the rds-cluster-tf module variable definitions for more detail on all of the configuration options. A few notable configuration options are documented in the next section.

    Configuration Options

    • deletion_protection -- defaults to true. Set it to false if you want to be able to delete your cluster with a terraform destroy without manually updating the cluster.
    • db_admin_username -- cluster database administration username. Defaults to postgres.
    • db_admin_password -- required variable that specifies the admin user password for the cluster. To randomize this on each deployment, consider using a random_string resource as input.
    • region -- defaults to us-east-1.
    • subnets -- requires at least 2 across different AZs. For use with Cumulus, these AZs should match the values you configure for your lambda_subnet_ids.
    • max_capacity -- the max ACUs the cluster is allowed to use. Carefully consider cost/performance concerns when setting this value.
    • min_capacity -- the minimum ACUs the cluster will scale to
    • provision_user_database -- Optional flag to allow module to provision a user database in addition to creating the cluster. Described in the next section.
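To make these options concrete, a terraform.tfvars sketch using only the variables described above might look like this; every value is a placeholder, and other required variables from terraform.tfvars.example are omitted:

region                  = "us-east-1"
subnets                 = ["subnet-xxxxxxxxx", "subnet-yyyyyyyyy"]
deletion_protection     = true
db_admin_username       = "postgres"
db_admin_password       = "<your admin password>"
max_capacity            = 4
min_capacity            = 2
provision_user_database = false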

    Provision user and user database

    If you wish for the module to provision a PostgreSQL database on your new cluster and provide a secret for access in the module output, in addition to managing the cluster itself, the following configuration keys are required:

    • provision_user_database -- must be set to true, this configures the module to deploy a lambda that will create the user database, and update the provided configuration on deploy.
• permissions_boundary_arn -- the permissions boundary to use when creating the roles the provisioning lambda will need for access. This should in most use cases be the same one used for Cumulus Core deployment.
    • rds_user_password -- the value to set the user password to
• prefix -- this value will be used to set a unique identifier for the ProvisionDatabase lambda, as well as name the provisioned user/database.

    Once configured, the module will deploy the lambda, and run it on each provision, creating the configured database if it does not exist, updating the user password if that value has been changed, and updating the output user database secret.

    Setting provision_user_database to false after provisioning will not result in removal of the configured database, as the lambda is non-destructive as configured in this module.

    Please Note: This functionality is limited in that it will only provision a single database/user and configure a basic database, and should not be used in scenarios where more complex configuration is required.

    Initialize Terraform

    Run terraform init

    You should see output like:

    * provider.aws: version = "~> 2.32"

    Terraform has been successfully initialized!

    Deploy

    Run terraform apply to deploy the resources.

If re-applying this module, variables (e.g. engine_version, snapshot_identifier) that force a recreation of the database cluster may result in data loss if deletion protection is disabled. Examine the changeset carefully for resources that will be re-created/destroyed before applying.

    Review the changeset, and assuming it looks correct, type yes when prompted to confirm that you want to create all of the resources.

    Assuming the operation is successful, you should see output similar to the following (this example omits the creation of a user database/lambdas/security groups):

    terraform apply

    An execution plan has been generated and is shown below.
    Resource actions are indicated with the following symbols:
    + create

    Terraform will perform the following actions:

    # module.rds_cluster.aws_db_subnet_group.default will be created
    + resource "aws_db_subnet_group" "default" {
    + arn = (known after apply)
    + description = "Managed by Terraform"
    + id = (known after apply)
    + name = (known after apply)
    + name_prefix = "xxxxxxxxx"
    + subnet_ids = [
    + "subnet-xxxxxxxxx",
    + "subnet-xxxxxxxxx",
    ]
    + tags = {
    + "Deployment" = "xxxxxxxxx"
    }
    }

    # module.rds_cluster.aws_rds_cluster.cumulus will be created
    + resource "aws_rds_cluster" "cumulus" {
    + apply_immediately = true
    + arn = (known after apply)
    + availability_zones = (known after apply)
    + backup_retention_period = 1
    + cluster_identifier = "xxxxxxxxx"
    + cluster_identifier_prefix = (known after apply)
    + cluster_members = (known after apply)
    + cluster_resource_id = (known after apply)
    + copy_tags_to_snapshot = false
    + database_name = "xxxxxxxxx"
    + db_cluster_parameter_group_name = (known after apply)
    + db_subnet_group_name = (known after apply)
    + deletion_protection = true
    + enable_http_endpoint = true
    + endpoint = (known after apply)
    + engine = "aurora-postgresql"
    + engine_mode = "serverless"
    + engine_version = "10.12"
    + final_snapshot_identifier = "xxxxxxxxx"
    + hosted_zone_id = (known after apply)
    + id = (known after apply)
    + kms_key_id = (known after apply)
    + master_password = (sensitive value)
    + master_username = "xxxxxxxxx"
    + port = (known after apply)
    + preferred_backup_window = "07:00-09:00"
    + preferred_maintenance_window = (known after apply)
    + reader_endpoint = (known after apply)
    + skip_final_snapshot = false
    + storage_encrypted = (known after apply)
    + tags = {
    + "Deployment" = "xxxxxxxxx"
    }
    + vpc_security_group_ids = (known after apply)

    + scaling_configuration {
    + auto_pause = true
    + max_capacity = 4
    + min_capacity = 2
    + seconds_until_auto_pause = 300
    + timeout_action = "RollbackCapacityChange"
    }
    }

    # module.rds_cluster.aws_secretsmanager_secret.rds_login will be created
    + resource "aws_secretsmanager_secret" "rds_login" {
    + arn = (known after apply)
    + id = (known after apply)
    + name = (known after apply)
    + name_prefix = "xxxxxxxxx"
    + policy = (known after apply)
    + recovery_window_in_days = 30
    + rotation_enabled = (known after apply)
    + rotation_lambda_arn = (known after apply)
    + tags = {
    + "Deployment" = "xxxxxxxxx"
    }

    + rotation_rules {
    + automatically_after_days = (known after apply)
    }
    }

    # module.rds_cluster.aws_secretsmanager_secret_version.rds_login will be created
    + resource "aws_secretsmanager_secret_version" "rds_login" {
    + arn = (known after apply)
    + id = (known after apply)
    + secret_id = (known after apply)
    + secret_string = (sensitive value)
    + version_id = (known after apply)
    + version_stages = (known after apply)
    }

    # module.rds_cluster.aws_security_group.rds_cluster_access will be created
    + resource "aws_security_group" "rds_cluster_access" {
    + arn = (known after apply)
    + description = "Managed by Terraform"
    + egress = (known after apply)
    + id = (known after apply)
    + ingress = (known after apply)
    + name = (known after apply)
    + name_prefix = "cumulus_rds_cluster_access_ingress"
    + owner_id = (known after apply)
    + revoke_rules_on_delete = false
    + tags = {
    + "Deployment" = "xxxxxxxxx"
    }
    + vpc_id = "vpc-xxxxxxxxx"
    }

    # module.rds_cluster.aws_security_group_rule.rds_security_group_allow_PostgreSQL will be created
    + resource "aws_security_group_rule" "rds_security_group_allow_postgres" {
    + from_port = 5432
    + id = (known after apply)
    + protocol = "tcp"
    + security_group_id = (known after apply)
    + self = true
    + source_security_group_id = (known after apply)
    + to_port = 5432
    + type = "ingress"
    }

    Plan: 6 to add, 0 to change, 0 to destroy.

    Do you want to perform these actions?
    Terraform will perform the actions described above.
    Only 'yes' will be accepted to approve.

    Enter a value: yes

    module.rds_cluster.aws_db_subnet_group.default: Creating...
    module.rds_cluster.aws_security_group.rds_cluster_access: Creating...
    module.rds_cluster.aws_secretsmanager_secret.rds_login: Creating...

    Then, after the resources are created:

    Apply complete! Resources: X added, 0 changed, 0 destroyed.
    Releasing state lock. This may take a few moments...

    Outputs:

    admin_db_login_secret_arn = arn:aws:secretsmanager:us-east-1:xxxxxxxxx:secret:xxxxxxxxxx20210407182709367700000002-dpmdR
    admin_db_login_secret_version = xxxxxxxxx
    rds_endpoint = xxxxxxxxx.us-east-1.rds.amazonaws.com
    security_group_id = xxxxxxxxx
    user_credentials_secret_arn = arn:aws:secretsmanager:us-east-1:xxxxx:secret:xxxxxxxxxx20210407182709367700000002-dpmpXA

    Note the output values for admin_db_login_secret_arn (and optionally user_credentials_secret_arn) as these provide the AWS Secrets Manager secret required to access the database as the administrative user and, optionally, the user database credentials Cumulus requires as well.

The content of each of these secrets is of the form:

    {
    "database": "postgres",
    "dbClusterIdentifier": "clusterName",
    "engine": "postgres",
    "host": "xxx",
    "password": "defaultPassword",
    "port": 5432,
    "username": "xxx"
    }
    • database -- the PostgreSQL database used by the configured user
    • dbClusterIdentifier -- the value set by the cluster_identifier variable in the terraform module
    • engine -- the Aurora/RDS database engine
    • host -- the RDS service host for the database in the form (dbClusterIdentifier)-(AWS ID string).(region).rds.amazonaws.com
    • password -- the database password
    • username -- the account username
    • port -- The database connection port, should always be 5432

    Next Steps

    The database cluster has been created/updated! From here you can continue to add additional user accounts, databases and other database configuration.

    Version: v12.0.0

    Share S3 Access Logs

It is possible through Cumulus to share S3 access logs with other systems by using the S3 replicator package.

    S3 Replicator

    The S3 Replicator is a node package that contains a simple lambda function, associated permissions, and the Terraform instructions to replicate create-object events from one S3 bucket to another.

    First ensure that you have enabled S3 Server Access Logging.

    Next configure your config.tfvars as described in the s3-replicator/README.md to correspond to your deployment. The source_bucket and source_prefix are determined by how you enabled the S3 Server Access Logging.
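For example, assuming your deployment defines an s3_replicator_config object variable as referenced in the module block below, the corresponding config.tfvars entry might look like this (all values are placeholders):

s3_replicator_config = {
  source_bucket = "<bucket receiving your S3 Server Access logs>"
  source_prefix = "<prefix configured for S3 Server Access logging>"
  target_bucket = "<destination bucket>"
  target_prefix = "<destination prefix>"
}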

In order to deploy the s3-replicator with Cumulus, you will need to add the module to your Terraform main.tf definition, e.g.:

    module "s3-replicator" {
    source = "<path to s3-replicator.zip>"
    prefix = var.prefix
    vpc_id = var.vpc_id
    subnet_ids = var.subnet_ids
    permissions_boundary = var.permissions_boundary_arn
    source_bucket = var.s3_replicator_config.source_bucket
    source_prefix = var.s3_replicator_config.source_prefix
    target_bucket = var.s3_replicator_config.target_bucket
    target_prefix = var.s3_replicator_config.target_prefix
    }

    The terraform source package can be found on the Cumulus github release page under the asset tab terraform-aws-cumulus-s3-replicator.zip.

    ESDIS Metrics

    In the NGAP environment, the ESDIS Metrics team has set up an ELK stack to process logs from Cumulus instances. To use this system, you must deliver any S3 Server Access logs that Cumulus creates.

    Configure the S3 replicator as described above using the target_bucket and target_prefix provided by the metrics team.

    The metrics team has taken care of setting up Logstash to ingest the files that get delivered to their bucket into their Elasticsearch instance.

Terraform Best Practices

To verify that all resources for a deployment have been deleted, you can run the following AWS CLI command, replacing PREFIX with your deployment prefix name:

    aws resourcegroupstaggingapi get-resources \
    --query "ResourceTagMappingList[].ResourceARN" \
    --tag-filters Key=Deployment,Values=PREFIX

    Ideally, the output should be an empty list, but if it is not, then you may need to manually delete the listed resources.

    Configuring the Cumulus deployment: link Restoring a previous version: link

    Version: v12.0.0

    Using the Thin Egress App for Cumulus distribution

    The Thin Egress App (TEA) is an app running in Lambda that allows retrieving data from S3 using temporary links and provides URS integration.

    Configuring a TEA deployment

    TEA is deployed using Terraform modules. Refer to these instructions for guidance on how to integrate new components with your deployment.

    The cumulus-template-deploy repository cumulus-tf/main.tf contains a thin_egress_app for distribution.

    The TEA module provides these instructions showing how to add it to your deployment and the following are instructions to configure the thin_egress_app module in your Cumulus deployment.

    Create a secret for signing Thin Egress App JWTs

    The Thin Egress App uses JWTs internally to authenticate requests and requires a secret stored in AWS Secrets Manager containing SSH keys that are used to sign the JWTs.

    See the Thin Egress App documentation on how to create this secret with the correct values. It will be used later to set the thin_egress_jwt_secret_name variable when deploying the Cumulus module.

    bucket_map.yaml

    The Thin Egress App uses a bucket_map.yaml file to determine which buckets to serve. Documentation of the file format is available here.

    The default Cumulus module generates a file at s3://${system_bucket}/distribution_bucket_map.json.

    The configuration file is a simple json mapping of the form:

    {
    "daac-public-data-bucket": "/path/to/this/kind/of/data"
    }

    Please note: Cumulus only supports a one-to-one mapping of bucket->TEA path for 'distribution' buckets.

    Optionally configure a custom bucket map

    A simple config would look something like this:

    bucket_map.yaml
    MAP:
    my-protected: my-protected
    my-public: my-public

    PUBLIC_BUCKETS:
    - my-public

    Please note: your custom bucket map must include mappings for all of the protected and public buckets specified in the buckets variable in cumulus-tf/terraform.tfvars, otherwise Cumulus may not be able to determine the correct distribution URL for ingested files and you may encounter errors.

    Optionally configure shared variables

    The cumulus module deploys certain components that interact with TEA. As a result, the cumulus module requires that if you are specifying a value for the stage_name variable to the TEA module, you must use the same value for the tea_api_gateway_stage variable to the cumulus module.

    One way to keep these variable values in sync across the modules is to use Terraform local values to define values to use for the variables for both modules. This approach is shown in the Cumulus core example deployment code.
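A minimal sketch of that approach, assuming module blocks named thin_egress_app and cumulus as in the template deployment (all other arguments omitted), might look like:

locals {
  tea_stage_name = "DEV"
}

module "thin_egress_app" {
  # ... other Thin Egress App arguments ...
  stage_name = local.tea_stage_name
}

module "cumulus" {
  # ... other Cumulus arguments ...
  tea_api_gateway_stage = local.tea_stage_name
}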

Upgrading Cumulus

After the upgrade is complete, verify that your deployment functions correctly. Please refer to some recommended smoke tests given above, and consider additional tests appropriate for your particular deployment and environment.

    Update Cumulus Dashboard

    If there are breaking (or otherwise significant) changes to the Cumulus API, you should also upgrade your Cumulus Dashboard deployment to use the version of the Cumulus API matching the version of Cumulus to which you are migrating.

    Version: v12.0.0

    Issuing PR From Forked Repos

    Fork the Repo

    • Fork the Cumulus repo
    • Create a new branch from the branch you'd like to contribute to
• If an issue doesn't already exist, submit one (see above)

    Create a Pull Request

    Reviewing PRs from Forked Repos

    Upon submission of a pull request, the Cumulus development team will review the code.

    Once the code passes an initial review, the team will run the CI tests against the proposed update.

    The request will then either be merged, declined, or an adjustment to the code will be requested via the issue opened with the original PR request.

PRs from forked repos cannot be directly merged to master. Cumulus reviewers must follow the steps below before completing the review process:

    1. Create a new branch:

        git checkout -b from-<name-of-the-branch> master
    2. Push the new branch to GitHub

    3. Change the destination of the forked PR to the new branch that was just pushed

      Screenshot of Github interface showing how to change the base branch of a pull request

    4. After code review and approval, merge the forked PR to the new branch.

    5. Create a PR for the new branch to master.

6. If the CI tests pass, merge the new branch to master and close the issue. If the CI tests do not pass, request an amended PR from the original author, or resolve failures as appropriate.

Integration Tests

    If you create a new stack and want to be able to run integration tests against it in CI, you will need to add it to bamboo/select-stack.js.

Code Coverage and Quality

    To run linting on the markdown files, run npm run lint-md.

    Audit

    This project uses audit-ci to run a security audit on the package dependency tree. This must pass prior to merge. The configured rules for audit-ci can be found here.

    To execute an audit, run npm run audit.

    - + \ No newline at end of file diff --git a/docs/v12.0.0/development/release/index.html b/docs/v12.0.0/development/release/index.html index 4c293121783..a18d5c9fca0 100644 --- a/docs/v12.0.0/development/release/index.html +++ b/docs/v12.0.0/development/release/index.html @@ -5,7 +5,7 @@ Versioning and Releases | Cumulus Documentation - + @@ -15,7 +15,7 @@ It's useful to use the search feature of your code editor or grep to see if there any references to the old package versions. In bash shell you can run

    find . -name package.json -exec grep -nH "@cumulus/.*MAJOR\.MINOR\.PATCH.*" {} \;

    Verify that each of those is updated to the new MAJOR.MINOR.PATCH version you are trying to release.

    A similar search for alpha and beta versions should be run on the release version and any problems should be fixed.

    find . -name package.json -exec grep -nHE "MAJOR\.MINOR\.PATCH.*(alpha|beta)" {} \;
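
    For example, when checking a hypothetical 9.1.0 release, these searches would look like:

    find . -name package.json -exec grep -nH "@cumulus/.*9\.1\.0.*" {} \;
    find . -name package.json -exec grep -nHE "9\.1\.0.*(alpha|beta)" {} \;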

    3. Check Cumulus Dashboard PRs for Version Bump

    There may be unreleased changes in the Cumulus Dashboard project that rely on this unreleased Cumulus Core version.

    If a PR exists in the cumulus-dashboard repo with a name containing "Version Bump for Next Cumulus API Release":

    • There will be a placeholder change-me value that should be replaced with the Cumulus Core to-be-released-version.
    • Mark that PR as ready to be reviewed.

    4. Update CHANGELOG.md

    Update the CHANGELOG.md. Put a header under the Unreleased section with the new version number and the date.

    Add a link reference for the github "compare" view at the bottom of the CHANGELOG.md, following the existing pattern. This link reference should create a link in the CHANGELOG's release header to changes in the corresponding release.

    5. Update DATA_MODEL_CHANGELOG.md

    Similar to #4, make sure the DATA_MODEL_CHANGELOG is updated if there are data model changes in the release, and the link reference at the end of the document is updated as appropriate.

    6. Update CONTRIBUTORS.md

    ./bin/update-contributors.sh
    git add CONTRIBUTORS.md

    Commit and push these changes, if any.
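
    A minimal sketch of that commit and push, assuming you are working on your release branch:

    git commit -m "Update CONTRIBUTORS.md"
    git push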

    7. Update Cumulus package API documentation

    Update auto-generated API documentation for any Cumulus packages that have it:

    npm run docs-build-packages

    Commit and push these changes, if any.

    8. Cut new version of Cumulus Documentation

    If this is a backport, do not create a new version of the documentation. For various reasons, we do not merge backports back to master, other than changelog notes. Documentation changes for backports will not be published to our documentation website.

    cd website
    npm run version ${release_version}
    git add .

    Where ${release_version} corresponds to the version tag v1.2.3, for example.

    Commit and push these changes.

    9. Create a pull request against the minor version branch

    1. Push the release branch (e.g. release-1.2.3) to GitHub (see the example after this list).

    2. Create a PR against the minor version base branch (e.g. release-1.2.x).

    3. Configure Bamboo to run automated tests against this PR by finding the branch plan for the release branch (release-1.2.3) and setting only these variables:

      • GIT_PR: true
      • SKIP_AUDIT: true

      IMPORTANT: Do NOT set the PUBLISH_FLAG variable to true for this branch plan. The actual publishing of the release will be handled by a separate, manually triggered branch plan.

      Screenshot of Bamboo CI interface showing the configuration of the GIT_PR branch variable to have a value of "true"

    4. Verify that the Bamboo build for the PR succeeds and then merge to the minor version base branch (release-1.2.x).

      • It is safe to do a squash merge in this instance, but not required
    5. You may delete your release branch (release-1.2.3) after merging to the base branch.
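
    A minimal sketch of step 1 above, assuming a hypothetical 1.2.3 release:

    git push origin release-1.2.3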

    10. Create a git tag for the release

    Check out the minor version base branch (release-1.2.x) now that your changes are merged in and do a git pull.

    Ensure you are on the latest commit.

    Create and push a new git tag:

        git tag -a vMAJOR.MINOR.PATCH -m "Release MAJOR.MINOR.PATCH"
    git push origin vMAJOR.MINOR.PATCH

    e.g.:
    git tag -a v9.1.0 -m "Release 9.1.0"
    git push origin v9.1.0

    11. Publishing the release

    Publishing of new releases is handled by a custom Bamboo branch plan and is manually triggered.

    The reasons for using a separate branch plan to handle releases instead of the branch plan for the minor version (e.g. release-1.2.x) are:

    • The Bamboo build for the minor version release branch is triggered automatically on any commits to that branch, whereas we want to manually control when the release is published.
    • We want to verify that integration tests have passed on the Bamboo build for the minor version release branch before we manually trigger the release, so that we can be sure that our code is safe to release.

    If this is a new minor version branch, then you will need to create a new Bamboo branch plan for publishing the release following the instructions below:

    Creating a Bamboo branch plan for the release

    • In the Cumulus Core project (https://ci.earthdata.nasa.gov/browse/CUM-CBA), click Actions -> Configure Plan in the top right.

    • Next to Plan branch click the rightmost button that displays Create Plan Branch upon hover.

    • Click Create plan branch manually.

    • Add the values in that list. Choose a display name that makes it very clear this is a deployment branch plan. Release (minor version branch name) seems to work well (e.g. Release (1.2.x)).

      • Make sure you enter the correct branch name (e.g. release-1.2.x).
    • Important: Deselect Enable Branch - if you do not do this, it will immediately fire off a build.

    • Do this immediately: On the Branch Details page, enable Change trigger and set the Trigger type to manual; this will prevent commits to the branch from triggering the build plan. You should have been redirected to the Branch Details tab after creating the plan. If not, navigate to the branch from the list where you clicked Create Plan Branch in the previous step.

    • Go to the Variables tab. Ensure that you are on your branch plan and not the master plan: you should not see a large list of configured variables, but instead a dropdown allowing you to select variables to override, and the tab title will be Branch Variables. Then set the branch variables as follows:

      • DEPLOYMENT: cumulus-from-npm-tf (except in special cases such as incompatible backport branches)
        • If this variable is not set, it will default to the deployment name for the last committer on the branch
      • USE_CACHED_BOOTSTRAP: false
      • USE_TERRAFORM_ZIPS: true (IMPORTANT: MUST be set in order to run integration tests against the .zip files published during the build so that we are actually testing our released files)
      • GIT_PR: true
      • SKIP_AUDIT: true
      • PUBLISH_FLAG: true
    • Enable the branch from the Branch Details page.

    • Run the branch using the Run button in the top right.

    Bamboo will build and run lint and unit tests against that tagged release, publish the new packages to NPM, and then run the integration tests using those newly released packages.

    12. Create a new Cumulus release on GitHub

    The CI release scripts will automatically create a GitHub release based on the release version tag, as well as upload artifacts to the GitHub release for the Terraform modules provided by Cumulus. The Terraform release artifacts include:

    • A multi-module Terraform .zip artifact containing filtered copies of the tf-modules, packages, and tasks directories for use as Terraform module sources.
    • An S3 replicator module
    • A workflow module
    • A distribution API module
    • An ECS service module

    Just make sure to verify that the appropriate .zip files are present on GitHub after the release process is complete.

    13. Merge base branch back to master

    Finally, you need to reproduce the version update changes back to master.

    If this is the latest version, you can simply create a PR to merge the minor version base branch back to master.

    Do not merge master back into the release branch since we want the release branch to just have the code from the release. Instead, create a new branch off of the release branch and merge that to master. You can freely merge master into this branch and delete it when it is merged to master.

    If this is a backport, you will need to create a PR that ports the changelog updates back to master. It is important in this changelog note to call it out as a backport. For example, fixes in backport version 1.14.5 may not be available in 1.15.0 because the fix was introduced in 1.15.3.

    Troubleshooting

    Delete and regenerate the tag

    To delete a published tag to re-tag, follow these steps:

      git tag -d vMAJOR.MINOR.PATCH
    git push -d origin vMAJOR.MINOR.PATCH

    e.g.:
    git tag -d v9.1.0
    git push -d origin v9.1.0
    - + \ No newline at end of file diff --git a/docs/v12.0.0/docs-how-to/index.html b/docs/v12.0.0/docs-how-to/index.html index 36b6442af85..aa318ba3fc6 100644 --- a/docs/v12.0.0/docs-how-to/index.html +++ b/docs/v12.0.0/docs-how-to/index.html @@ -5,13 +5,13 @@ Cumulus Documentation: How To's | Cumulus Documentation - +
    Version: v12.0.0

    Cumulus Documentation: How To's

    Cumulus Docs Installation

    Run a Local Server

    Environment variables DOCSEARCH_API_KEY and DOCSEARCH_INDEX_NAME must be set for search to work. At the moment, search is only truly functional on prod because that is the only website we have registered to be indexed with DocSearch (see below on search).

    git clone git@github.com:nasa/cumulus
    cd cumulus
    npm run docs-install
    npm run docs-serve

    Note: docs-build will build the documents into website/build.

    Cumulus Documentation

    Our project documentation is hosted on GitHub Pages. The resources published to this website are housed in the docs/ directory at the top of the Cumulus repository. Those resources primarily consist of markdown files and images.

    We use the open-source static website generator Docusaurus to build html files from our markdown documentation, add some organization and navigation, and provide some other niceties in the final website (search, easy templating, etc.).

    Add a New Page and Sidebars

    Adding a new page should be as simple as writing some documentation in markdown, placing it under the correct directory in the docs/ folder and adding some configuration values wrapped by --- at the top of the file. There are many files that already have this header which can be used as reference.

    ---
    id: doc-unique-id # unique id for this document. This must be unique across ALL documentation under docs/
    title: Title Of Doc # Whatever title you feel like adding. This will show up as the index to this page on the sidebar.
    hide_title: false
    ---

    Note: To have the new page show up in a sidebar the designated id must be added to a sidebar in the website/sidebars.js file. Docusaurus has an in depth explanation of sidebars here.

    Versioning Docs

    We lean heavily on Docusaurus for versioning. Their suggestions and walk-through can be found here. It is worth noting that we would like the Documentation versions to match up directly with release versions. Cumulus versioning is explained in the Versioning Docs.

    Search

    Search on our documentation site is taken care of by DocSearch. We have been provided with an apiKey and an indexName by DocSearch that we include in our website/siteConfig.js file. The rest, indexing and actual searching, we leave to DocSearch. Our builds expect environment variables for both of these values to exist - DOCSEARCH_API_KEY and DOCSEARCH_INDEX_NAME.

    Add a new task

    The tasks list in docs/tasks.md is generated from the list of task packages in the tasks folder. Do not edit the docs/tasks.md file directly.

    Read more about adding a new task.

    Editing the tasks.md header or template

    Look at the bin/build-tasks-doc.js and bin/tasks-header.md files to edit the output of the tasks build script.

    Editing diagrams

    For some diagrams included in the documentation, the raw source is included in the docs/assets/raw directory to allow for easy updating in the future:

    • assets/interfaces.svg -> assets/raw/interfaces.drawio (generated using draw.io)

    Deployment

    The master branch is automatically built and deployed to the gh-pages branch. The gh-pages branch is served by GitHub Pages. Do not make edits to the gh-pages branch.

    - + \ No newline at end of file diff --git a/docs/v12.0.0/external-contributions/index.html b/docs/v12.0.0/external-contributions/index.html index e6d00783448..d770bde641d 100644 --- a/docs/v12.0.0/external-contributions/index.html +++ b/docs/v12.0.0/external-contributions/index.html @@ -5,13 +5,13 @@ External Contributions | Cumulus Documentation - +
    Version: v12.0.0

    External Contributions

    Contributions to Cumulus may be made in the form of PRs to the repositories directly or through externally developed tasks and components. Cumulus is designed as an ecosystem that leverages Terraform deployments and AWS Step Functions to easily integrate external components.

    This list may not be exhaustive and represents components that are open source, owned externally, and that have been tested with the Cumulus system. For more information and contributing guidelines, visit the respective GitHub repositories.

    Distribution

    The ASF Thin Egress App is used by Cumulus for distribution. TEA can be deployed with Cumulus or as part of other applications to distribute data.

    Operational Cloud Recovery Archive (ORCA)

    ORCA can be deployed with Cumulus to provide a customizable baseline for creating and managing operational backups.

    Workflow Tasks

    CNM

    PO.DAAC provides two workflow tasks to be used with the Cloud Notification Mechanism (CNM) Schema: CNM to Granule and CNM Response.

    See the CNM workflow data cookbook for an example of how these can be used in a Cumulus ingest workflow.

    DMR++ Generation

    GHRC has provided a DMR++ Generation workflow task. This task is meant to be used in conjunction with Cumulus' Hyrax Metadata Updates workflow task.

    - + \ No newline at end of file diff --git a/docs/v12.0.0/faqs/index.html b/docs/v12.0.0/faqs/index.html index 84769f6b606..c2aa8b33442 100644 --- a/docs/v12.0.0/faqs/index.html +++ b/docs/v12.0.0/faqs/index.html @@ -5,13 +5,13 @@ Frequently Asked Questions | Cumulus Documentation - +
    Version: v12.0.0

    Frequently Asked Questions

    Below are some commonly asked questions that you may encounter that can assist you along the way when working with Cumulus.

    General

    How do I deploy a new instance in Cumulus?

    Answer: For steps on the Cumulus deployment process go to How to Deploy Cumulus.

    What prerequisites are needed to setup Cumulus?

    Answer: You will need access to the AWS console and an Earthdata login before you can deploy Cumulus.

    What is the preferred web browser for the Cumulus environment?

    Answer: Our preferred web browser is the latest version of Google Chrome.

    How do I quickly troubleshoot an issue in Cumulus?

    Answer: To troubleshoot and fix issues in Cumulus reference our recommended solutions in Troubleshooting Cumulus.

    Where can I get support help?

    Answer: The following options are available for assistance:

    • Cumulus: Outside NASA users should file a GitHub issue and inside NASA users should file a JIRA issue.
    • AWS: You can create a case in the AWS Support Center, accessible via your AWS Console.

    Integrators & Developers

    What is a Cumulus integrator?

    Answer: Those who are working within Cumulus and AWS for deployments and to manage workflows. They may perform the following functions:

    • Configure and deploy Cumulus to the AWS environment
    • Configure Cumulus workflows
    • Write custom workflow tasks
    What are the steps if I run into an issue during deployment?

    Answer: If you encounter an issue with your deployment go to the Troubleshooting Deployment guide.

    Is Cumulus customizable and flexible?

    Answer: Yes. Cumulus is a modular architecture that allows you to decide which components you want/need to deploy. These components are maintained as Terraform modules.

    What are Terraform modules?

    Answer: They are modules that are composed to create a Cumulus deployment, which gives integrators the flexibility to choose the components of Cumulus that they want/need. To view Cumulus maintained modules or steps on how to create a module go to Terraform modules.

    Where do I find Terraform module variables?

    Answer: Go here for a list of Cumulus maintained variables.

    What is a Cumulus workflow?

    Answer: A workflow is a provider-configured set of steps that describe the process to ingest data. Workflows are defined using AWS Step Functions. For more details, we suggest visiting here.

    How do I set up a Cumulus workflow?

    Answer: You will need to create a provider, have an associated collection (add a new one), and generate a new rule first. Then you can set up a Cumulus workflow by following these steps here.

    What are the common use cases that a Cumulus integrator encounters?

    Answer: The following are some examples of possible use cases you may see:


    Operators

    What is a Cumulus operator?

    Answer: Those who ingest, archive, and troubleshoot datasets (called collections in Cumulus). Your daily activities might include, but are not limited to, the following:

    • Ingesting datasets
    • Maintaining historical data ingest
    • Starting and stopping data handlers
    • Managing collections
    • Managing provider definitions
    • Creating, enabling, and disabling rules
    • Investigating errors for granules and deleting or re-ingesting granules
    • Investigating errors in executions and isolating failed workflow step(s)
    What are the common use cases that a Cumulus operator encounters?

    Answer: The following are some examples of possible use cases you may see:

    Can you re-run a workflow execution in AWS?

    Answer: Yes. For steps on how to re-run a workflow execution go to Re-running workflow executions in the Cumulus Operator Docs.

    - + \ No newline at end of file diff --git a/docs/v12.0.0/features/ancillary_metadata/index.html b/docs/v12.0.0/features/ancillary_metadata/index.html index 2b9bcd2ecb7..d4aba022d20 100644 --- a/docs/v12.0.0/features/ancillary_metadata/index.html +++ b/docs/v12.0.0/features/ancillary_metadata/index.html @@ -5,7 +5,7 @@ Ancillary Metadata Export | Cumulus Documentation - + @@ -13,7 +13,7 @@
    Version: v12.0.0

    Ancillary Metadata Export

    This feature utilizes the type key on a files object in a Cumulus granule. It uses the key to provide a mechanism where granule discovery, processing and other tasks can set and use this value to facilitate metadata export to CMR.

    Tasks setting type

    Discover Granules

    Uses the Collection type key to set the value for files on discovered granules in its output.

    Parse PDR

    Uses a task-specific mapping to map PDR 'FILE_TYPE' to a CNM type to set type on granules from the PDR.

    CNMToCMALambdaFunction

    Natively supports types that are included in incoming messages to a CNM Workflow.

    Tasks using type

    Move Granules

    Uses the granule file type key to update UMM/ECHO 10 CMR files passed in as candidates to the task. This task adds the external facing URLs to the CMR metadata file based on the type. See the file tracking data cookbook for a detailed mapping. If a non-CNM type is specified, the task assumes it is a 'data' file.

    - + \ No newline at end of file diff --git a/docs/v12.0.0/features/backup_and_restore/index.html b/docs/v12.0.0/features/backup_and_restore/index.html index d5e9ce2e82e..3fa888faee8 100644 --- a/docs/v12.0.0/features/backup_and_restore/index.html +++ b/docs/v12.0.0/features/backup_and_restore/index.html @@ -5,7 +5,7 @@ Cumulus Backup and Restore | Cumulus Documentation - + @@ -52,7 +52,7 @@ writing to the old cluster.

  • Set the snapshot_identifier variable to the snapshot you wish to create, and configure the module like a new deployment, with a unique cluster_identifier

  • Deploy the module using terraform apply

  • Once deployed, verify the cluster has the expected data

  • Redeploy the data persistence and Cumulus deployments - You should not need to reconfigure either, as the secret ARN and the security group should not change, however double-check the configured values are as expected

  • - + \ No newline at end of file diff --git a/docs/v12.0.0/features/dead_letter_archive/index.html b/docs/v12.0.0/features/dead_letter_archive/index.html index acaaff4442b..d1445b20f3b 100644 --- a/docs/v12.0.0/features/dead_letter_archive/index.html +++ b/docs/v12.0.0/features/dead_letter_archive/index.html @@ -5,13 +5,13 @@ Cumulus Dead Letter Archive | Cumulus Documentation - +
    Version: v12.0.0

    Cumulus Dead Letter Archive

    This documentation explains the Cumulus dead letter archive and associated functionality.

    DB Records DLQ Archive

    The Cumulus system contains a number of dead letter queues. Perhaps the most important system lambda function supported by a DLQ is the sfEventSqsToDbRecords lambda function which parses Cumulus messages from workflow executions to generate and write database records to the Cumulus database.

    As of Cumulus v9+, the dead letter queue for this lambda (named sfEventSqsToDbRecordsDeadLetterQueue) has been updated with a consumer lambda that will automatically write any incoming records to the S3 system bucket, under the path <stackName>/dead-letter-archive/sqs/. This will allow integrators and operators engaged in debugging missing records to inspect any Cumulus messages which failed to process and did not result in the successful creation of database records.
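
    A hypothetical AWS CLI sketch for listing archived messages, assuming a system bucket named my-system-bucket and a stack named my-stack (both placeholder values):

    aws s3 ls s3://my-system-bucket/my-stack/dead-letter-archive/sqs/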

    Dead Letter Archive recovery

    In addition to the above, as of Cumulus v9+, the Cumulus API also contains a new endpoint at /deadLetterArchive/recoverCumulusMessages.

    Sending a POST request to this endpoint will trigger a Cumulus AsyncOperation that will attempt to reprocess (and if successful delete) all Cumulus messages in the dead letter archive, using the same underlying logic as the existing sfEventSqsToDbRecords.
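
    A hedged curl sketch of that POST request, assuming CUMULUS_API_URL and TOKEN are hypothetical placeholders for your Cumulus API root URL and a valid access token:

    curl -X POST "$CUMULUS_API_URL/deadLetterArchive/recoverCumulusMessages" \
      -H "Authorization: Bearer $TOKEN"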

    This endpoint may prove particularly useful when recovering from an extended or unexpected database outage, where messages failed to process due to an external outage and there is no essential malformation of each Cumulus message.

    - + \ No newline at end of file diff --git a/docs/v12.0.0/features/dead_letter_queues/index.html b/docs/v12.0.0/features/dead_letter_queues/index.html index 01595be5481..8372fcd355a 100644 --- a/docs/v12.0.0/features/dead_letter_queues/index.html +++ b/docs/v12.0.0/features/dead_letter_queues/index.html @@ -5,13 +5,13 @@ Dead Letter Queues | Cumulus Documentation - +
    Version: v12.0.0

    Dead Letter Queues

    startSF SQS queue

    The workflow-trigger for the startSF queue has a Redrive Policy set up that directs any failed attempts to pull from the workflow start queue to a SQS queue Dead Letter Queue.

    This queue can then be monitored for failures to initiate a workflow. Please note that workflow failures will not show up in this queue, only repeated failure to trigger a workflow.

    Named Lambda Dead Letter Queues

    Cumulus provides configured Dead Letter Queues (DLQ) for non-workflow Lambdas (such as ScheduleSF) to capture Lambda failures for further processing.

    These DLQs are set up with the following configuration:

      receive_wait_time_seconds  = 20
    message_retention_seconds = 1209600
    visibility_timeout_seconds = 60

    Default Lambda Configuration

    The following built-in Cumulus Lambdas are set up with DLQs to allow handling of process failures:

    • dbIndexer (Updates Elasticsearch)
    • JobsLambda (writes logs outputs to Elasticsearch)
    • ScheduleSF (the SF Scheduler Lambda that places messages on the queue that is used to start workflows, see Workflow Triggers)
    • publishReports (Lambda that publishes messages to the SNS topics for execution, granule and PDR reporting)
    • reportGranules, reportExecutions, reportPdrs (Lambdas responsible for updating records based on messages in the queues published by publishReports)

    Troubleshooting/Utilizing messages in a Dead Letter Queue

    Ideally, an automated process should be configured to poll and process messages off a dead letter queue.

    For aid in manual troubleshooting, you can utilize the SQS Management console to view the messages available in the queues set up for a particular stack. The dead letter queues will have a Message Body containing the Lambda payload, as well as Message Attributes that reference both the error returned and a RequestID, which can be cross-referenced to the associated Lambda's CloudWatch logs for more information:
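
    As an alternative to the console, a hypothetical AWS CLI sketch for pulling a few messages (including their attributes) off a dead letter queue, assuming a placeholder queue URL:

    aws sqs receive-message \
      --queue-url https://sqs.us-east-1.amazonaws.com/123456789012/my-stack-ScheduleSFDeadLetterQueue \
      --max-number-of-messages 10 \
      --message-attribute-names All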

    Screenshot of the AWS SQS console showing how to view SQS message attributes

    - + \ No newline at end of file diff --git a/docs/v12.0.0/features/distribution-metrics/index.html b/docs/v12.0.0/features/distribution-metrics/index.html index ac8fd2b9bbd..5c4c8d456a6 100644 --- a/docs/v12.0.0/features/distribution-metrics/index.html +++ b/docs/v12.0.0/features/distribution-metrics/index.html @@ -5,13 +5,13 @@ Cumulus Distribution Metrics | Cumulus Documentation - +
    Version: v12.0.0

    Cumulus Distribution Metrics

    It is possible to configure Cumulus and the Cumulus Dashboard to display information about the successes and failures of requests for data. This requires the Cumulus instance to deliver Cloudwatch Logs and S3 Server Access logs to an ELK stack.

    ESDIS Metrics in NGAP

    Work with the ESDIS metrics team to set up permissions and access to forward Cloudwatch Logs to a shared AWS:Logs:Destination, as well as to transfer your S3 Server Access logs to a metrics team bucket.

    The metrics team has taken care of setting up logstash to ingest the files that get delivered to their bucket into their Elasticsearch instance.

    Once Cumulus has been configured to deliver Cloudwatch logs to the ESDIS Metrics team, you can use the Elasticsearch indexes to create the necessary target patterns on the dashboard. These are often <daac>-cloudwatch-cumulus-<env>-* and <daac>-distribution-<env>-*, but they will depend on your specific Elasticsearch setup.

    Cumulus / ESDIS Metrics distribution system

    Architecture diagram showing how logs are replicated from a Cumulus instance to the ESDIS Metrics account and accessed by the Cumulus dashboard

    - + \ No newline at end of file diff --git a/docs/v12.0.0/features/execution_payload_retention/index.html b/docs/v12.0.0/features/execution_payload_retention/index.html index 4bf7eea9e19..f56964c8119 100644 --- a/docs/v12.0.0/features/execution_payload_retention/index.html +++ b/docs/v12.0.0/features/execution_payload_retention/index.html @@ -5,13 +5,13 @@ Execution Payload Retention | Cumulus Documentation - +
    Version: v12.0.0

    Execution Payload Retention

    In addition to CloudWatch logs and AWS Step Function API records, Cumulus automatically stores the initial and 'final' (the last update to the execution record) payload values as part of the Execution record in your RDS database and Elasticsearch.

    This allows access via the API (or optionally direct DB/Elasticsearch querying) for debugging/reporting purposes. The data is stored in the "originalPayload" and "finalPayload" fields.

    Payload record cleanup

    To reduce storage requirements, a CloudWatch rule ({stack-name}-dailyExecutionPayloadCleanupRule) triggering a daily run of the provided cleanExecutions lambda has been added. This lambda will remove all 'completed' and 'non-completed' payload records in the database that are older than the specified configuration.
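
    A hypothetical AWS CLI sketch for inspecting that rule and its schedule expression, assuming a stack named my-stack (a placeholder value):

    aws events describe-rule --name my-stack-dailyExecutionPayloadCleanupRule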

    Configuration

    The following configuration flags have been made available in the cumulus module. They may be overridden in your deployment's instance of the cumulus module by adding the following configuration options:

    daily_execution_payload_cleanup_schedule_expression (string)

    This configuration option sets the execution times for this Lambda to run, using a Cloudwatch cron expression.

    Default value is "cron(0 4 * * ? *)".

    complete_execution_payload_timeout_disable (bool)

    This configuration option, when set to true, will disable all cleanup of completed execution payloads.

    Default value is false.

    complete_execution_payload_timeout (number)

    This flag defines the cleanup threshold for executions with a 'completed' status in days. Records with updatedAt values older than this with payload information will have that information removed.

    Default value is 10.

    non_complete_execution_payload_timeout_disable (bool)

    This configuration option, when set to true, will disable all cleanup of "non-complete" (any status other than completed) execution payloads.

    Default value is false.

    non_complete_execution_payload_timeout (number)

    This flag defines the cleanup threshold for executions with a status other than 'complete' in days. Records with updateTime values older than this with payload information will have that information removed.

    Default value is 30 days.

    • complete_execution_payload_disable/non_complete_execution_payload_disable

    These flags (true/false) determine if the cleanup script's logic for 'complete' and 'non-complete' executions will run. Default value is false for both.

    - + \ No newline at end of file diff --git a/docs/v12.0.0/features/logging-esdis-metrics/index.html b/docs/v12.0.0/features/logging-esdis-metrics/index.html index de76c2094f5..394db11a6b0 100644 --- a/docs/v12.0.0/features/logging-esdis-metrics/index.html +++ b/docs/v12.0.0/features/logging-esdis-metrics/index.html @@ -5,13 +5,13 @@ Writing logs for ESDIS Metrics | Cumulus Documentation - +
    Version: v12.0.0

    Writing logs for ESDIS Metrics

    Note: This feature is only available for Cumulus deployments in NGAP environments.

    Prerequisite: You must configure your Cumulus deployment to deliver your logs to the correct shared logs destination for ESDIS metrics.

    Log messages delivered to the ESDIS metrics logs destination conforming to an expected format will be automatically ingested and parsed to enable helpful searching/filtering of your logs via the ESDIS metrics Kibana dashboard.

    Expected log format

    The ESDIS metrics pipeline expects a log message to be a JSON string representation of an object (dict in Python or map in Java). An example log message might look like:

    {
    "level": "info",
    "executions": "arn:aws:states:us-east-1:000000000000:execution:MySfn:abcd1234",
    "granules": "[\"granule-1\",\"granule-2\"]",
    "message": "hello world",
    "sender": "greetingFunction",
    "stackName": "myCumulus",
    "timestamp": "2018-10-19T19:12:47.501Z"
    }

    A log message can contain the following properties:

    • executions: The AWS Step Function execution name in which this task is executing, if any
    • granules: A JSON string of the array of granule IDs being processed by this code, if any
    • level: A string identifier for the type of message being logged. Possible values:
      • debug
      • error
      • fatal
      • info
      • warn
      • trace
    • message: String containing your actual log message
    • parentArn: The parent AWS Step Function execution ARN that triggered the current execution, if any
    • sender: The name of the resource generating the log message (e.g. a library name, a Lambda function name, an ECS activity name)
    • stackName: The unique prefix for your Cumulus deployment
    • timestamp: An ISO-8601 formatted timestamp
    • version: The version of the resource generating the log message, if any

    None of these properties are explicitly required for ESDIS metrics to parse your log correctly. However, a log without a message has no informational content. And having level, sender, and timestamp properties is very useful for filtering your logs. Including a stackName in your logs is helpful as it allows you to distinguish between logs generated by different deployments.

    Using Cumulus Message Adapter libraries

    If you are writing a custom task that is integrated with the Cumulus Message Adapter, then some of the language-specific client libraries can be used to write logs compatible with ESDIS metrics.

    The usage of each library differs slightly, but in general a logger is initialized with a Cumulus workflow message to determine the contextual information for the task (e.g. granules, executions). Then, after the logger is initialized, writing logs only requires specifying a message, but the logged output will include the contextual information as well.

    Writing logs using custom code

    Any code that produces logs matching the expected log format can be processed by ESDIS metrics.

    Node.js

    Cumulus core provides a @cumulus/logger library that writes logs in the expected format for ESDIS metrics.

    - + \ No newline at end of file diff --git a/docs/v12.0.0/features/replay-archived-sqs-messages/index.html b/docs/v12.0.0/features/replay-archived-sqs-messages/index.html index e6a48e16ad7..ebc85cc0c8f 100644 --- a/docs/v12.0.0/features/replay-archived-sqs-messages/index.html +++ b/docs/v12.0.0/features/replay-archived-sqs-messages/index.html @@ -5,14 +5,14 @@ How to replay SQS messages archived in S3 | Cumulus Documentation - +
    Version: v12.0.0

    How to replay SQS messages archived in S3

    Context

    Cumulus archives all incoming SQS messages to S3 and removes messages once they have been processed. Unprocessed messages are archived at the path: ${stackName}/archived-incoming-messages/${queueName}/${messageId}

    Replay SQS messages endpoint

    The Cumulus API has added a new endpoint, /replays/sqs. This endpoint will allow you to start a replay operation to requeue all archived SQS messages by queueName and returns an AsyncOperationId for operation status tracking.

    Start replaying archived SQS messages

    In order to start a replay, you must perform a POST request to the replays/sqs endpoint.

    The required and optional fields that should be part of the body of this request are documented below.

    Field | Type | Description
    queueName | string | Any valid SQS queue name (not ARN)
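
    A hedged curl sketch of such a request, assuming CUMULUS_API_URL and TOKEN are hypothetical placeholders for your Cumulus API root URL and a valid access token, and my-queue-name is a placeholder queue name:

    curl -X POST "$CUMULUS_API_URL/replays/sqs" \
      -H "Authorization: Bearer $TOKEN" \
      -H "Content-Type: application/json" \
      -d '{"queueName": "my-queue-name"}'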

    Status tracking

    A successful response from the /replays/sqs endpoint will contain an asyncOperationId field. Use this ID with the /asyncOperations endpoint to track the status.

    - + \ No newline at end of file diff --git a/docs/v12.0.0/features/replay-kinesis-messages/index.html b/docs/v12.0.0/features/replay-kinesis-messages/index.html index fe6bb08078f..04b5edcda32 100644 --- a/docs/v12.0.0/features/replay-kinesis-messages/index.html +++ b/docs/v12.0.0/features/replay-kinesis-messages/index.html @@ -5,7 +5,7 @@ How to replay Kinesis messages after an outage | Cumulus Documentation - + @@ -13,7 +13,7 @@
    Version: v12.0.0

    How to replay Kinesis messages after an outage

    After a period of outage, it may be necessary for a Cumulus operator to reprocess or 'replay' messages that arrived on an AWS Kinesis Data Stream but did not trigger an ingest. This document serves as an outline on how to start a replay operation, and how to perform status tracking. Cumulus supports replay of all Kinesis messages on a stream (subject to the normal RetentionPeriod constraints), or all messages within a given time slice delimited by start and end timestamps.

    As Kinesis has no comparable field to e.g. the SQS ReceiveCount on its records, Cumulus cannot tell which messages within a given time slice have never been processed, and cannot guarantee only missed messages will be processed. Users will have to rely on duplicate handling or some other method of identifying messages that should not be processed within the time slice.

    NOTE: This operation flow effectively changes only the trigger mechanism for Kinesis ingest notifications. The existence of valid Kinesis-type rules and all other normal requirements for the triggering of ingest via Kinesis still apply.

    Replays endpoint

    Cumulus has added a new endpoint to its API, /replays. This endpoint will allow you to start replay operations and returns an AsyncOperationId for operation status tracking.

    Start a replay

    In order to start a replay, you must perform a POST request to the replays endpoint.

    The required and optional fields that should be part of the body of this request are documented below.

    NOTE: As the endTimestamp relies on a comparison with the Kinesis server-side ApproximateArrivalTimestamp, and given that there is no documented level of accuracy for the approximation, it is recommended that the endTimestamp include some amount of buffer to allow for slight discrepancies. If tolerable, the same is recommended for the startTimestamp, although it is used differently and is less vulnerable to discrepancies, since a server-side arrival timestamp should never be earlier than the client-side request timestamp.

    Field | Type | Required | Description
    type | string | required | Currently only accepts kinesis.
    kinesisStream | string | for type kinesis | Any valid kinesis stream name (not ARN)
    kinesisStreamCreationTimestamp | * | optional | Any input valid for a JS Date constructor. For reasons to use this field see AWS documentation on StreamCreationTimestamp.
    endTimestamp | * | optional | Any input valid for a JS Date constructor. Messages newer than this timestamp will be skipped.
    startTimestamp | * | optional | Any input valid for a JS Date constructor. Messages will be fetched from the Kinesis stream starting at this timestamp. Ignored if it is further in the past than the stream's retention period.
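
    A hedged curl sketch of a time-sliced Kinesis replay request, assuming CUMULUS_API_URL, TOKEN, and my-kinesis-stream are hypothetical placeholders and the timestamps are example ISO-8601 values:

    curl -X POST "$CUMULUS_API_URL/replays" \
      -H "Authorization: Bearer $TOKEN" \
      -H "Content-Type: application/json" \
      -d '{"type": "kinesis", "kinesisStream": "my-kinesis-stream", "startTimestamp": "2018-10-01T00:00:00Z", "endTimestamp": "2018-10-19T23:59:59Z"}'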

    Status tracking

    A successful response from the /replays endpoint will contain an asyncOperationId field. Use this ID with the /asyncOperations endpoint to track the status.

    - + \ No newline at end of file diff --git a/docs/v12.0.0/features/reports/index.html b/docs/v12.0.0/features/reports/index.html index d9430509a3f..8e5d85d83db 100644 --- a/docs/v12.0.0/features/reports/index.html +++ b/docs/v12.0.0/features/reports/index.html @@ -5,7 +5,7 @@ Reconciliation Reports | Cumulus Documentation - + @@ -19,7 +19,7 @@ report generation. The data buckets will include any buckets in your Cumulus buckets configuration that have type public, protected or private.
    - + \ No newline at end of file diff --git a/docs/v12.0.0/getting-started/index.html b/docs/v12.0.0/getting-started/index.html index 4a963da55c4..077e98231a3 100644 --- a/docs/v12.0.0/getting-started/index.html +++ b/docs/v12.0.0/getting-started/index.html @@ -5,13 +5,13 @@ Getting Started | Cumulus Documentation - +
    Version: v12.0.0

    Getting Started

    Overview | Quick Tutorials | Helpful Tips

    Overview

    This serves as a guide for new Cumulus users to deploy and learn how to use Cumulus. Here you will learn what you need in order to complete any prerequisites, what Cumulus is and how it works, and how to successfully navigate and deploy a Cumulus environment.

    What is Cumulus

    Cumulus is an open source set of components for creating cloud-based data ingest, archive, distribution and management designed for NASA's future Earth Science data streams.

    Who uses Cumulus

    Data integrators/developers and operators across projects not limited to NASA use Cumulus for their daily work functions.

    Cumulus Roles

    Integrator/Developer

    Cumulus integrators/developers are those who work within Cumulus and AWS for deployments and to manage workflows.

    Operator

    Cumulus operators are those who work within Cumulus to ingest/archive data and manage collections.

    Role Guides

    As a developer, integrator, or operator, you will need to set up your environments to work in Cumulus. The following docs can get you started in your role specific activities.

    What is a Cumulus Data Type

    In Cumulus, we have the following types of data that you can create and manage:

    • Collections
    • Granules
    • Providers
    • Rules
    • Workflows
    • Executions
    • Reports

    For details on how to create or manage data types go to Data Management Types.


    Quick Tutorials

    Deployment & Configuration

    Cumulus is deployed to an AWS account, so you must have access to deploy resources to an AWS account to get started.

    1. Deploy Cumulus and Cumulus Dashboard to AWS

    Follow the deployment instructions to deploy Cumulus to your AWS account.

    2. Configure and Run the HelloWorld Workflow

    If you have deployed using the cumulus-template-deploy repository, you have a HelloWorld workflow deployed to your Cumulus backend.

    You can see your deployed workflows on the Workflows page of your Cumulus dashboard.

    Configure a collection and provider using the setup guidance on the Cumulus dashboard.

    Then create a rule to trigger your HelloWorld workflow. You can select a rule type of one time.

    Navigate to the Executions page of the dashboard to check the status of your workflow execution.

    3. Configure a Custom Workflow

    See Developing a custom workflow documentation for adding a new workflow to your deployment.

    There are plenty of workflow examples using Cumulus tasks here. The Data Cookbooks provide a more in-depth look at some of these more advanced workflows and their configurations.

    There is a list of Cumulus tasks already included in your deployment here.

    After configuring your workflow and redeploying, you can configure and run your workflow using the same steps as in step 2.


    Helpful Tips

    Here are some useful tips to keep in mind when deploying or working in Cumulus.

    Integrator/Developer

    • Versioning and Releases: This documentation gives information on our global versioning approach. We suggest upgrading to the supported version for Cumulus, Cumulus dashboard, and Thin Egress App (TEA).
    • Cumulus Developer Documentation: We suggest that you read through and reference this resource for development best practices in Cumulus.
    • Cumulus Deployment: We will guide you on how to manually deploy a new instance of Cumulus. In this reference, you will learn how to install Terraform, create an AWS S3 bucket, configure a compatible database, and create a Lambda layer.
    • Terraform Best Practices: This will help guide you through your Terraform configuration and Cumulus deployment. For an introduction about Terraform go here.
    • Integrator Common Use Cases: Scenarios to help integrators along in the Cumulus environment.

    Operator

    Troubleshooting

    Troubleshooting: Some suggestions to help you troubleshoot and solve issues you may encounter.

    Resources

    - + \ No newline at end of file diff --git a/docs/v12.0.0/glossary/index.html b/docs/v12.0.0/glossary/index.html index 473517c51b1..7aa31eb0605 100644 --- a/docs/v12.0.0/glossary/index.html +++ b/docs/v12.0.0/glossary/index.html @@ -5,13 +5,13 @@ Glossary | Cumulus Documentation - +
    Version: v12.0.0

    Glossary

    AWS Glossary

    For terms/items from Amazon/AWS not mentioned in this glossary, please refer to the AWS Glossary.

    Cumulus Glossary of Terms

    API Gateway

    Refers to AWS's API Gateway. Used by the Cumulus API.

    ARN

    Refers to an AWS "Amazon Resource Name".

    For more info, see the AWS documentation.

    AWS

    See: aws.amazon.com

    AWS Lambda/Lambda Function

    AWS's 'serverless' option. Allows the running of code without provisioning a service or managing server/ECS instances/etc.

    For more information, see the AWS Lambda documentation.

    AWS Access Keys

    Access credentials that give you access to AWS to act as an IAM user programmatically or from the command line.

    For more information, see the AWS IAM Documentation.

    Bucket

    An Amazon S3 cloud storage resource.

    For more information, see the AWS Bucket Documentation.

    CloudFormation

    An AWS service that allows you to define and manage cloud resources as a preconfigured block.

    For more information, see the AWS CloudFormation User Guide.

    Cloudformation Template

    A template that defines an AWS Cloud Formation.

    For more information, see the AWS intro page.

    Cloudwatch

    AWS service that allows logging and metrics collections on various cloud resources you have in AWS.

    For more information, see the AWS User Guide.

    Cloud Notification Mechanism (CNM)

    An interface mechanism to support cloud-based ingest messaging. For more information, see PO.DAAC's CNM Schema.

    Common Metadata Repository (CMR)

    "A high-performance, high-quality, continuously evolving metadata system that catalogs Earth Science data and associated service metadata records". For more information, see NASA's CMR page.

    Collection (Cumulus)

    Cumulus Collections are logical sets of data objects of the same data type and version.

    For more information, see cookbook reference page.

    Cumulus Message Adapter (CMA)

    A library designed to help task developers integrate step function tasks into a Cumulus workflow by adapting task input/output into the Cumulus Message format.

    For more information, see CMA workflow reference page.

    Distributed Active Archive Center (DAAC)

    Refers to a specific organization that's part of NASA's distributed system of archive centers. For more information see EOSDIS's DAAC page

    Dead Letter Queue (DLQ)

    This refers to Amazon SQS Dead-Letter Queues - these SQS queues are specifically configured to capture failed messages from other services/SQS queues/etc to allow for processing of failed messages.

    For more on DLQs, see the Amazon Documentation and the Cumulus DLQ feature page.

    Developer

    Those who set up deployment and workflow management for Cumulus. Sometimes referred to as an integrator. See integrator.

    ECS

    Amazon's Elastic Container Service. Used in Cumulus by workflow steps that require more flexibility than Lambda can provide.

    For more information, see AWS's developer guide.

    ECS Activity

    An ECS instance run via a Step Function.

    Execution (Cumulus)

    A Cumulus execution refers to a single execution of a (Cumulus) Workflow.

    GIBS

    Global Imagery Browse Services

    Granule

    A granule is the smallest aggregation of data that can be independently managed (described, inventoried, and retrieved). Granules are always associated with a collection, which is a grouping of granules. A granule is a grouping of data files.

    IAM

    AWS Identity and Access Management.

    For more information, see AWS IAMs.

    Integrator/Developer

    Those who work within Cumulus and AWS for deployments and to manage workflows.

    Kinesis

    Amazon's platform for streaming data on AWS.

    See AWS Kinesis for more information.

    Lambda

    AWS's cloud service that lets you run code without provisioning or managing servers.

    For more information, see AWS's lambda page.

    Module (Terraform)

    Refers to a terraform module.

    Node

    See node.js.

    Npm

    Node package manager.

    For more information, see npmjs.com.

    Operator

    Those who work within Cumulus to ingest/archive data and manage collections.

    PDR

    "Polling Delivery Mechanism" used in "DAAC Ingest" workflows.

    For more information, see nasa.gov.

    Packages (NPM)

    NPM hosted node.js packages. Cumulus packages can be found on NPM's site here

    Provider

    Data source that generates and/or distributes data for Cumulus workflows to act upon.

    For more information, see the Cumulus documentation.

    Rule

    Rules are configurable scheduled events that trigger workflows based on various criteria.

    For more information, see the Cumulus Rules documentation.

    S3

    Amazon's Simple Storage Service provides data object storage in the cloud. Used in Cumulus to store configuration, data and more.

    For more information, see AWS's s3 page.

    SIPS

    Science Investigator-led Processing Systems. In the context of DAAC ingest, this refers to data producers/providers.

    For more information, see nasa.gov.

    SNS

    Amazon's Simple Notification Service provides a messaging service that allows publication of and subscription to events. Used in Cumulus to trigger workflow events, track event failures, and others.

    For more information, see AWS's SNS page.

    SQS

    Amazon's Simple Queue Service.

    For more information, see AWS's SQS page.

    Stack

    A collection of AWS resources you can manage as a single unit.

    In the context of Cumulus, this refers to a deployment of the cumulus and data-persistence modules that is managed by Terraform.

    Step Function

    AWS's web service that allows you to compose complex workflows as a state machine comprised of tasks (Lambdas, activities hosted on EC2/ECS, some AWS service APIs, etc). See AWS's Step Function Documentation for more information. In the context of Cumulus these are the underlying AWS service used to create Workflows.

    Terraform

    Terraform is the tool that you will use for deployment and configuration of your Cumulus environment.

    Workflows

    Workflows are comprised of one or more AWS Lambda Functions and ECS Activities to discover, ingest, process, manage and archive data.

    - + \ No newline at end of file diff --git a/docs/v12.0.0/index.html b/docs/v12.0.0/index.html index efeaf0de9dc..2ff44f8d597 100644 --- a/docs/v12.0.0/index.html +++ b/docs/v12.0.0/index.html @@ -5,13 +5,13 @@ Introduction | Cumulus Documentation - +
    Version: v12.0.0

    Introduction

    This Cumulus project seeks to address the existing need for a “native” cloud-based data ingest, archive, distribution, and management system that can be used for all future Earth Observing System Data and Information System (EOSDIS) data streams via the development and implementation of Cumulus. The term “native” implies that the system will leverage all components of a cloud infrastructure provided by the vendor for efficiency (in terms of both processing time and cost). Additionally, Cumulus will operate on future data streams involving satellite missions, aircraft missions, and field campaigns.

    This documentation includes guidelines, examples, and source code docs. It is accessible at https://nasa.github.io/cumulus.


    Get To Know Cumulus

    • Getting Started - here - If you are new to Cumulus we suggest that you begin with this section to help you understand and work in the environment.
    • General Cumulus Documentation - here <- you're here

    Cumulus Reference Docs

    • Cumulus API Documentation - here
    • Cumulus Developer Documentation - here - READMEs throughout the main repository.
    • Data Cookbooks - here

    Auxiliary Guides

    • Integrator Guide - here
    • Operator Docs - here

    Contributing

    Please refer to: https://github.com/nasa/cumulus/blob/master/CONTRIBUTING.md for information. We thank you in advance.

    - + \ No newline at end of file diff --git a/docs/v12.0.0/integrator-guide/about-int-guide/index.html b/docs/v12.0.0/integrator-guide/about-int-guide/index.html index 08d436ebd28..7ab2a9c614e 100644 --- a/docs/v12.0.0/integrator-guide/about-int-guide/index.html +++ b/docs/v12.0.0/integrator-guide/about-int-guide/index.html @@ -5,13 +5,13 @@ About Integrator Guide | Cumulus Documentation - +
    Version: v12.0.0

    About Integrator Guide

    Purpose

    The Integrator Guide is intended to supplement the Cumulus documentation and Data Cookbooks. This content is for Cumulus integrators who are either new to the project or need a step-by-step resource to help them along.

    What Is A Cumulus Integrator

    Cumulus integrators are those who work within Cumulus and AWS for deployments and to manage workflows. They may perform the following functions:

    • Configure and deploy Cumulus to the AWS environment
    • Configure Cumulus workflows
    • Write custom workflow tasks
    - + \ No newline at end of file diff --git a/docs/v12.0.0/integrator-guide/int-common-use-cases/index.html b/docs/v12.0.0/integrator-guide/int-common-use-cases/index.html index 777fe6c4d7e..d07c1e34793 100644 --- a/docs/v12.0.0/integrator-guide/int-common-use-cases/index.html +++ b/docs/v12.0.0/integrator-guide/int-common-use-cases/index.html @@ -5,13 +5,13 @@ Integrator Common Use Cases | Cumulus Documentation - + - + \ No newline at end of file diff --git a/docs/v12.0.0/integrator-guide/workflow-add-new-lambda/index.html b/docs/v12.0.0/integrator-guide/workflow-add-new-lambda/index.html index e3907bcd31d..0020dad8583 100644 --- a/docs/v12.0.0/integrator-guide/workflow-add-new-lambda/index.html +++ b/docs/v12.0.0/integrator-guide/workflow-add-new-lambda/index.html @@ -5,13 +5,13 @@ Workflow - Add New Lambda | Cumulus Documentation - +
    Version: v12.0.0

    Workflow - Add New Lambda

    You can develop a workflow task in AWS Lambda or Elastic Container Service (ECS). AWS ECS requires Docker. For a list of tasks to use go to our Cumulus Tasks page.

    The following steps are to help you along as you write a new Lambda that integrates with a Cumulus workflow. This will aid your understanding of the Cumulus Message Adapter (CMA) process.

    Steps

    1. Define New Lambda in Terraform

    2. Add Task in JSON Object

      For details on how to set up a workflow via CMA go to the CMA Tasks: Message Flow.

      You will need to assign input and output for the new task and follow the CMA contract here. This contract defines how libraries should call the cumulus-message-adapter to integrate a task into an existing Cumulus Workflow.

    3. Verify New Task

      Check the updated workflow in AWS and in Cumulus.

    - + \ No newline at end of file diff --git a/docs/v12.0.0/integrator-guide/workflow-ts-failed-step/index.html b/docs/v12.0.0/integrator-guide/workflow-ts-failed-step/index.html index 84ee5b265f5..58c2d495249 100644 --- a/docs/v12.0.0/integrator-guide/workflow-ts-failed-step/index.html +++ b/docs/v12.0.0/integrator-guide/workflow-ts-failed-step/index.html @@ -5,13 +5,13 @@ Workflow - Troubleshoot Failed Step(s) | Cumulus Documentation - +
    Version: v12.0.0

    Workflow - Troubleshoot Failed Step(s)

    Steps

    1. Locate Step
    • Go to Cumulus dashboard
    • Find the granule
    • Go to Executions to determine the failed step
    2. Investigate in Cloudwatch
    • Go to Cloudwatch
    • Locate lambda
    • Search Cloudwatch logs
    3. Recreate Error

      In your sandbox environment, try to recreate the error.

    4. Resolution

    - + \ No newline at end of file diff --git a/docs/v12.0.0/interfaces/index.html b/docs/v12.0.0/interfaces/index.html index fa18c079c4e..176d531ed36 100644 --- a/docs/v12.0.0/interfaces/index.html +++ b/docs/v12.0.0/interfaces/index.html @@ -5,13 +5,13 @@ Interfaces | Cumulus Documentation - +
    Version: v12.0.0

    Interfaces

    Cumulus has multiple interfaces that allow interaction with discrete components of the system, such as starting workflows via SNS/Kinesis/SQS, manually queueing workflow start messages, submitting SNS notifications for completed workflows, and the many operations allowed by the Cumulus API.

    The diagram below illustrates the workflow process in detail and the various interfaces that allow starting of workflows, reporting of workflow information, and database create operations that occur when a workflow reporting message is processed. For interfaces with expected input or output schemas, details are provided below.

    Architecture diagram showing the interfaces for triggering and reporting of Cumulus workflow executions

    Workflow triggers and queuing

    Kinesis stream

    As a Kinesis stream is consumed by the messageConsumer Lambda to queue workflow executions, the incoming event is validated against this consumer schema by the ajv package.

    SQS queue for executions

    The messages put into the SQS queue for executions should conform to the Cumulus message format.

    Workflow executions

    See the documentation on Cumulus workflows.

    Workflow reporting

    SNS reporting topics

    For granule and PDR reporting, the topics will only receive data if the Cumulus workflow execution message meets the following criteria:

    • Granules - workflow message contains granule data in payload.granules
    • PDRs - workflow message contains PDR data in payload.pdr

    The messages published to the SNS reporting topics for executions and PDRs and the record property in the messages published to the granules SNS topic should conform to the model schema for each data type.

    Further detail on workflow reporting and how to interact with these interfaces can be found in the workflow notifications data cookbook.

    Cumulus API

    See the Cumulus API documentation.

    - + \ No newline at end of file diff --git a/docs/v12.0.0/operator-docs/about-operator-docs/index.html b/docs/v12.0.0/operator-docs/about-operator-docs/index.html index ba073ec77d0..a81e85681c5 100644 --- a/docs/v12.0.0/operator-docs/about-operator-docs/index.html +++ b/docs/v12.0.0/operator-docs/about-operator-docs/index.html @@ -5,13 +5,13 @@ About Operator Docs | Cumulus Documentation - +
    Version: v12.0.0

    About Operator Docs

    Purpose

    Operator Docs are an augmentation to Cumulus documentation and Data Cookbooks. These documents will walk step-by-step through common Cumulus activities (that aren't necessarily as use-case directed as what you'd see in Data Cookbooks).

    What Is A Cumulus Operator

    Cumulus operators are those who work within Cumulus to ingest/archive data and manage collections. They may perform the following functions via the operator dashboard or API:

    • Configure providers and collections
    • Configure rules and monitor workflow executions
    • Monitor granule ingestion
    • Monitor system metrics
    - + \ No newline at end of file diff --git a/docs/v12.0.0/operator-docs/bulk-operations/index.html b/docs/v12.0.0/operator-docs/bulk-operations/index.html index 51a4c2e2219..27e630cf9a2 100644 --- a/docs/v12.0.0/operator-docs/bulk-operations/index.html +++ b/docs/v12.0.0/operator-docs/bulk-operations/index.html @@ -5,14 +5,14 @@ Bulk Operations | Cumulus Documentation - +
    Version: v12.0.0

    Bulk Operations

    Cumulus implements bulk operations through the use of AsyncOperations, which are long-running processes executed on an AWS ECS cluster.

    Submitting a bulk API request

    Bulk operations are generally submitted via the endpoint for the relevant data type, e.g. granules. For a list of supported API requests, refer to the Cumulus API documentation. Bulk operations are denoted with the keyword 'bulk'.
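
    As an illustration (not an exhaustive reference), a bulk granule request might be submitted with curl roughly as follows. The payload fields (query, index, workflowName) mirror those described on this page; the host, token, index name, workflow name, and the Elasticsearch query itself are placeholder assumptions:

    $ curl --request POST https://example.com/granules/bulk \
    --header 'Authorization: Bearer ReplaceWithTheToken' \
    --header 'Content-Type: application/json' \
    --data '{
    "index": "my-granules-index",
    "workflowName": "MyBulkWorkflow",
    "query": {
    "query": {
    "match": {
    "collectionId": "MOD11A1___006"
    }
    }
    }
    }'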

    Starting bulk operations from the Cumulus dashboard

    Using a Kibana query

    Note: You must have configured your dashboard build with a KIBANAROOT environment variable in order for the Kibana link to render in the bulk granules modal

    1. From the Granules dashboard page, click on the "Run Bulk Granules" button, then select what type of action you would like to perform

      • Note: the rest of the process is the same regardless of what type of bulk action you perform
    2. From the bulk granules modal, click the "Open Kibana" link:

      Screenshot of Cumulus dashboard showing modal window for triggering bulk granule operations

    3. Once you have accessed Kibana, navigate to the "Discover" page. If this is your first time using Kibana, you may see a message like this at the top of the page:

      In order to visualize and explore data in Kibana, you'll need to create an index pattern to retrieve data from Elasticsearch.

      In that case, see the docs for creating an index pattern for Kibana

      Screenshot of Kibana user interface showing the "Discover" page for running queries

    4. Enter a query that returns the granule records that you want to use for bulk operations:

      Screenshot of Kibana user interface showing an example Kibana query and results

    5. Once the Kibana query is returning the results you want, click the "Inspect" link near the top of the page. A slide out tab with request details will appear on the right side of the page:

      Screenshot of Kibana user interface showing details of an example request

    6. In the slide out tab that appears on the right side of the page, click the "Request" link near the top and scroll down until you see the query property:

      Screenshot of Kibana user interface showing the Elasticsearch data request made for a given Kibana query

    7. Highlight and copy the query contents from Kibana. Go back to the Cumulus dashboard and paste them inside the query property in the bulk granules request payload. You should end up with a query property nested inside the existing query property:

      Screenshot of Cumulus dashboard showing modal window for triggering bulk granule operations with query information populated

    8. Add values for the index and workflowName to the bulk granules request payload. The value for index will vary based on your Elasticsearch setup, but it is good to target an index specifically for granule data if possible:

      Screenshot of Cumulus dashboard showing modal window for triggering bulk granule operations with query, index, and workflow information populated

    9. Click the "Run Bulk Operations" button. You should see a confirmation message, including an ID for the async operation that was started to handle your bulk action. You can track the status of this async operation on the Operations dashboard page, which can be visited by clicking the "Go To Operations" button:

      Screenshot of Cumulus dashboard showing confirmation message with async operation ID for bulk granules request

    Creating an index pattern for Kibana

    1. Define the index pattern for the indices that your Kibana queries should use. A wildcard character, *, will match across multiple indices. Once you are satisfied with your index pattern, click the "Next step" button:

      Screenshot of Kibana user interface for defining an index pattern

    2. Choose whether to use a Time Filter for your data, which is not required. Then click the "Create index pattern" button:

      Screenshot of Kibana user interface for configuring the settings of an index pattern

    Status Tracking

    All bulk operations return an AsyncOperationId which can be submitted to the /asyncOperations endpoint.

    The /asyncOperations endpoint allows listing of AsyncOperation records as well as record retrieval for individual records, which will contain the status. The Cumulus API documentation shows sample requests for these actions.
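
    For example, listing operations or retrieving a single record might look like the following; the host, token, and operation ID are placeholders, and the authoritative request format is in the Cumulus API documentation:

    # List AsyncOperation records
    $ curl --request GET https://example.com/asyncOperations \
    --header 'Authorization: Bearer ReplaceWithTheToken'

    # Retrieve a single AsyncOperation record (including its status)
    $ curl --request GET https://example.com/asyncOperations/<async-operation-id> \
    --header 'Authorization: Bearer ReplaceWithTheToken'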

    The Cumulus Dashboard also includes an Operations monitoring page, where operations and their status are visible:

    Screenshot of Cumulus Dashboard Operations Page showing 5 operations and their status, ID, description, type and creation timestamp

    - + \ No newline at end of file diff --git a/docs/v12.0.0/operator-docs/cmr-operations/index.html b/docs/v12.0.0/operator-docs/cmr-operations/index.html index 730b3528940..00d66467d4f 100644 --- a/docs/v12.0.0/operator-docs/cmr-operations/index.html +++ b/docs/v12.0.0/operator-docs/cmr-operations/index.html @@ -5,7 +5,7 @@ CMR Operations | Cumulus Documentation - + @@ -16,7 +16,7 @@ UpdateCmrAccessConstraints will update CMR metadata file contents on S3, and PostToCmr will push the updates to CMR. The rest of this section will assume you have created this workflow under the name UpdateCmrAccessConstraints.

    Once created and deployed, the workflow is available in the Cumulus dashboard's Execute workflow selector. However, this request requires additional configuration: you must supply an access constraint integer value and an optional description to the UpdateCmrAccessConstraints workflow by clicking the Add Custom Workflow Meta option in the Execute popup, as shown below:

    Screenshot showing granule execute popup with 'updateCmrAccessConstraints' selected and configuration values shown in a collapsible JSON field

    An example invocation of the API to perform this action is:

    $ curl --request PUT https://example.com/granules/MOD11A1.A2017137.h19v16.006.2017138085750 \
    --header 'Authorization: Bearer ReplaceWithTheToken' \
    --header 'Content-Type: application/json' \
    --data '{
    "action": "applyWorkflow",
    "workflow": "updateCmrAccessConstraints",
    "meta": {
    "accessConstraints": {
    "value": 5,
    "description": "sample access constraint"
    }
    }
    }'

    Supported CMR metadata formats for the above operation are Echo10XML and UMMG-JSON, which will populate the RestrictionFlag and RestrictionComment fields in Echo10XML, or the AccessConstraints values in UMMG-JSON.

    Additional Operations

    At this time Cumulus does not, out of the box, support additional operations on CMR metadata. However, given the examples shown above, we recommend working with your integrators to develop additional workflows that perform any required operations.

    Bulk CMR operations

    In order to perform the above operations in bulk, Cumulus supports the use of ApplyWorkflow in an AsyncOperation. These are accessed via the Bulk Operation button on the dashboard, or the /granules/bulk endpoint on the Cumulus API.

    More information on bulk operations is in the bulk operations operator doc.
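
    A hedged sketch of such a bulk request via the /granules/bulk endpoint described above is shown below. It reuses the payload fields from the bulk operations doc (ids, workflowName); the host, token, granule ID, and the assumption that workflow meta can be supplied in this payload should all be verified against the Cumulus API documentation:

    $ curl --request POST https://example.com/granules/bulk \
    --header 'Authorization: Bearer ReplaceWithTheToken' \
    --header 'Content-Type: application/json' \
    --data '{
    "ids": ["MOD11A1.A2017137.h19v16.006.2017138085750"],
    "workflowName": "UpdateCmrAccessConstraints",
    "meta": {
    "accessConstraints": {
    "value": 5,
    "description": "sample access constraint"
    }
    }
    }'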

    - + \ No newline at end of file diff --git a/docs/v12.0.0/operator-docs/create-rule-in-cumulus/index.html b/docs/v12.0.0/operator-docs/create-rule-in-cumulus/index.html index c7f17ea66b0..559643f2176 100644 --- a/docs/v12.0.0/operator-docs/create-rule-in-cumulus/index.html +++ b/docs/v12.0.0/operator-docs/create-rule-in-cumulus/index.html @@ -5,13 +5,13 @@ Create Rule In Cumulus | Cumulus Documentation - +
    Version: v12.0.0

    Create Rule In Cumulus

    Once the above files are in place and the entries created in CMR and Cumulus, we are ready to begin ingesting data. Depending on the type of ingestion (FTP, Kinesis, etc.), the values below will change, but for the most part they are all similar. Rules tell Cumulus how to associate providers and collections, and when/how to start processing a workflow.

    Steps

    1. Go To Rules Page
    • Go to the Cumulus dashboard, click on Rules in the navigation.
    • Click Add Rule.

    Screenshot of Rules page

    2. Complete Form
    • Fill out the template form.

    Screenshot of a Rules template for adding a new rule

    For more details regarding the field definitions and required information, go to Data Cookbooks.

    Note: If the state field is left blank, it defaults to false.

    Examples

    • A rule form with completed required fields:

    Screenshot of a completed rule form

    • A successfully added Rule:

    Screenshot of created rule

    - + \ No newline at end of file diff --git a/docs/v12.0.0/operator-docs/discovery-filtering/index.html b/docs/v12.0.0/operator-docs/discovery-filtering/index.html index ad61a6af062..9ade05a4a42 100644 --- a/docs/v12.0.0/operator-docs/discovery-filtering/index.html +++ b/docs/v12.0.0/operator-docs/discovery-filtering/index.html @@ -5,7 +5,7 @@ Discovery Filtering | Cumulus Documentation - + @@ -24,7 +24,7 @@ directly list the provider_path. If the path contains regular expression components, this may fail.

    It is recommended that operators diagnose any failures by checking error logs and ensuring that permissions on the remote file system allow reading of the default directory and any subdirectories that match the filter.

    Supported protocols

    Currently support for this feature is limited to the following protocols:

    • ftp
    • sftp
    - + \ No newline at end of file diff --git a/docs/v12.0.0/operator-docs/granule-workflows/index.html b/docs/v12.0.0/operator-docs/granule-workflows/index.html index d2237a1d352..067513d49c3 100644 --- a/docs/v12.0.0/operator-docs/granule-workflows/index.html +++ b/docs/v12.0.0/operator-docs/granule-workflows/index.html @@ -5,13 +5,13 @@ Granule Workflows | Cumulus Documentation - +
    Version: v12.0.0

    Granule Workflows

    Failed Granule

    Delete and Ingest

    1. Delete Granule

    Note: Granules published to CMR will need to be removed from CMR via the dashboard prior to deletion

    2. Ingest Granule via Ingest Rule
    • Re-triggering a one-time, Kinesis, SQS, or SNS rule, or a scheduled rule, will re-discover and re-ingest the deleted granule.

    Reingest

    1. Select Failed Granule
    • In the Cumulus dashboard, go to the Collections page.
    • Use search field to find the granule.
    2. Re-ingest Granule
    • Go to the Collections page.
    • Click on Reingest and a modal will pop up for your confirmation.

    Screenshot of the Reingest modal workflow

    Delete and Ingest

    1. Bulk Delete Granules
    • Go to the Granules page.
    • Use the Bulk Delete button to bulk delete selected granules or select via a Kibana query

    Note: You can optionally force deletion from CMR

    2. Ingest Granules via Ingest Rule
    • Re-triggering one-time, Kinesis, SQS, or SNS rules, or scheduled rules, will re-discover and re-ingest the deleted granules.

    Multiple Failed Granules

    1. Select Failed Granules
    • In the Cumulus dashboard, go to the Collections page.
    • Click on Failed Granules.
    • Select multiple granules.

    Screenshot of selected multiple granules

    2. Bulk Re-ingest Granules
    • Click on Reingest and a modal will pop up for your confirmation.

    Screenshot of Bulk Reingest modal workflow

    - + \ No newline at end of file diff --git a/docs/v12.0.0/operator-docs/kinesis-stream-for-ingest/index.html b/docs/v12.0.0/operator-docs/kinesis-stream-for-ingest/index.html index 5dca36a0bc1..e50ec2bdb9f 100644 --- a/docs/v12.0.0/operator-docs/kinesis-stream-for-ingest/index.html +++ b/docs/v12.0.0/operator-docs/kinesis-stream-for-ingest/index.html @@ -5,13 +5,13 @@ Setup Kinesis Stream & CNM Message | Cumulus Documentation - +
    Version: v12.0.0

    Setup Kinesis Stream & CNM Message

    Note: Keep in mind that you should only have to set this up once per ingest stream. Kinesis pricing is based on the shard value and not on the amount of Kinesis usage.

    1. Create a Kinesis Stream

      • In your AWS console, go to the Kinesis service and click Create Data Stream.
      • Assign a name to the stream.
      • Apply a shard value of 1.
      • Click on Create Kinesis Stream.
      • A status page with stream details will display. Once the status is Active, the stream is ready to use. Be sure to record the streamName and StreamARN for later use.

      Screenshot of AWS console page for creating a Kinesis stream

    2. Create a Rule

    3. Send a message

      • Send a message that matches your schema, using Python or the command line (a sketch is shown below).
      • The streamName and Collection must match the kinesisArn+collection defined in the rule that you created in Step 2.
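
      For illustration only, a message could be put on the stream with the AWS CLI roughly as follows. The payload is a loose CNM-style sketch, not the authoritative schema; the stream name, field names, and values are placeholders you must adapt to the schema referenced by your rule (AWS CLI v2 shown; the --cli-binary-format flag treats the --data value as raw text rather than base64).

      # Put a test record on the stream (values are placeholders)
      aws kinesis put-record \
      --stream-name <your-stream-name> \
      --partition-key 1 \
      --cli-binary-format raw-in-base64-out \
      --data '{
      "collection": "MY_COLLECTION",
      "provider": "MY_PROVIDER",
      "identifier": "example-identifier-0001",
      "product": {
      "name": "example-granule",
      "files": [
      {
      "name": "example-granule.hdf",
      "uri": "s3://<bucket>/<path>/example-granule.hdf",
      "type": "data"
      }
      ]
      }
      }'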
    - + \ No newline at end of file diff --git a/docs/v12.0.0/operator-docs/locating-access-logs/index.html b/docs/v12.0.0/operator-docs/locating-access-logs/index.html index 961c7f81f58..e34b70d42aa 100644 --- a/docs/v12.0.0/operator-docs/locating-access-logs/index.html +++ b/docs/v12.0.0/operator-docs/locating-access-logs/index.html @@ -5,13 +5,13 @@ Locating S3 Access Logs | Cumulus Documentation - +
    Version: v12.0.0

    Locating S3 Access Logs

    When enabling S3 Access Logs for EMS Reporting, you configured a TargetBucket and TargetPrefix. The raw S3 access logs can be found inside the TargetBucket under the TargetPrefix.

    In a standard deployment, this will be your stack's <internal bucket name> and a key prefix of <stack>/ems-distribution/s3-server-access-logs/.
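
    For example, the raw logs can be listed with the AWS CLI (the bucket and stack names below are placeholders for your deployment's values):

    aws s3 ls s3://<internal-bucket-name>/<stack>/ems-distribution/s3-server-access-logs/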

    - + \ No newline at end of file diff --git a/docs/v12.0.0/operator-docs/naming-executions/index.html b/docs/v12.0.0/operator-docs/naming-executions/index.html index d382bce4f6b..d85653f60d2 100644 --- a/docs/v12.0.0/operator-docs/naming-executions/index.html +++ b/docs/v12.0.0/operator-docs/naming-executions/index.html @@ -5,7 +5,7 @@ Naming Executions | Cumulus Documentation - + @@ -21,7 +21,7 @@ QueuePdrs step.

    In the following excerpt, the QueueGranules config.executionNamePrefix property is set using the value configured in the workflow's meta.executionNamePrefix.

    Please note: This meta.executionNamePrefix property should not be confused with the optional rule executionNamePrefix property from the previous section. Setting executionNamePrefix as a root property of the rule will set a prefix for the names of any workflows triggered by the rule. Setting meta.executionNamePrefix on the rule will set meta.executionNamePrefix in the workflow messages generated for this rule, allowing workflow steps like QueueGranules to read from the message meta.executionNamePrefix for their config. Then, workflows scheduled by QueueGranules would use the configured execution name prefix.

    Setting executionNamePrefix config for QueueGranules using rule.meta

    If you wanted to use a prefix of "my-prefix", you would create a rule with a meta property similar to the following Rule snippet:

    {
    ...other rule keys here...
    "meta":
    {
    "executionNamePrefix": "my-prefix"
    }
    }

    The value of meta.executionNamePrefix from the rule will be set as meta.executionNamePrefix in the workflow message.

    Then, the workflow could contain a "QueueGranules" step with the following state, which uses meta.executionNamePrefix from the message as the value for the executionNamePrefix config to the "QueueGranules" step:

    {
    "QueueGranules": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "queueUrl": "${start_sf_queue_url}",
    "provider": "{$.meta.provider}",
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "granuleIngestWorkflow": "${ingest_granule_workflow_name}",
    "executionNamePrefix": "{$.meta.executionNamePrefix}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${queue_granules_task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "End": true
    }
    }
    - + \ No newline at end of file diff --git a/docs/v12.0.0/operator-docs/ops-common-use-cases/index.html b/docs/v12.0.0/operator-docs/ops-common-use-cases/index.html index b3e41a49702..9e1752aafbb 100644 --- a/docs/v12.0.0/operator-docs/ops-common-use-cases/index.html +++ b/docs/v12.0.0/operator-docs/ops-common-use-cases/index.html @@ -5,13 +5,13 @@ Operator Common Use Cases | Cumulus Documentation - + - + \ No newline at end of file diff --git a/docs/v12.0.0/operator-docs/trigger-workflow/index.html b/docs/v12.0.0/operator-docs/trigger-workflow/index.html index 217952d15ae..fc8d3150e50 100644 --- a/docs/v12.0.0/operator-docs/trigger-workflow/index.html +++ b/docs/v12.0.0/operator-docs/trigger-workflow/index.html @@ -5,13 +5,13 @@ Trigger a Workflow Execution | Cumulus Documentation - +
    Version: v12.0.0

    Trigger a Workflow Execution

    To trigger a workflow, you need to create a rule. To trigger an ingest workflow, one that requires discovering and ingesting data, you will also need to configure the collection and provider and associate those to a rule.

    Trigger a HelloWorld Workflow

    To trigger a HelloWorld workflow that does not need to discover or archive data, you just need to create a rule.

    You can leave the provider and collection blank and do not need any additional metadata. If you create a onetime rule, the workflow execution will start momentarily and you can view its status on the Executions page.

    Trigger an Ingest Workflow

    To ingest data, you will need a provider and collection configured to tell your workflow where to discover data and where to archive the data respectively.

    Follow the instructions to create a provider and create a collection and configure their fields for your data ingest.

    In the rule's additional metadata you can specify a provider_path from which to get the data from the provider.

    Example: Ingest data from S3

    Setup

    Assume there are 2 files to be ingested in an S3 bucket called discovery-bucket, located in the test-data folder:

    • GRANULE.A2017025.jpg
    • GRANULE.A2017025.hdf
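
    As a quick check of this setup, listing the discovery location with the AWS CLI should show the two files above:

    aws s3 ls s3://discovery-bucket/test-data/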

    Archive buckets should already be created and mapped to public / private / protected in the Cumulus deployment.

    For example:

    buckets = {
    private = {
    name = "discovery-bucket"
    type = "private"
    },
    protected = {
    name = "archive-protected"
    type = "protected"
    }
    public = {
    name = "archive-public"
    type = "public"
    }
    }

    Create a provider

    Create a new provider. Set protocol to S3 and Host to discovery-bucket.

    Screenshot of adding a sample S3 provider

    Create a collection

    Create a new collection. Configure the collection to extract the granule id from the filenames and configure where to store the granule files.

    The configuration below will store hdf files in the protected bucket and jpg files in the public bucket. The bucket types are mapped to actual bucket names by the buckets configuration in the Cumulus deployment, as shown above.

    {
    "name": "test-collection",
    "version": "001",
    "granuleId": "^GRANULE\\.A[\\d]{7}$",
    "granuleIdExtraction": "(GRANULE\\..*)(\\.hdf|\\.jpg)",
    "reportToEms": false,
    "sampleFileName": "GRANULE.A2017025.hdf",
    "files": [
    {
    "bucket": "protected",
    "regex": "^GRANULE\\.A[\\d]{7}\\.hdf$",
    "sampleFileName": "GRANULE.A2017025.hdf"
    },
    {
    "bucket": "public",
    "regex": "^GRANULE\\.A[\\d]{7}\\.jpg$",
    "sampleFileName": "GRANULE.A2017025.jpg"
    }
    ]
    }

    Create a rule

    Create a rule to trigger the workflow to discover your granule data and ingest your granule.

    Select the previously created provider and collection. See the Cumulus Discover Granules workflow for a workflow example of using Cumulus tasks to discover and queue data for ingest.

    In the rule meta, set the provider_path to test-data, so the test-data folder will be used to discover new granules.

    Screenshot of adding a Discover Granules rule

    A onetime rule will run your workflow on-demand and you can view it on the dashboard Executions page. The Cumulus Discover Granules workflow will trigger an ingest workflow and your ingested granules will be visible on the dashboard Granules page.
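
    If you prefer to create the rule via the API rather than the dashboard, a hedged sketch of the request is below. The field layout follows the Cumulus rules API; the host, token, rule name, workflow name, and the provider/collection identifiers are placeholders for the resources you created above:

    $ curl --request POST https://example.com/rules \
    --header 'Authorization: Bearer ReplaceWithTheToken' \
    --header 'Content-Type: application/json' \
    --data '{
    "name": "s3_discover_test_data",
    "workflow": "DiscoverGranules",
    "provider": "s3_provider",
    "collection": {
    "name": "test-collection",
    "version": "001"
    },
    "rule": {
    "type": "onetime"
    },
    "state": "ENABLED",
    "meta": {
    "provider_path": "test-data"
    }
    }'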

    - + \ No newline at end of file diff --git a/docs/v12.0.0/tasks/index.html b/docs/v12.0.0/tasks/index.html index 8183860d12a..b929d6bebc9 100644 --- a/docs/v12.0.0/tasks/index.html +++ b/docs/v12.0.0/tasks/index.html @@ -5,13 +5,13 @@ Cumulus Tasks | Cumulus Documentation - +
    Version: v12.0.0

    Cumulus Tasks

    A list of reusable Cumulus tasks. Add your own.

    NOTE: For a detailed description of each task, visit the task's README.md. Information on the input or output of a task is specified in the task's schemas directory.

    Tasks

    @cumulus/add-missing-file-checksums

    Add checksums to files in S3 which don't have one


    @cumulus/discover-granules

    Discover Granules in FTP/HTTP/HTTPS/SFTP/S3 endpoints


    @cumulus/discover-pdrs

    Discover PDRs in FTP and HTTP endpoints


    @cumulus/files-to-granules

    Converts array-of-files input into a granules object by extracting granuleId from filename


    @cumulus/hello-world

    Example task


    @cumulus/hyrax-metadata-updates

    Update granule metadata with hooks to OPeNDAP URL


    @cumulus/lzards-backup

    Run LZARDS backup


    @cumulus/move-granules

    Move granule files from staging to final location


    @cumulus/parse-pdr

    Download and Parse a given PDR


    @cumulus/pdr-status-check

    Checks execution status of granules in a PDR


    @cumulus/post-to-cmr

    Post a given granule to CMR


    @cumulus/queue-granules

    Add discovered granules to the queue


    @cumulus/queue-pdrs

    Add discovered PDRs to a queue


    @cumulus/queue-workflow

    Add workflow to the queue


    @cumulus/sf-sqs-report

    Sends an incoming Cumulus message to SQS


    @cumulus/sync-granule

    Download a given granule


    @cumulus/test-processing

    Fake processing task used for integration tests


    @cumulus/update-cmr-access-constraints

    Updates CMR metadata to set access constraints


    @cumulus/update-granules-cmr-metadata-file-links

    Update CMR metadata files with correct online access urls and etags and transfer etag info to granules' CMR files

    - + \ No newline at end of file diff --git a/docs/v12.0.0/team/index.html b/docs/v12.0.0/team/index.html index 242539753fb..fbda226a41a 100644 --- a/docs/v12.0.0/team/index.html +++ b/docs/v12.0.0/team/index.html @@ -5,13 +5,13 @@ Cumulus Team | Cumulus Documentation - + - + \ No newline at end of file diff --git a/docs/v12.0.0/troubleshooting/index.html b/docs/v12.0.0/troubleshooting/index.html index 15d7223d699..18790fa66e4 100644 --- a/docs/v12.0.0/troubleshooting/index.html +++ b/docs/v12.0.0/troubleshooting/index.html @@ -5,14 +5,14 @@ How to Troubleshoot and Fix Issues | Cumulus Documentation - +
    Version: v12.0.0

    How to Troubleshoot and Fix Issues

    While Cumulus is a complex system, there is a focus on maintaining the integrity and availability of the system and data. Should you encounter errors or issues while using this system, this section will help troubleshoot and solve those issues.

    Backup and Restore

    Cumulus has backup and restore functionality built-in to protect Cumulus data and allow recovery of a Cumulus stack. This is currently limited to Cumulus data and not full S3 archive data. Backup and restore is not enabled by default and must be enabled and configured to take advantage of this feature.

    For more information, read the Backup and Restore documentation.

    Elasticsearch reindexing

    If you run into issues with your Elasticsearch index, a reindex operation is available via the Cumulus API. See the Reindexing Guide.

    Information on how to reindex Elasticsearch is in the Cumulus API documentation.

    Troubleshooting Workflows

    Workflows are state machines comprised of tasks and services and each component logs to CloudWatch. The CloudWatch logs for all steps in the execution are displayed in the Cumulus dashboard or you can find them by going to CloudWatch and navigating to the logs for that particular task.

    Workflow Errors

    Visual representations of executed workflows can be found in the Cumulus dashboard or the AWS Step Functions console for that particular execution.

    If a workflow errors, the error will be handled according to the error handling configuration. The task that fails will have the exception field populated in the output, giving information about the error. Further information can be found in the CloudWatch logs for the task.

    Graph of AWS Step Function execution showing a failing workflow

    Workflow Did Not Start

    Generally, first check your rule configuration. If that is satisfactory, the answer will likely be in the CloudWatch logs for the schedule SF or SF starter lambda functions. See the workflow triggers page for more information on how workflows start.

    For Kinesis and SNS rules specifically, if an error occurs during the message consumer process, the fallback consumer lambda will be called and if the message continues to error, a message will be placed on the dead letter queue. Check the dead letter queue for a failure message. Errors can be traced back to the CloudWatch logs for the message consumer and the fallback consumer. Additionally, check that the name and version match those configured in your rule, as rules are filtered by the notification's collection name and version before scheduling executions.

    More information on kinesis error handling is here.
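
    When digging into this, the AWS CLI can save some console clicking. The commands below are a sketch: the queue URL is a placeholder, and the exact Lambda (and therefore log group) names depend on your deployment prefix.

    # Check the dead letter queue for failure messages
    aws sqs receive-message --queue-url <dead-letter-queue-url> --max-number-of-messages 10

    # Tail recent logs for the message consumer Lambda (AWS CLI v2)
    aws logs tail /aws/lambda/<prefix>-messageConsumer --since 1h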

    Operator API Errors

    All operator API calls are funneled through the ApiEndpoints lambda. Each API call is logged to the ApiEndpoints CloudWatch log for your deployment.

    Lambda Errors

    KMS Exception: AccessDeniedException

    KMS Exception: AccessDeniedExceptionKMS Message: The ciphertext refers to a customer master key that does not exist, does not exist in this region, or you are not allowed to access.

    The above error was being thrown by cumulus lambda function invocation. The KMS key is the encryption key used to encrypt lambda environment variables. The root cause of this error is unknown, but is speculated to be caused by deleting and recreating, with the same name, the IAM role the lambda uses.

    This error can be resolved by switching the lambda's execution role to a different one and then back through the Lambda management console. Unfortunately, this approach doesn't scale well.

    The other resolution (that scales but takes some time) that was found is as follows:

    1. Comment out all lambda definitions (and dependent resources) in your Terraform configuration.
    2. terraform apply to delete the lambdas.
    3. Un-comment the definitions.
    4. terraform apply to recreate the lambdas.

    If this problem occurs with Core lambdas and you are using the terraform-aws-cumulus.zip file source distributed in our release, we recommend using the non-scaling approach, as the number of lambdas we distribute is in the low teens and they are likely to be easier and faster to reconfigure one-by-one than by editing our configs.

    Error: Unable to import module 'index': Error

    This error is shown in the CloudWatch logs for a Lambda function.

    One possible cause is that the Lambda definition in the .tf file defining the lambda is not pointing to the correct packaged lambda source file. In order to resolve this issue, update the lambda definition to point directly to the packaged (e.g. .zip) lambda source file.

    resource "aws_lambda_function" "discover_granules_task" {
    function_name = "${var.prefix}-DiscoverGranules"
    filename = "${path.module}/../../tasks/discover-granules/dist/lambda.zip"
    handler = "index.handler"
    }

    If you are seeing this error when using the Lambda as a step in a Cumulus workflow, then inspect the output for this Lambda step in the AWS Step Function console. If you see the error Cannot find module 'node_modules/@cumulus/cumulus-message-adapter-js', then you need to ensure the lambda's packaged dependencies include cumulus-message-adapter-js.

    - + \ No newline at end of file diff --git a/docs/v12.0.0/troubleshooting/reindex-elasticsearch/index.html b/docs/v12.0.0/troubleshooting/reindex-elasticsearch/index.html index a17f78c502a..058966354aa 100644 --- a/docs/v12.0.0/troubleshooting/reindex-elasticsearch/index.html +++ b/docs/v12.0.0/troubleshooting/reindex-elasticsearch/index.html @@ -5,7 +5,7 @@ Reindexing Elasticsearch Guide | Cumulus Documentation - + @@ -14,7 +14,7 @@ current index, or the mappings for an index have been updated (they do not update automatically). Any reindexing that will be required when upgrading Cumulus will be in the Migration Steps section of the changelog.

    Switch to a new index and Reindex

    Two operations are needed: reindex, and change-index to switch over to the new index. Change Index and Reindex can be done in either order, but each order has its trade-offs.

    If you decide to point Cumulus to a new (empty) index first (with a change index operation), and then reindex the data to the new index, data ingested while reindexing will automatically be sent to the new index. As reindexing operations can take a while, not all the data will show up on the Cumulus Dashboard right away. The advantage is that you do not have to turn off any ingest operations. This approach is recommended.

    If you decide to reindex data to a new index first, and then point Cumulus to that new index, it is not guaranteed that data sent to the old index while reindexing will show up in the new index. If you prefer this order, it is recommended to turn off any ingest operations. This order will keep your dashboard data from seeing any interruption.

    Change Index

    This will point Cumulus to the index in Elasticsearch that will be used when retrieving data. Performing a change index operation to an index that does not exist yet will create the index for you. The change index operation can be found here.

    Reindex from the old index to the new index

    The reindex operation will take the data from one index and copy it into another index. The reindex operation can be found here.

    Reindex status

    Reindexing is a long-running operation. The reindex-status endpoint can be used to monitor the progress of the operation.
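
    As a rough sketch (confirm paths and parameters against the Cumulus API documentation linked from this page), these three operations might be invoked as follows; the host, token, and index names are placeholders:

    # Point Cumulus at a new index (created if it does not already exist)
    $ curl --request POST https://example.com/elasticsearch/change-index \
    --header 'Authorization: Bearer ReplaceWithTheToken' \
    --header 'Content-Type: application/json' \
    --data '{"currentIndex": "cumulus-2020-11-3", "newIndex": "cumulus-2021-3-4"}'

    # Copy data from the old index into the new index
    $ curl --request POST https://example.com/elasticsearch/reindex \
    --header 'Authorization: Bearer ReplaceWithTheToken' \
    --header 'Content-Type: application/json' \
    --data '{"sourceIndex": "cumulus-2020-11-3", "destIndex": "cumulus-2021-3-4"}'

    # Monitor reindex progress
    $ curl --request GET https://example.com/elasticsearch/reindex-status \
    --header 'Authorization: Bearer ReplaceWithTheToken'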

    Index from database

    If you want to just grab the data straight from the database you can perform an Index from Database Operation. After the data is indexed from the database, a Change Index operation will need to be performed to ensure Cumulus is pointing to the right index. It is strongly recommended to turn off workflow rules when performing this operation so any data ingested to the database is not lost.

    Validate reindex

    To validate the reindex, use the reindex-status endpoint. The doc count can be used to verify that the reindex was successful. In the example below, the reindex from cumulus-2020-11-3 to cumulus-2021-3-4 was not fully successful, as the two indices show different doc counts.

    "indices": {
    "cumulus-2020-11-3": {
    "primaries": {
    "docs": {
    "count": 21096512,
    "deleted": 176895
    }
    },
    "total": {
    "docs": {
    "count": 21096512,
    "deleted": 176895
    }
    }
    },
    "cumulus-2021-3-4": {
    "primaries": {
    "docs": {
    "count": 715949,
    "deleted": 140191
    }
    },
    "total": {
    "docs": {
    "count": 715949,
    "deleted": 140191
    }
    }
    }
    }

    To further drill down into what is missing, log in to the Kibana instance (found in the Elasticsearch section of the AWS console) and run the following command replacing <index> with your index name.

    GET <index>/_search
    {
    "aggs": {
    "count_by_type": {
    "terms": {
    "field": "_type"
    }
    }
    },
    "size": 0
    }

    which will produce a result like

    "aggregations": {
    "count_by_type": {
    "doc_count_error_upper_bound": 0,
    "sum_other_doc_count": 0,
    "buckets": [
    {
    "key": "logs",
    "doc_count": 483955
    },
    {
    "key": "execution",
    "doc_count": 4966
    },
    {
    "key": "deletedgranule",
    "doc_count": 4715
    },
    {
    "key": "pdr",
    "doc_count": 1822
    },
    {
    "key": "granule",
    "doc_count": 740
    },
    {
    "key": "asyncOperation",
    "doc_count": 616
    },
    {
    "key": "provider",
    "doc_count": 108
    },
    {
    "key": "collection",
    "doc_count": 87
    },
    {
    "key": "reconciliationReport",
    "doc_count": 48
    },
    {
    "key": "rule",
    "doc_count": 7
    }
    ]
    }
    }

    Resuming a reindex

    If a reindex operation did not fully complete, it can be resumed using the following command, run from the Kibana instance.

    POST _reindex?wait_for_completion=false
    {
    "conflicts": "proceed",
    "source": {
    "index": "cumulus-2020-11-3"
    },
    "dest": {
    "index": "cumulus-2021-3-4",
    "op_type": "create"
    }
    }

    The Cumulus API reindex-status endpoint can be used to monitor completion of this operation.

    - + \ No newline at end of file diff --git a/docs/v12.0.0/troubleshooting/rerunning-workflow-executions/index.html b/docs/v12.0.0/troubleshooting/rerunning-workflow-executions/index.html index 0f73afbbb1c..8f601609886 100644 --- a/docs/v12.0.0/troubleshooting/rerunning-workflow-executions/index.html +++ b/docs/v12.0.0/troubleshooting/rerunning-workflow-executions/index.html @@ -5,13 +5,13 @@ Re-running workflow executions | Cumulus Documentation - +
    Version: v12.0.0

    Re-running workflow executions

    To re-run a Cumulus workflow execution from the AWS console:

    1. Visit the page for an individual workflow execution

    2. Click the "New execution" button at the top right of the screen

      Screenshot of the AWS console for a Step Function execution highlighting the "New execution" button at the top right of the screen

    3. In the "New execution" modal that appears, replace the cumulus_meta.execution_name value in the default input with the value of the new execution ID as seen in the screenshot below

      Screenshot of the AWS console showing the modal window for entering input when running a new Step Function execution

    4. Click the "Start execution" button

    - + \ No newline at end of file diff --git a/docs/v12.0.0/troubleshooting/troubleshooting-deployment/index.html b/docs/v12.0.0/troubleshooting/troubleshooting-deployment/index.html index da4ddb304e6..3bc12d44819 100644 --- a/docs/v12.0.0/troubleshooting/troubleshooting-deployment/index.html +++ b/docs/v12.0.0/troubleshooting/troubleshooting-deployment/index.html @@ -5,7 +5,7 @@ Troubleshooting Deployment | Cumulus Documentation - + @@ -16,7 +16,7 @@ data-persistence modules, but your config is only creating one Elasticsearch instance. To fix the issue, update the elasticsearch_config variable for your data-persistence module to increase the number of instances:

    {
    domain_name = "es"
    instance_count = 2
    instance_type = "t2.small.elasticsearch"
    version = "5.3"
    volume_size = 10
    }

    Install dashboard

    Dashboard configuration

    Issues:

    • Problem clearing the cache: "EACCES: permission denied, rmdir '/tmp/gulp-cache/default'". This probably means the files at that location, and/or the folder, are owned by someone else (or some other factor prevents you from writing there).

    It's possible to work around this by editing the file cumulus-dashboard/node_modules/gulp-cache/index.js and altering the value of the line var fileCache = new Cache({cacheDirName: 'gulp-cache'}); to something like var fileCache = new Cache({cacheDirName: '<prefix>-cache'});. Now gulp-cache will be able to write to /tmp/<prefix>-cache/default, and the error should resolve.

    Dashboard deployment

    Issues:

    • If the dashboard sends you to an Earthdata Login page that shows an error reading "Invalid request, please verify the client status or redirect_uri before resubmitting", this means you've either forgotten to update one or more of your EARTHDATA_CLIENT_ID and EARTHDATA_CLIENT_PASSWORD environment variables (from your app/.env file) and re-deploy Cumulus, you haven't placed the correct values in them, or you've forgotten to add both the "redirect" and "token" URLs to the Earthdata Application. (A sketch of the relevant .env entries is shown after this list.)
    • There is odd caching behavior associated with the dashboard and Earthdata Login at this point in time that can cause the above error to reappear on the Earthdata Login page loaded by the dashboard even after fixing the cause of the error. If you experience this, attempt to access the dashboard in a new browser window, and it should work.
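
    As a point of reference, a minimal sketch of the relevant app/.env entries is below; the values are placeholders, and after changing them you must re-deploy Cumulus as noted above.

    # app/.env (placeholder values)
    EARTHDATA_CLIENT_ID=your-earthdata-application-client-id
    EARTHDATA_CLIENT_PASSWORD=your-earthdata-application-password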
    - + \ No newline at end of file diff --git a/docs/v12.0.0/upgrade-notes/cumulus_distribution_migration/index.html b/docs/v12.0.0/upgrade-notes/cumulus_distribution_migration/index.html index d9ab64abd31..7371f90453d 100644 --- a/docs/v12.0.0/upgrade-notes/cumulus_distribution_migration/index.html +++ b/docs/v12.0.0/upgrade-notes/cumulus_distribution_migration/index.html @@ -5,14 +5,14 @@ Migrate from TEA deployment to Cumulus Distribution | Cumulus Documentation - +
    Version: v12.0.0

    Migrate from TEA deployment to Cumulus Distribution

    Background

    The Cumulus Distribution API is configured to use the AWS Cognito OAuth client. This API can be used instead of the Thin Egress App, which is the default distribution API if using the Deployment Template.

    Configuring a Cumulus Distribution deployment

    See these instructions for deploying the Cumulus Distribution API.

    Important note if migrating from TEA to Cumulus Distribution

    If you already have a deployment using the TEA distribution and want to switch to Cumulus Distribution, there will be an API Gateway change. This means that there will be downtime while you update your CloudFront endpoint to use the new API gateway.

    - + \ No newline at end of file diff --git a/docs/v12.0.0/upgrade-notes/migrate_tea_standalone/index.html b/docs/v12.0.0/upgrade-notes/migrate_tea_standalone/index.html index 340bd7f570e..e17990e5ad6 100644 --- a/docs/v12.0.0/upgrade-notes/migrate_tea_standalone/index.html +++ b/docs/v12.0.0/upgrade-notes/migrate_tea_standalone/index.html @@ -5,13 +5,13 @@ Migrate TEA deployment to standalone module | Cumulus Documentation - +
    Version: v12.0.0

    Migrate TEA deployment to standalone module

    Background

    This document is only relevant for upgrades of Cumulus from versions < 3.x.x to versions > 3.x.x

    Previous versions of Cumulus included deployment of the Thin Egress App (TEA) by default in the distribution module. As a result, Cumulus users who wanted to deploy a new version of TEA had to wait for a new release of Cumulus that incorporated that version.

    In order to give Cumulus users the flexibility to deploy newer versions of TEA whenever they want, deployment of TEA has been removed from the distribution module and Cumulus users must now add the TEA module to their deployment. Guidance on integrating the TEA module into your deployment is provided, or you can refer to the Cumulus core example deployment code for the thin_egress_app module.

    By default, when upgrading Cumulus and moving from TEA deployed via the distribution module to deployed as a separate module, your API gateway for TEA would be destroyed and re-created, which could cause outages for any Cloudfront endpoints pointing at that API gateway.

    These instructions outline how to modify your state to preserve your existing Thin Egress App (TEA) API gateway when upgrading Cumulus and moving deployment of TEA to a standalone module. If you do not care about preserving your API gateway for TEA when upgrading your Cumulus deployment, you can skip these instructions.

    Prerequisites

    Notes about state management

    These instructions will involve manipulating your Terraform state via terraform state mv commands. These operations are extremely dangerous, since a mistake in editing your Terraform state can leave your stack in a corrupted state where deployment may be impossible or may result in unanticipated resource deletion.

    Since bucket versioning preserves a separate version of your state file each time it is written, and the Terraform state modification commands overwrite the state file, we can mitigate the risk of these operations by downloading the most recent state file before starting the upgrade process. Then, if anything goes wrong during the upgrade, we can restore that previous state version. Guidance on how to perform both operations is provided below.

    Download your most recent state version

    Run this command to download the most recent cumulus deployment state file, replacing BUCKET and KEY with the correct values from cumulus-tf/terraform.tf:

     aws s3 cp s3://BUCKET/KEY /path/to/terraform.tfstate

    Restore a previous state version

    Upload the state file that was previously downloaded to the bucket/key for your state file, replacing BUCKET and KEY with the correct values from cumulus-tf/terraform.tf:

     aws s3 cp /path/to/terraform.tfstate s3://BUCKET/KEY

    Then run terraform plan, which will give an error because we manually overwrote the state file and it is now out of sync with the lock table Terraform uses to track your state file:

    Error: Error loading state: state data in S3 does not have the expected content.

    This may be caused by unusually long delays in S3 processing a previous state
    update. Please wait for a minute or two and try again. If this problem
    persists, and neither S3 nor DynamoDB are experiencing an outage, you may need
    to manually verify the remote state and update the Digest value stored in the
    DynamoDB table to the following value: <some-digest-value>

    To resolve this error, run this command and replace DYNAMO_LOCK_TABLE, BUCKET and KEY with the correct values from cumulus-tf/terraform.tf, and use the digest value from the previous error output:

     aws dynamodb put-item \
    --table-name DYNAMO_LOCK_TABLE \
    --item '{
    "LockID": {"S": "BUCKET/KEY-md5"},
    "Digest": {"S": "some-digest-value"}
    }'

    Now, if you re-run terraform plan, it should work as expected.

    Migration instructions

    Please note: These instructions assume that you are deploying the thin_egress_app module as shown in the Cumulus core example deployment code

    1. Ensure that you have downloaded the latest version of your state file for your cumulus deployment

    2. Find the URL for your <prefix>-thin-egress-app-EgressGateway API gateway. Confirm that you can access it in the browser and that it is functional.

    3. Run terraform plan. You should see output like (edited for readability):

      # module.thin_egress_app.aws_cloudformation_stack.thin_egress_app will be created
      + resource "aws_cloudformation_stack" "thin_egress_app" {

      # module.thin_egress_app.aws_s3_bucket.lambda_source will be created
      + resource "aws_s3_bucket" "lambda_source" {

      # module.thin_egress_app.aws_s3_bucket_object.cloudformation_template will be created
      + resource "aws_s3_bucket_object" "cloudformation_template" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive will be created
      + resource "aws_s3_bucket_object" "lambda_code_dependency_archive" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_source will be created
      + resource "aws_s3_bucket_object" "lambda_source" {

      # module.thin_egress_app.aws_security_group.egress_lambda[0] will be created
      + resource "aws_security_group" "egress_lambda" {

      ...

      # module.cumulus.module.distribution.module.thin_egress_app.aws_cloudformation_stack.thin_egress_app will be destroyed
      - resource "aws_cloudformation_stack" "thin_egress_app" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket.lambda_source will be destroyed
      - resource "aws_s3_bucket" "lambda_source" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.cloudformation_template will be destroyed
      - resource "aws_s3_bucket_object" "cloudformation_template" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive will be destroyed
      - resource "aws_s3_bucket_object" "lambda_code_dependency_archive" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_source will be destroyed
      - resource "aws_s3_bucket_object" "lambda_source" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_security_group.egress_lambda[0] will be destroyed
      - resource "aws_security_group" "egress_lambda" {
    4. Run the state modification commands. The commands must be run in exactly this order:

       # Move security group
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_security_group.egress_lambda module.thin_egress_app.aws_security_group.egress_lambda

      # Move TEA storage bucket
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket.lambda_source module.thin_egress_app.aws_s3_bucket.lambda_source

      # Move TEA lambda source code
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_source module.thin_egress_app.aws_s3_bucket_object.lambda_source

      # Move TEA lambda dependency code
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive

      # Move TEA Cloudformation template
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.cloudformation_template module.thin_egress_app.aws_s3_bucket_object.cloudformation_template

      # Move URS creds secret version
      terraform state mv module.cumulus.module.distribution.aws_secretsmanager_secret_version.thin_egress_urs_creds aws_secretsmanager_secret_version.thin_egress_urs_creds

      # Move URS creds secret
      terraform state mv module.cumulus.module.distribution.aws_secretsmanager_secret.thin_egress_urs_creds aws_secretsmanager_secret.thin_egress_urs_creds

      # Move TEA Cloudformation stack
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_cloudformation_stack.thin_egress_app module.thin_egress_app.aws_cloudformation_stack.thin_egress_app

      Depending on how you were supplying a bucket map to TEA, there may be an additional step. If you were specifying the bucket_map_key variable to the cumulus module to use a custom bucket map, then you can ignore this step and just ensure that the bucket_map_file variable to the TEA module uses that same S3 key. Otherwise, if you were letting Cumulus generate a bucket map for you, then you need to take this step to migrate that bucket map:

      # Move bucket map
      terraform state mv module.cumulus.module.distribution.aws_s3_bucket_object.bucket_map_yaml[0] aws_s3_bucket_object.bucket_map_yaml
    5. Run terraform plan again. You may still see a few additions/modifications pending like below, but you should not see any deletion of Thin Egress App resources pending:

      # module.thin_egress_app.aws_cloudformation_stack.thin_egress_app will be updated in-place
      ~ resource "aws_cloudformation_stack" "thin_egress_app" {

      # module.thin_egress_app.aws_s3_bucket_object.cloudformation_template will be updated in-place
      ~ resource "aws_s3_bucket_object" "cloudformation_template" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive will be updated in-place
      ~ resource "aws_s3_bucket_object" "lambda_code_dependency_archive" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_source will be updated in-place
      ~ resource "aws_s3_bucket_object" "lambda_source" {

      If you still see deletion of module.cumulus.module.distribution.module.thin_egress_app.aws_cloudformation_stack.thin_egress_app pending, then something went wrong and you should restore the previously downloaded state file version and start over from step 1. Otherwise, proceed to step 6.

    6. Once you have confirmed that everything looks as expected, run terraform apply.

    7. Visit the same API gateway from step 1 and confirm that it still works.

    Your TEA deployment has now been migrated to a standalone module, which gives you the ability to upgrade the deployed version of TEA independently of Cumulus releases.

    - + \ No newline at end of file diff --git a/docs/v12.0.0/upgrade-notes/update-cma-2.0.2/index.html b/docs/v12.0.0/upgrade-notes/update-cma-2.0.2/index.html index b0561f8ebd6..830b3c5960d 100644 --- a/docs/v12.0.0/upgrade-notes/update-cma-2.0.2/index.html +++ b/docs/v12.0.0/upgrade-notes/update-cma-2.0.2/index.html @@ -5,13 +5,13 @@ Upgrade to CMA 2.0.2 | Cumulus Documentation - +
    Version: v12.0.0

    Upgrade to CMA 2.0.2

    Updating a Cumulus Deployment to CMA 2.0.2

    Background

    The Cumulus Message Adapter has been updated in release 2.0.2 to no longer utilize the AWS step function API to look up the defined name of a step function task for population in meta.workflow_tasks, but instead use an incrementing integer field.

    Additionally, a bugfix was released in the form of v2.0.1/v2.0.2 following the initial 2.0.0 release, so all users should update to release 2.0.2.

    The update is not tied to a particular version of Core; however, the update should be done across all task components in order to ensure consistent execution records.

    Changes

    Execution Record Update

    This update functionally means that Cumulus tasks/activities using the CMA will now record an entry that looks like the following in meta.workflow_tasks, and more importantly in the tasks column for an execution record:

    Original

          "DiscoverGranules": {
    "name": "jk-tf-DiscoverGranules",
    "version": "$LATEST",
    "arn": "arn:aws:lambda:us-east-1:xxxxx:function:jk-tf-DiscoverGranules"
    },
    "QueueGranules": {
    "name": "jk-tf-QueueGranules",
    "version": "$LATEST",
    "arn": "arn:aws:lambda:us-east-1:xxxx:function:jk-tf-QueueGranules"
    }

    New

          "0": {
    "name": "jk-tf-DiscoverGranules",
    "version": "$LATEST",
    "arn": "arn:aws:lambda:us-east-1:xxxxx:function:jk-tf-DiscoverGranules"
    },
    "1": {
    "name": "jk-tf-QueueGranules",
    "version": "$LATEST",
    "arn": "arn:aws:lambda:us-east-1:xxxx:function:jk-tf-QueueGranules"
    }

    Actions Required

    The following should be done as part of a Cumulus stack update to utilize cumulus message adapter > 2.0.2:

    • Python tasks that utilize cumulus-message-adapter-python should be updated to use > 2.0.0, their lambdas rebuilt and Cumulus workflows reconfigured to use the updated version.

    • Python activities that utilize cumulus-process-py should be rebuilt using > 1.0.0 with updated dependencies, and have their images deployed/Cumulus configured to use the new version.

    • The cumulus-message-adapter v2.0.2 lambda layer should be made available in the deployment account, and the Cumulus deployment should be reconfigured to use it (via the cumulus_message_adapter_lambda_layer_version_arn variable in the cumulus module). This should address all Core node.js tasks that utilize the CMA, and many contributed node.js/JAVA components.

    Once the above have been done, redeploy Cumulus to apply the configuration and the updates should be live.

    - + \ No newline at end of file diff --git a/docs/v12.0.0/upgrade-notes/update-task-file-schemas/index.html b/docs/v12.0.0/upgrade-notes/update-task-file-schemas/index.html index 601bcc9766a..42a89237572 100644 --- a/docs/v12.0.0/upgrade-notes/update-task-file-schemas/index.html +++ b/docs/v12.0.0/upgrade-notes/update-task-file-schemas/index.html @@ -5,13 +5,13 @@ Updates to task granule file schemas | Cumulus Documentation - +
    Version: v12.0.0

    Updates to task granule file schemas

    Background

    Most Cumulus workflow tasks expect as input a payload of granule(s) which contain the files for each granule. Most tasks also return this same granule structure as output.

    However, up to this point, there was inconsistency in the schemas for the granule files objects expected by each task. Furthermore, there was no guarantee of consistency between granule files objects as stored in the database and the expectations of any given workflow task.

    Thus, when performing bulk granule operations which pass granules from the database into a Cumulus workflow, it was possible for there to be schema validation failures depending on which task was used to start the workflow and its particular schema.

    In order to rectify this situation, CUMULUS-2388 was filed and addressed to create a common granule files schema between nearly all of the Cumulus tasks (exceptions discussed below) and the Cumulus database. The following documentation explains the manual changes you need to make to your deployment in order to be compatible with the updated files schema.

    Updated files schema

    The updated granule files schema can be found here.

    These former properties were deprecated (with notes about how to derive the same information from the updated schema, if possible):

    • filename - concatenate the bucket and key values with a directory separator (/)
    • name - use fileName property
    • etag - ETags are no longer provided as an individual file property. Instead, a separate etags object mapping S3 URIs to ETag values is provided as output from the following workflow tasks (guidance on how to integrate this output with your workflows is provided in the Upgrading your workflows section below):
      • update-granules-cmr-metadata-file-links
      • hyrax-metadata-updates
    • fileStagingDir - no longer supported
    • url_path - no longer supported
• duplicate_found - This property is no longer supported; however, sync-granule and move-granules now produce a separate granuleDuplicates object as part of their output. The granuleDuplicates object is a map of granules by granule ID which includes the files that encountered duplicates during processing. Guidance on how to integrate granuleDuplicates information into your workflow configuration is provided below.

    Exceptions

    These workflow tasks did not have their schema for granule files updated:

    • discover-granules - no updates
    • queue-granules - no updates
    • parse-pdr - no updates
    • sync-granule - input schema not updated, output schema was updated

    The reason that these task schemas were not updated is that all of these tasks start before the files have been ingested to S3, thus much of the information that is required in the updated files schema like bucket, key, or checksum is not yet known.

    Bulk granule operations

    Since the input schema for the above tasks was not updated, that means you cannot run bulk granule operations against workflows if they start with any of those tasks. Bulk granule operations work by loading the specified granules from the database and sending them as input to a specified workflow, so if the specified workflow begins with a task whose input schema does not conform to what is coming out of the database, there will be schema errors.

    Upgrading your deployment

    Upgrading your workflows

    For any workflows using the update-granules-cmr-metadata-file-links task before the hyrax-metadata-updates and/or post-to-cmr tasks, update the step definition for update-granules-cmr-metadata-file-links as follows:

        "UpdateGranulesCmrMetadataFileLinksStep": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "buckets": "{$.meta.buckets}",
    "distribution_endpoint": "{$.meta.distribution_endpoint}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.etags}",
    "destination": "{$.meta.file_etags}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    ...more configuration...

    hyrax-metadata-updates

    For any workflows using the hyrax-metadata-updates task before a post-to-cmr task, update the definition of the hyrax-metadata-updates step as follows:

        "HyraxMetadataUpdatesTask": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "bucket": "{$.meta.buckets.internal.name}",
    "stack": "{$.meta.stack}",
    "cmr": "{$.meta.cmr}",
    "launchpad": "{$.meta.launchpad}",
    "etags": "{$.meta.file_etags}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.etags}",
    "destination": "{$.meta.file_etags}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    ...more configuration...

    post-to-cmr

    For any workflows using post-to-cmr task after the update-granules-cmr-metadata-file-links or hyrax-metadata-updates tasks, update the post-to-cmr step definition as follows:

        "CmrStep": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "bucket": "{$.meta.buckets.internal.name}",
    "stack": "{$.meta.stack}",
    "cmr": "{$.meta.cmr}",
    "launchpad": "{$.meta.launchpad}",
    "etags": "{$.meta.file_etags}"
    }
    }
    },
    ...more configuration...

    Example workflow

    For an example workflow integrating all of these changes, please see our example ingest and publish workflow.

    Optional - Integrate granuleDuplicates information

    Please note that the granuleDuplicates output is purely informational and does not have any bearing on the separate configuration for how duplicates should be handled.

    You can include granuleDuplicates output from the sync-granule or move-granules tasks in your workflow messages like so:

        "SyncGranule": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    ...other config...
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.granuleDuplicates}",
    "destination": "{$.meta.sync_granule.granule_duplicates}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    }
    ...more configuration...

The result of this configuration is that the granuleDuplicates output from sync-granule would be placed in meta.sync_granule.granule_duplicates on the workflow message and remain there throughout the rest of the workflow. The same configuration could be replicated for the move-granules task, but be sure to use a different destination in the workflow message for the granuleDuplicates output.
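As a hedged illustration only (the granule ID, bucket, key, and the exact nesting of each duplicates entry are placeholders and assumptions, not a schema definition), the relevant portion of the workflow message after the SyncGranule step above might look like:

    {
      "meta": {
        "sync_granule": {
          "granule_duplicates": {
            "MOD09GQ.A2017025.h21v00.006.2017034065104": {
              "files": [
                {
                  "bucket": "my-private-bucket",
                  "key": "stage/MOD09GQ.A2017025.h21v00.006.2017034065104.hdf"
                }
              ]
            }
          }
        }
      }
    }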

    Updating collection URL path templates

    Collections can specify url_path templates to dynamically generate the final location of files. As part of url_path templates, file object properties can be interpolated to generate the file path. Thus, these url_path templates need to be updated to ensure that they are compatible with the updated files schema and the properties that will actually be available on file objects.

    See the notes on the updated files schema to know which properties are available and which previously existing properties were deprecated.

    As an example, you will want to update any url_path properties in your collections to remove references to file.name and replace them with references to file.fileName like so:

    - "url_path": "{cmrMetadata.CollectionReference.ShortName}___{cmrMetadata.CollectionReference.Version}/{substring(file.name, 0, 3)}",
    + "url_path": "{cmrMetadata.CollectionReference.ShortName}___{cmrMetadata.CollectionReference.Version}/{substring(file.fileName, 0, 3)}",
    - + \ No newline at end of file diff --git a/docs/v12.0.0/upgrade-notes/upgrade-rds/index.html b/docs/v12.0.0/upgrade-notes/upgrade-rds/index.html index 5787a3390c1..316b65b900d 100644 --- a/docs/v12.0.0/upgrade-notes/upgrade-rds/index.html +++ b/docs/v12.0.0/upgrade-notes/upgrade-rds/index.html @@ -5,7 +5,7 @@ Upgrade to RDS release | Cumulus Documentation - + @@ -21,7 +21,7 @@ | cutoffSeconds | number | Number of seconds prior to this execution to 'cutoff' reconciliation queries. This allows in-progress/other in-flight operations time to complete and propagate to Elasticsearch/Dynamo/postgres. | 3600 | | dbConcurrency | number | Sets max number of parallel collections reports the script will run at a time. | 20 | | dbMaxPool | number | Sets the maximum number of connections the database pool has available. Modifying this may result in unexpected failures. | 20 |

    - + \ No newline at end of file diff --git a/docs/v12.0.0/upgrade-notes/upgrade_tf_version_0.13.6/index.html b/docs/v12.0.0/upgrade-notes/upgrade_tf_version_0.13.6/index.html index e8beaea52e8..9cb79587850 100644 --- a/docs/v12.0.0/upgrade-notes/upgrade_tf_version_0.13.6/index.html +++ b/docs/v12.0.0/upgrade-notes/upgrade_tf_version_0.13.6/index.html @@ -5,13 +5,13 @@ Upgrade to TF version 0.13.6 | Cumulus Documentation - +
    Version: v12.0.0

    Upgrade to TF version 0.13.6

    Background

Cumulus pins its support to a specific version of Terraform (see the deployment documentation). The reason for only supporting one specific Terraform version at a time is to avoid deployment errors that can be caused by deploying to the same target with different Terraform versions.

    Cumulus is upgrading its supported version of Terraform from 0.12.12 to 0.13.6. This document contains instructions on how to perform the upgrade for your deployments.

    Prerequisites

    • Follow the Terraform guidance for what to do before upgrading, notably ensuring that you have no pending changes to your Cumulus deployments before proceeding.
      • You should do a terraform plan to see if you have any pending changes for your deployment (for both the data-persistence-tf and cumulus-tf modules), and if so, run a terraform apply before doing the upgrade to Terraform 0.13.6
    • Review the Terraform v0.13 release notes to prepare for any breaking changes that may affect your custom deployment code. Cumulus' deployment code has already been updated for compatibility with version 0.13.
• Install Terraform version 0.13.6. We recommend using the Terraform Version Manager (tfenv) to manage your installed versions of Terraform, but this is not required.

    Upgrade your deployment code

    Terraform 0.13 does not support some of the syntax from previous Terraform versions, so you need to upgrade your deployment code for compatibility.

    Terraform provides a 0.13upgrade command as part of version 0.13 to handle automatically upgrading your code. Make sure to check out the documentation on batch usage of 0.13upgrade, which will allow you to upgrade all of your Terraform code with one command.

    Run the 0.13upgrade command until you have no more necessary updates to your deployment code.

    Upgrade your deployment

    1. Ensure that you are running Terraform 0.13.6 by running terraform --version. If you are using tfenv, you can switch versions by running tfenv use 0.13.6.

    2. For the data-persistence-tf and cumulus-tf directories, take the following steps:

  1. Run terraform init --reconfigure. The --reconfigure flag is required; otherwise, you might see an error like:

        Error: Failed to decode current backend config

        The backend configuration created by the most recent run of "terraform init"
        could not be decoded: unsupported attribute "lock_table". The configuration
        may have been initialized by an earlier version that used an incompatible
        configuration structure. Run "terraform init -reconfigure" to force
        re-initialization of the backend.
      2. Run terraform apply to perform a deployment.

        WARNING: Even if Terraform says that no resource changes are pending, running the apply using Terraform version 0.13.6 will modify your backend state from version 0.12.12 to version 0.13.6 without requiring approval. Updating the backend state is a necessary part of the version 0.13.6 upgrade, but it is not completely transparent.

    - + \ No newline at end of file diff --git a/docs/v12.0.0/workflow_tasks/discover_granules/index.html b/docs/v12.0.0/workflow_tasks/discover_granules/index.html index bb791ef02fb..69949291aaa 100644 --- a/docs/v12.0.0/workflow_tasks/discover_granules/index.html +++ b/docs/v12.0.0/workflow_tasks/discover_granules/index.html @@ -5,7 +5,7 @@ Discover Granules | Cumulus Documentation - + @@ -21,7 +21,7 @@ included in a granule's file list. That is, no such filtering based on filename occurs as described above.

    When set on the task configuration, the value applies to all collections during discovery. Otherwise, this property may be set on individual collections.

    Concurrency

    A number property that determines the level of concurrency with which granule duplicate checks are performed when duplicateGranuleHandling is skip or error.

    Limiting concurrency helps to avoid throttling by the AWS Lambda API and helps to avoid encountering account Lambda concurrency limitations.

    We do not recommend increasing this value unless you are seeing Lambda.Timeout errors when discover-granules discovers a large number of granules with skip or error duplicate handling. However, as increasing the concurrency may lead to Lambda API or Lambda concurrency throttling errors, you may wish to consider converting the discover-granules task to an ECS activity, which does not face similar runtime constraints.

    The default value is 3.
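As a hedged sketch only (the step name and the other task_config keys shown are illustrative, not a complete or authoritative configuration), concurrency would be set alongside the rest of the task's task_config in the workflow definition:

    "DiscoverGranules": {
      "Parameters": {
        "cma": {
          "event.$": "$",
          "task_config": {
            "provider": "{$.meta.provider}",
            "collection": "{$.meta.collection}",
            "duplicateGranuleHandling": "skip",
            "concurrency": 3
          }
        }
      },
      ...more configuration...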

    Task Outputs

    This task outputs an assembled array of Cumulus granule objects as the payload for the next task, and returns only the expected payload for the next task.

    - + \ No newline at end of file diff --git a/docs/v12.0.0/workflow_tasks/files_to_granules/index.html b/docs/v12.0.0/workflow_tasks/files_to_granules/index.html index 1ffb8c87366..368ecc80cc8 100644 --- a/docs/v12.0.0/workflow_tasks/files_to_granules/index.html +++ b/docs/v12.0.0/workflow_tasks/files_to_granules/index.html @@ -5,13 +5,13 @@ Files To Granules | Cumulus Documentation - +
    Version: v12.0.0

    Files To Granules

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions and more can be found on the auto-generated Cumulus Tasks page.

    Summary

This task utilizes the incoming config.inputGranules and the task input list of S3 URIs, along with the rest of the configuration objects, to take the list of incoming files and sort them into a list of granule objects.

Please note: files passed in that do not have metadata previously defined in config.inputGranules will be added with the following keys:

    • size
    • bucket
    • key
    • fileName

    It is primarily intended to support compatibility with the standard output of a processing task, and convert that output into a granule object accepted as input by the majority of other Cumulus tasks.

    Task Inputs

    Input

    This task expects an incoming input that contains an array of 'staged' S3 URIs to move to their final archive location.

    For the specifics, see the Cumulus Tasks page entry for the schema.

    Configuration

    This task does expect values to be set in the workflow_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Below are expanded descriptions of selected config keys:

    inputGranules

    An array of Cumulus granule objects.

    This object will be used to define metadata values for the move granules task, and is the basis for the updated object that will be added to the output.
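As a hedged sketch (the step name, the meta paths, and the granuleIdExtraction key are illustrative assumptions; the schema linked above is authoritative), inputGranules is typically templated from an earlier step's output stored in meta:

    "FilesToGranulesStep": {
      "Parameters": {
        "cma": {
          "event.$": "$",
          "task_config": {
            "inputGranules": "{$.meta.input_granules}",
            "granuleIdExtraction": "{$.meta.collection.granuleIdExtraction}"
          }
        }
      },
      ...more configuration...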

    Task Outputs

    This task outputs an assembled array of Cumulus granule objects as the payload for the next task, and returns only the expected payload for the next task.

    - + \ No newline at end of file diff --git a/docs/v12.0.0/workflow_tasks/lzards_backup/index.html b/docs/v12.0.0/workflow_tasks/lzards_backup/index.html index 35d238c6b8b..196f1df7d8c 100644 --- a/docs/v12.0.0/workflow_tasks/lzards_backup/index.html +++ b/docs/v12.0.0/workflow_tasks/lzards_backup/index.html @@ -5,13 +5,13 @@ LZARDS Backup | Cumulus Documentation - +
    Version: v12.0.0

    LZARDS Backup

    The LZARDS backup task takes an array of granules and initiates backup requests to the LZARDS API, which will be handled asynchronously by LZARDS.

    Deployment

    The LZARDS backup task is not automatically deployed with Cumulus. To deploy the task through the Cumulus module, first you must specify a lzards_launchpad_passphrase in your terraform variables (e.g. variables.tf) like so:

    variable "lzards_launchpad_passphrase" {
    type = string
    default = ""
    }

    Then you can specify a value for your lzards_launchpad_passphrase in terraform.tfvars like so:

    lzards_launchpad_passphrase = "your-passphrase"

    Lastly, you need to make sure that the lzards_launchpad_passphrase is passed into the Cumulus module (in main.tf) like so:

    lzards_launchpad_passphrase  = var.lzards_launchpad_passphrase

    In short, deploying the LZARDS task requires configuring a passphrase variable and ensuring that your TF configuration passes that variable into the Cumulus module.

Additional terraform configuration for the LZARDS task can be found in the cumulus module's variables.tf file, where the relevant variables are prefixed with lzards_. You can add these variables to your deployment using the same process outlined above for lzards_launchpad_passphrase.

    Task Inputs

    Input

    This task expects an array of granules as input.

    For the specifics, see the Cumulus Tasks page entry for the schema.

    Configuration

    This task does expect values to be set in the workflow_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Task Outputs

    Output

    The LZARDS task outputs a composite object containing:

    • the input granules array, and
    • a backupResults object that describes the results of LZARDS backup attempts.

    For the specifics, see the Cumulus Tasks page entry for the schema.
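As a hedged sketch only (the field names inside backupResults are hypothetical placeholders used for illustration; the schema linked above is the authoritative definition), the output shape is roughly:

    {
      "granules": [],
      "backupResults": [
        {
          "granuleId": "example-granule-id",
          "filename": "s3://example-bucket/path/to/file.h5",
          "status": "COMPLETED",
          "statusCode": 201
        }
      ]
    }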

    - + \ No newline at end of file diff --git a/docs/v12.0.0/workflow_tasks/move_granules/index.html b/docs/v12.0.0/workflow_tasks/move_granules/index.html index fc09263c2b6..2a5026884c1 100644 --- a/docs/v12.0.0/workflow_tasks/move_granules/index.html +++ b/docs/v12.0.0/workflow_tasks/move_granules/index.html @@ -5,13 +5,13 @@ Move Granules | Cumulus Documentation - +
    Version: v12.0.0

    Move Granules

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions and more can be found on the auto-generated Cumulus Tasks page.

    Summary

    This task utilizes the incoming event.input array of Cumulus granule objects to do the following:

    • Move granules from their 'staging' location to the final location (as configured in the Sync Granules task)

    • Update the event.input object with the new file locations.

• If the granule has an ECHO10/UMM CMR file (.cmr.xml or .cmr.json) included in the event.input:

      • Update that file's access locations

  • Add it to the appropriate access URL category for the CMR filetype, as defined by the granule's CNM filetype.

      • Set the CMR file to 'metadata' in the output granules object and add it to the granule files if it's not already present.

        Please note: Granules without a valid CNM type set in the granule file type field in event.input will be treated as "data" in the updated CMR metadata file

    • Task then outputs an updated list of granule objects.

    Task Inputs

    Input

    This task expects an incoming input that contains a list of 'staged' S3 URIs to move to their final archive location. If CMR metadata is to be updated for a granule, it must also be included in the input.

    For the specifics, see the Cumulus Tasks page entry for the schema.

    Configuration

    This task does expect values to be set in the workflow_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.
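As a hedged sketch (the config keys shown are common ones for this task but are illustrative here, and the meta paths are placeholders; the schema linked above is authoritative), a MoveGranules step might be configured like so:

    "MoveGranules": {
      "Parameters": {
        "cma": {
          "event.$": "$",
          "task_config": {
            "bucket": "{$.meta.buckets.internal.name}",
            "buckets": "{$.meta.buckets}",
            "distribution_endpoint": "{$.meta.distribution_endpoint}",
            "duplicateHandling": "{$.meta.collection.duplicateHandling}"
          }
        }
      },
      ...more configuration...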

    Input

    This task expects event.input to provide an array of Cumulus granule objects. The files listed for each granule represent the files to be acted upon as described in summary.

    Task Outputs

    This task outputs an assembled array of Cumulus granule objects with post-move file locations as the payload for the next task, and returns only the expected payload for the next task. If a CMR file has been specified for a granule object, the CMR resources related to the granule files will be updated according to the updated granule file metadata.

    Examples

    See the SIPS workflow cookbook for an example of this task in a workflow

    - + \ No newline at end of file diff --git a/docs/v12.0.0/workflow_tasks/parse_pdr/index.html b/docs/v12.0.0/workflow_tasks/parse_pdr/index.html index b049ebca6b8..823def340ee 100644 --- a/docs/v12.0.0/workflow_tasks/parse_pdr/index.html +++ b/docs/v12.0.0/workflow_tasks/parse_pdr/index.html @@ -5,13 +5,13 @@ Parse PDR | Cumulus Documentation - +
    Version: v12.0.0

    Parse PDR

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions and more can be found on the auto-generated Cumulus Tasks page.

    Summary

    The purpose of this task is to do the following with the incoming PDR object:

    • Stage it to an internal S3 bucket

    • Parse the PDR

    • Archive the PDR and remove the staged file if successful

• Outputs a payload object containing metadata about the parsed PDR (e.g. total size of all files, file counts, etc.) and a granules object

The constructed granules object is created using PDR metadata to determine values like data type and version, and collection definitions to determine the file storage location based on the extracted data type and version number.

    Granule file types are converted from the PDR spec types to CNM types according to the following translation table:

      HDF: 'data',
    HDF-EOS: 'data',
    SCIENCE: 'data',
    BROWSE: 'browse',
    METADATA: 'metadata',
    BROWSE_METADATA: 'metadata',
    QA_METADATA: 'metadata',
    PRODHIST: 'qa',
    QA: 'metadata',
    TGZ: 'data',
    LINKAGE: 'data'

Files missing file types will have none assigned; files with invalid types will result in a PDR parse failure.

    Task Inputs

    Input

    This task expects an incoming input that contains name and path information about the PDR to be parsed. For the specifics, see the Cumulus Tasks page entry for the schema.

    Configuration

    This task does expect values to be set in the workflow_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Below are expanded descriptions of selected config keys:

    Provider

    A Cumulus provider object. Used to define connection information for retrieving the PDR.

    Bucket

    Defines the bucket where the 'pdrs' folder for parsed PDRs will be stored.

    Collection

    A Cumulus collection object. Used to define granule file groupings and granule metadata for discovered files.

    Task Outputs

This task outputs a single payload output object containing metadata about the parsed PDR (e.g. filesCount, totalSize, etc.), a pdr object with information for later steps, and the generated array of granule objects.
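As a hedged sketch of that shape (all values are placeholders, and the exact granule fields should be confirmed against the output schema linked above):

    {
      "pdr": {
        "name": "sample.PDR",
        "path": "/pdrs/sample.PDR"
      },
      "granules": [
        {
          "granuleId": "example-granule-id",
          "dataType": "EXAMPLE",
          "version": "001",
          "files": []
        }
      ],
      "filesCount": 2,
      "totalSize": 12345
    }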

    Examples

    See the SIPS workflow cookbook for an example of this task in a workflow

    - + \ No newline at end of file diff --git a/docs/v12.0.0/workflow_tasks/queue_granules/index.html b/docs/v12.0.0/workflow_tasks/queue_granules/index.html index 24248a0be6e..22ecdf174f9 100644 --- a/docs/v12.0.0/workflow_tasks/queue_granules/index.html +++ b/docs/v12.0.0/workflow_tasks/queue_granules/index.html @@ -5,14 +5,14 @@ Queue Granules | Cumulus Documentation - +
    Version: v12.0.0

    Queue Granules

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions, and more can be found on the auto-generated Cumulus Tasks page.

    Summary

    The purpose of this task is to schedule ingest of granules that were discovered on a remote host, whether via the DiscoverGranules task or the ParsePDR task.

The task utilizes a defined collection in concert with a defined provider (either set on each granule or passed in via config) to queue up ingest executions for each granule, or for batches of granules.

The constructed granules object is defined by the collection passed in the configuration, and has impacts on other provided core Cumulus Tasks.

    Users of this task in a workflow are encouraged to carefully consider their configuration in context of downstream tasks and workflows.

    Task Inputs

Each of the following sections is a high-level discussion of the intent of the various input/output/config values.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Input

    This task expects an incoming input that contains granules and information about them and their files. For the specifics, see the Cumulus Tasks page entry for the schema.

    This input is most commonly the output from a preceding DiscoverGranules or ParsePDR task.

    Cumulus Configuration

    This task does expect values to be set in the task_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Below are expanded descriptions of selected config keys:

    provider

    A Cumulus provider object for the originating provider. Will be passed along to the ingest workflow. This will be overruled by more specific provider information that may exist on a granule.

    internalBucket

    The Cumulus internal system bucket.

    granuleIngestWorkflow

    A string property that denotes the name of the ingest workflow into which granules should be queued.

    queueUrl

    A string property that denotes the URL of the queue to which scheduled execution messages are sent.

    preferredQueueBatchSize

    A number property that sets an upper bound on the size of each batch of granules queued into the payload of an ingest execution. Setting this property to a value higher than 1 allows queueing of multiple granules per ingest workflow.

    As ingest executions typically expect granules in the payload to have a common collection and common provider, this property only sets an upper bound within which batches will be created based on common collection and provider information.

    This means batches may be smaller than the preferred size if collection or provider information diverge, but never larger.

    The default value if none is specified is 1, which will queue one ingest execution per granule.

    concurrency

    A number property that determines the level of concurrency with which ingest executions are scheduled. Granules or batches of granules will be queued up into executions at this level of concurrency.

    This property is also used to limit concurrency when updating granule status to queued.

    Limiting concurrency helps to avoid throttling by the AWS Lambda API and helps to avoid encountering account Lambda concurrency limitations.

    We do not recommend increasing this value unless you are seeing Lambda.Timeout errors when queue-granules receives a large number of granules as input. However, as increasing the concurrency may lead to Lambda API or Lambda concurrency throttling errors, you may wish to consider converting the queue-granules task to an ECS activity, which does not face similar runtime constraints.

    The default value is 3.

    executionNamePrefix

    A string property that will prefix the names of scheduled executions.

    childWorkflowMeta

    An object property that will be merged into the scheduled execution input's meta field.
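Putting the keys above together, here is a hedged sketch of a QueueGranules step's task_config. The meta paths, queue reference, workflow name, prefix, and childWorkflowMeta contents are placeholders; consult the schema for which keys are actually required:

    "QueueGranules": {
      "Parameters": {
        "cma": {
          "event.$": "$",
          "task_config": {
            "provider": "{$.meta.provider}",
            "internalBucket": "{$.meta.buckets.internal.name}",
            "granuleIngestWorkflow": "IngestGranule",
            "queueUrl": "{$.meta.queues.startSF}",
            "preferredQueueBatchSize": 1,
            "concurrency": 3,
            "executionNamePrefix": "my-prefix",
            "childWorkflowMeta": {
              "staticValue": "example-value"
            }
          }
        }
      },
      ...more configuration...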

    Task Outputs

    This task outputs an assembled array of workflow execution ARNs for all scheduled workflow executions within the payload's running object.
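A hedged sketch of that output payload (the account ID, state machine name, and execution names in the ARNs are placeholders):

    {
      "running": [
        "arn:aws:states:us-east-1:111111111111:execution:PrefixIngestGranule:execution-1",
        "arn:aws:states:us-east-1:111111111111:execution:PrefixIngestGranule:execution-2"
      ]
    }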

    - + \ No newline at end of file diff --git a/docs/v12.0.0/workflows/cumulus-task-message-flow/index.html b/docs/v12.0.0/workflows/cumulus-task-message-flow/index.html index 38bf747dee2..80a8895004a 100644 --- a/docs/v12.0.0/workflows/cumulus-task-message-flow/index.html +++ b/docs/v12.0.0/workflows/cumulus-task-message-flow/index.html @@ -5,14 +5,14 @@ Cumulus Tasks: Message Flow | Cumulus Documentation - +
    Version: v12.0.0

    Cumulus Tasks: Message Flow

    Cumulus Tasks comprise Cumulus Workflows and are either AWS Lambda tasks or AWS Elastic Container Service (ECS) activities. Cumulus Tasks permit a payload as input to the main task application code. The task payload is additionally wrapped by the Cumulus Message Adapter. The Cumulus Message Adapter supplies additional information supporting message templating and metadata management of these workflows.

    Diagram showing how incoming and outgoing Cumulus messages for workflow steps are handled by the Cumulus Message Adapter

    The steps in this flow are detailed in sections below.

    Cumulus Message Format

    A full Cumulus Message has the following keys:

    • cumulus_meta: System runtime information that should generally not be touched outside of Cumulus library code or the Cumulus Message Adapter. Stores meta information about the workflow such as the state machine name and the current workflow execution's name. This information is used to look up the current active task. The name of the current active task is used to look up the corresponding task's config in task_config.
    • meta: Runtime information captured by the workflow operators. Stores execution-agnostic variables.
    • payload: Payload is runtime information for the tasks.

    In addition to the above keys, it may contain the following keys:

    • replace: A key generated in conjunction with the Cumulus Message adapter. It contains the location on S3 for a message payload and a Target JSON path in the message to extract it to.
• exception: A key used to track workflow exceptions; it should not be modified outside of Cumulus library code.

    Here's a simple example of a Cumulus Message:

    {
      "task_config": {
        "inlinestr": "prefix{meta.foo}suffix",
        "array": "{[$.meta.foo]}",
        "object": "{$.meta}"
      },
      "cumulus_meta": {
        "message_source": "sfn",
        "state_machine": "arn:aws:states:us-east-1:1234:stateMachine:MySfn",
        "execution_name": "MyExecution__id-1234",
        "id": "id-1234"
      },
      "meta": {
        "foo": "bar"
      },
      "payload": {
        "anykey": "anyvalue"
      }
    }

    A message utilizing the Cumulus Remote message functionality must have at least the keys replace and cumulus_meta. Depending on configuration other portions of the message may be present, however the cumulus_meta, meta, and payload keys must be present once extraction is complete.

    {
      "replace": {
        "Bucket": "cumulus-bucket",
        "Key": "my-large-event.json",
        "TargetPath": "$"
      },
      "cumulus_meta": {}
    }

    Cumulus Message Preparation

    The event coming into a Cumulus Task is assumed to be a Cumulus Message and should first be handled by the functions described below before being passed to the task application code.

    Preparation Step 1: Fetch remote event

    Fetch remote event will fetch the full event from S3 if the cumulus message includes a replace key.

    Once "my-large-event.json" is fetched from S3, it's returned from the fetch remote event function. If no "replace" key is present, the event passed to the fetch remote event function is assumed to be a complete Cumulus Message and returned as-is.

    Preparation Step 2: Parse step function config from CMA configuration parameters

This step determines which task is currently being executed. Note this is different from which lambda or activity is being executed, because the same lambda or activity can be used for different tasks. The current task name is used to load the appropriate configuration from the Cumulus Message's 'task_config' configuration parameter.

    Preparation Step 3: Load nested event

    Using the config returned from the previous step, load nested event resolves templates for the final config and input to send to the task's application code.

    Task Application Code

    After message prep, the message passed to the task application code is of the form:

    {
      "input": {},
      "config": {}
    }

    Create Next Message functions

    Whatever comes out of the task application code is used to construct an outgoing Cumulus Message.

    Create Next Message Step 1: Assign outputs

    The config loaded from the Fetch step function config step may have a cumulus_message key. This can be used to "dispatch" fields from the task's application output to a destination in the final event output (via URL templating). Here's an example where the value of input.anykey would be dispatched as the value of payload.out in the final cumulus message:

    {
      "task_config": {
        "bar": "baz",
        "cumulus_message": {
          "input": "{$.payload.input}",
          "outputs": [
            {
              "source": "{$.input.anykey}",
              "destination": "{$.payload.out}"
            }
          ]
        }
      },
      "cumulus_meta": {
        "task": "Example",
        "message_source": "local",
        "id": "id-1234"
      },
      "meta": {
        "foo": "bar"
      },
      "payload": {
        "input": {
          "anykey": "anyvalue"
        }
      }
    }

    Create Next Message Step 2: Store remote event

If the ReplaceConfig parameter is set, the configured key's value will be stored in S3 and the final output of the task will include a replace key that contains configuration for a future step to extract the payload on S3 back into the Cumulus Message. The replace key identifies where the large event node has been stored in S3.

    - + \ No newline at end of file diff --git a/docs/v12.0.0/workflows/developing-a-cumulus-workflow/index.html b/docs/v12.0.0/workflows/developing-a-cumulus-workflow/index.html index 67b9115cedd..40bb008c6da 100644 --- a/docs/v12.0.0/workflows/developing-a-cumulus-workflow/index.html +++ b/docs/v12.0.0/workflows/developing-a-cumulus-workflow/index.html @@ -5,13 +5,13 @@ Creating a Cumulus Workflow | Cumulus Documentation - +
    Version: v12.0.0

    Creating a Cumulus Workflow

    The Cumulus workflow module

To facilitate adding workflows to your deployment, Cumulus provides a workflow module.

    In combination with the Cumulus message, the workflow module provides a way to easily turn a Step Function definition into a Cumulus workflow, complete with:

    Using the module also ensures that your workflows will continue to be compatible with future versions of Cumulus.

    For more on the full set of current available options for the module, please consult the module README.

    Adding a new Cumulus workflow to your deployment

    To add a new Cumulus workflow to your deployment that is using the cumulus module, add a new workflow resource to your deployment directory, either in a new .tf file, or to an existing file.

    The workflow should follow a syntax similar to:

    module "my_workflow" {
    source = "https://github.com/nasa/cumulus/releases/download/vx.x.x/terraform-aws-cumulus-workflow.zip"

    prefix = "my-prefix"
    name = "MyWorkflowName"
    system_bucket = "my-internal-bucket"

    workflow_config = module.cumulus.workflow_config

    tags = { Deployment = var.prefix }

    state_machine_definition = <<JSON
    {}
    JSON
    }

    In the above example, you would add your state_machine_definition using the Amazon States Language, using tasks you've developed and Cumulus core tasks that are made available as part of the cumulus terraform module.
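For illustration, a hedged, minimal state_machine_definition (the JSON placed inside the heredoc above) might look like the following; the HelloWorld step name and the var.hello_world_task_arn reference are placeholders for whichever Lambda ARN or cumulus module output you actually use:

    {
      "Comment": "Minimal example state machine",
      "StartAt": "HelloWorld",
      "States": {
        "HelloWorld": {
          "Type": "Task",
          "Resource": "${var.hello_world_task_arn}",
          "Parameters": {
            "cma": {
              "event.$": "$",
              "task_config": {}
            }
          },
          "End": true
        }
      }
    }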

    Please note: Cumulus follows the convention of tagging resources with the prefix variable { Deployment = var.prefix } that you pass to the cumulus module. For resources defined outside of Core, it's recommended that you adopt this convention as it makes resources and/or deployment recovery scenarios much easier to manage.

    Examples

    For a functional example of a basic workflow, please take a look at the hello_world_workflow.

    For more complete/advanced examples, please read the following cookbook entries/topics:

    - + \ No newline at end of file diff --git a/docs/v12.0.0/workflows/developing-workflow-tasks/index.html b/docs/v12.0.0/workflows/developing-workflow-tasks/index.html index 5a68d1fd2af..0964d0e5c2c 100644 --- a/docs/v12.0.0/workflows/developing-workflow-tasks/index.html +++ b/docs/v12.0.0/workflows/developing-workflow-tasks/index.html @@ -5,13 +5,13 @@ Developing Workflow Tasks | Cumulus Documentation - +
    Version: v12.0.0

    Developing Workflow Tasks

    Workflow tasks can be either AWS Lambda Functions or ECS Activities.

    Lambda functions

    The full set of available core Lambda functions can be found in the deployed cumulus module zipfile at /tasks, as well as reference documentation here. These Lambdas can be referenced in workflows via the outputs from that module (see the cumulus-template-deploy repo for an example).

The source for these tasks is located in the Cumulus repository at cumulus/tasks.

    You can also develop your own Lambda function. See the Lambda Functions page to learn more.

    ECS Activities

    ECS activities are supported via the cumulus_ecs_module available from the Cumulus release page.

    Please read the module README for configuration details.

    For assistance in creating a task definition within the module read the AWS Task Definition Docs.

    For a step-by-step example of using the cumulus_ecs_module, please see the related cookbook entry.

    Cumulus Docker Image

ECS activities require a Docker image. Cumulus provides a Docker image (source) for Node.js 12.x+ Lambdas on Docker Hub: cumuluss/cumulus-ecs-task.

    Alternate Docker Images

    Custom docker images/runtimes are supported as are private registries. For details on configuring a private registry/image see the AWS documentation on Private Registry Authentication for Tasks.

    - + \ No newline at end of file diff --git a/docs/v12.0.0/workflows/docker/index.html b/docs/v12.0.0/workflows/docker/index.html index d349c39a506..a0a25a12cc6 100644 --- a/docs/v12.0.0/workflows/docker/index.html +++ b/docs/v12.0.0/workflows/docker/index.html @@ -5,7 +5,7 @@ Dockerizing Data Processing | Cumulus Documentation - + @@ -14,7 +14,7 @@ 2) validate the output (in this case just check for existence) 3) use 'ncatted' to update the resulting file to be CF-compliant 4) write out metadata generated for this file

    Process Testing

It is important to have tests for data processing; however, in many cases data files can be large, so it is not practical to store the test data in the repository. Instead, test data is currently stored on AWS S3, and can be retrieved using the AWS CLI.

    aws s3 sync s3://cumulus-ghrc-logs/sample-data/collection-name data

    Where collection-name is the name of the data collection, such as 'avaps', or 'cpl'. For example, an abridged version of the data for CPL includes:

    ├── cpl
    │   ├── input
    │   │   ├── HS3_CPL_ATB_12203a_20120906.hdf5
    │   │   ├── HS3_CPL_OP_12203a_20120906.hdf5
    │   └── output
│       ├── HS3_CPL_ATB_12203a_20120906.nc
│       ├── HS3_CPL_ATB_12203a_20120906.nc.meta.xml
│       ├── HS3_CPL_OP_12203a_20120906.nc
│       ├── HS3_CPL_OP_12203a_20120906.nc.meta.xml

    Contained in the input directory are all possible sets of data files, while the output directory is the expected result of processing. In this case the hdf5 files are converted to NetCDF files and XML metadata files are generated.

    The docker image for a process can be used on the retrieved test data. First create a test-output directory in the newly created data directory.

    mkdir data/test-output

    Then run the docker image using docker-compose.

    docker-compose run test

This will process the data in the data/input directory and put the output into data/test-output. Repositories also include Python-based tests which will validate this newly created output against the contents of data/output. Use Python's Nose tool to run the included tests.

    nosetests

If the data/test-output directory validates against the contents of data/output, the tests will be successful; otherwise an error will be reported.

    - + \ No newline at end of file diff --git a/docs/v12.0.0/workflows/index.html b/docs/v12.0.0/workflows/index.html index 8e63d8d39e8..b69fb844d4e 100644 --- a/docs/v12.0.0/workflows/index.html +++ b/docs/v12.0.0/workflows/index.html @@ -5,13 +5,13 @@ Workflows | Cumulus Documentation - +
    Version: v12.0.0

    Workflows

Workflows are composed of one or more AWS Lambda Functions and ECS Activities to discover, ingest, process, manage and archive data.

    Provider data ingest and GIBS have a set of common needs in getting data from a source system and into the cloud where they can be distributed to end users. These common needs are:

    • Data Discovery - Crawling, polling, or detecting changes from a variety of sources.
    • Data Transformation - Taking data files in their original format and extracting and transforming them into another desired format such as visible browse images.
    • Archival - Storage of the files in a location that's accessible to end users.

The high-level view of the architecture and many of the individual steps are the same, but the details of ingesting each type of collection differ. Different collection types and different providers have different needs. Not only are the individual boxes of a workflow different; the branching, error handling, and multiplicity of the arrows connecting the boxes also differ. Some need visible images rendered from component data files from multiple collections. Some need to contact the CMR with updated metadata. Some will have different retry strategies to handle availability issues with source data systems.

    AWS and other cloud vendors provide an ideal solution for parts of these problems but there needs to be a higher level solution to allow the composition of AWS components into a full featured solution. The Ingest Workflow Architecture is designed to meet the needs for Earth Science data ingest and transformation.

    Goals

    Flexibility and Composability

The steps to ingest and process data are different for each collection within a provider. Ingest should be as flexible as possible in the rearranging of steps and configuration.

    We want to use lego-like individual steps that can be composed by an operator.

    Individual steps should ...

    • Be as ignorant as possible of the overall flow. They should not be aware of previous steps.
    • Be runnable on their own.
    • Define their input and output in simple data structures.
    • Be domain agnostic.
• Not make assumptions about the specifics of what goes into a granule, for example.

    Scalable

The ingest architecture needs to be scalable, both to handle ingesting hundreds of millions of granules and to interpret dozens of different workflows.

    Data Provenance

    • We should have traceability for how data was produced and where it comes from.
    • Use immutable representations of data. Data once received is not overwritten. Data can be removed for cleanup.
    • All software is versioned. We can trace transformation of data by tracking the immutable source data and the versioned software applied to it.

    Operator Visibility and Control

    • Operators should be able to see and understand everything that is happening in the system.
    • It should be obvious why things are happening and straightforward to diagnose problems.
• We generally assume that the operators know best in terms of the limits on a provider's infrastructure, how often things need to be done, and details of a collection. The architecture should defer to their decisions and knowledge while providing safety nets to prevent problems.

    A Reconfigurable Workflow Architecture

    The Ingest Workflow Architecture is defined by two entity types, Workflows and Tasks. A Workflow is a set of composed Tasks to complete an objective such as ingesting a granule. Tasks are the individual steps of a Workflow that perform one job. The workflow is responsible for executing the right task based on the current state and response from the last task executed. Tasks are completely decoupled in that they don't call each other or even need to know about the presence of other tasks.

    Workflows and tasks are configured as Terraform resources, which are triggered via configured rules within Cumulus.

    Diagram showing the Step Function execution path through workflow tasks for a collection ingest

    See the Example GIBS Ingest Architecture showing how workflows and tasks are used to define the GIBS Ingest Architecture.

    Workflows

    A workflow is a provider-configured set of steps that describe the process to ingest data. Workflows are defined using AWS Step Functions.

    Benefits of AWS Step Functions

    AWS Step functions are described in detail in the AWS documentation but they provide several benefits which are applicable to AWS.

    • Prebuilt solution
    • Operations Visibility
      • Visual diagram
      • Every execution is recorded with both inputs and output for every step.
    • Composability
      • Allow composing AWS Lambdas and code running in other steps. Code can be run in EC2 to interface with it or even on premise if desired.
      • Step functions allow specifying when steps run in parallel or choices between steps based on data from the previous step.
    • Flexibility
  • Step functions are designed to make it easy to build new applications and to reconfigure them. We're exposing that flexibility directly to the provider.
    • Reliability and Error Handling
      • Step functions allow configuration of retries and adding handling of error conditions.
    • Described via data
      • This makes it easy to save the step function in configuration management solutions.
      • We can build simple interfaces on top of the flexibility provided.

    Workflow Scheduler

    The scheduler is responsible for initiating a step function and passing in the relevant data for a collection. This is currently configured as an interval for each collection. The scheduler service creates the initial event by combining the collection configuration with the AWS execution context defined via the cumulus terraform module.

    Tasks

    A workflow is composed of tasks. Each task is responsible for performing a discrete step of the ingest process. These can be activities like:

    • Crawling a provider website for new data.
    • Uploading data from a provider to S3.
    • Executing a process to transform data.

    AWS Step Functions permit tasks to be code running anywhere, even on premise. We expect most tasks will be written as Lambda functions in order to take advantage of the easy deployment, scalability, and cost benefits provided by AWS Lambda.

    • Leverages Existing Work
      • The design leverages the existing work of Amazon by defining workflows using the AWS Step Function State Language. This is the language that was created for describing the state machines used in AWS Step Functions.
    • Open for Extension
  • Both meta and task_config, which are used for configuration at the collection and task levels, do not dictate the fields and structure of the configuration. Additional task-specific JSON schemas can be used for extending the validation of individual steps.
    • Data-centric Configuration
      • The use of a single JSON configuration file allows this to be added to a workflow. We build additional support on top of the configuration file for simpler domain specific configuration or interactive GUIs.

    For more details on Task Messages and Configuration, visit Cumulus configuration and message protocol documentation.

    Ingest Deploy

    To view deployment documentation, please see the Cumulus deployment documentation.

Tradeoffs and Benefits

    This section documents various tradeoffs and benefits of the Ingest Workflow Architecture.

    Tradeoffs

    Workflow execution is handled completely by AWS

This means we can't add our own code into the orchestration of the workflow. We can't add new features not supported by Step Functions. We can't do things like enforce that the responses from tasks always conform to a schema or extract the configuration for a task ahead of its execution.

If we implemented our own orchestration we'd be able to add all of these. We save significant amounts of development effort and gain all the features of Step Functions for this trade-off. One workaround is to provide a library of common task capabilities. These would optionally be available to tasks that can be implemented with Node.js and are able to include the library.

    Workflow Configuration is specified in AWS Step Function States Language

    The current design combines the states language defined by AWS with Ingest specific configuration. This means our representation has a tight coupling with their standard. If they make backwards incompatible changes in the future we will have to deal with existing projects written against that.

We avoid having to develop our own standard and code to process it. The design can support new features in AWS Step Functions without requiring changes to the Ingest library code. It is unlikely they will make a backwards-incompatible change at this point. If that were to happen, one mitigation is writing data transformations to a new format.

    Collection Configuration Flexibility vs Complexity

The Collections Configuration File is very flexible but requires more knowledge of AWS Step Functions to configure. A person modifying this file directly would need to be comfortable editing a JSON file and configuring AWS Step Functions state transitions that address AWS resources.

The configuration file itself is not necessarily meant to be edited by a human directly. Since we are developing a reconfigurable, composable architecture that is specified entirely in data, additional tools can be developed on top of it. The existing recipes.json files can be mapped to this format. Operational tools like a GUI can be built that provide a usable interface for customizing workflows, but it will take time to develop these tools.

    Benefits

    This section describes benefits of the Ingest Workflow Architecture.

    Simplicity

    The concepts of Workflows and Tasks are simple ones that should make sense to providers. Additionally, the implementation will only consist of a few components because the design leverages existing services and capabilities of AWS. The Ingest implementation will only consist of some reusable task code to make task implementation easier, Ingest deployment, and the Workflow Scheduler.

    Composability

    The design aims to satisfy the needs for ingest integrating different workflows for providers. It's flexible in terms of the ability to arrange tasks to meet the needs of a collection. Providers have developed and incorporated open source tools over the years. All of these are easily integrable into the workflows as tasks.

    There is low coupling between task steps. Failures of one component don't bring the whole system down. Individual tasks can be deployed separately.

    Scalability

AWS Step Functions scale up as needed and aren't limited by a set number of servers. They also easily allow you to leverage the inherent scalability of serverless functions.

    Monitoring and Auditing

    • Every execution is captured.
    • Every task run has captured input and outputs.
• CloudWatch Metrics can be used for monitoring many of the events with Step Functions. It can also generate alarms for the whole process.
    • Visual report of the entire configuration.
      • Errors and success states are highlighted visually in the flow.

    Data Provenance

    • Monitoring and auditing ensures we know the data that was given to a task.
    • Workflows are versioned and the state machines stored in AWS Step Functions are immutable. Once created they cannot change.
    • Versioning of data in S3 or using immutable records in S3 will mean we always know what data was created as the result of a step or fed into a step.

    Appendix

    Example GIBS Ingest Architecture

    This shows the GIBS Ingest Architecture as an example of the use of the Ingest Workflow Architecture.

    • The GIBS Ingest Architecture consists of two workflows per collection type. There is one for discovery and one for ingest. The final stage of discovery triggers multiple ingest workflows for each MRF granule that needs to be generated.
    • It demonstrates both lambdas as tasks and a container used for MRF generation.

    GIBS Ingest Workflows

    Diagram showing the AWS Step Function execution path for a GIBS ingest workflow

    GIBS Ingest Granules Workflow

This shows a visualization of an execution of the ingest granules workflow in Step Functions. The steps highlighted in green are the ones that executed and completed successfully.

    Diagram showing the AWS Step Function execution path for a GIBS ingest granules workflow

    - + \ No newline at end of file diff --git a/docs/v12.0.0/workflows/input_output/index.html b/docs/v12.0.0/workflows/input_output/index.html index a657e9429da..11d710095d8 100644 --- a/docs/v12.0.0/workflows/input_output/index.html +++ b/docs/v12.0.0/workflows/input_output/index.html @@ -5,14 +5,14 @@ Workflow Inputs & Outputs | Cumulus Documentation - +
    Version: v12.0.0

    Workflow Inputs & Outputs

    General Structure

    Cumulus uses a common format for all inputs and outputs to workflows. The same format is used for input and output from workflow steps. The common format consists of a JSON object which holds all necessary information about the task execution and AWS environment. Tasks return objects identical in format to their input with the exception of a task-specific payload field. Tasks may also augment their execution metadata.

    Cumulus Message Adapter

    The Cumulus Message Adapter and Cumulus Message Adapter libraries help task developers integrate their tasks into a Cumulus workflow. These libraries adapt input and outputs from tasks into the Cumulus Message format. The Scheduler service creates the initial event message by combining the collection configuration, external resource configuration, workflow configuration, and deployment environment settings. The subsequent workflow messages between tasks must conform to the message schema. By using the Cumulus Message Adapter, individual task Lambda functions only receive the input and output specifically configured for the task, and not non-task-related message fields.

    The Cumulus Message Adapter libraries are called by the tasks with a callback function containing the business logic of the task as a parameter. They first adapt the incoming message to a format more easily consumable by Cumulus tasks, then invoke the task, and then adapt the task response back to the Cumulus message protocol to be sent to the next task.

    A task's Lambda function can be configured to include a Cumulus Message Adapter library which constructs input/output messages and resolves task configurations. The CMA can then be included in one of several ways:

    Lambda Layer

In order to make use of this configuration, a Lambda layer must be uploaded to your account. Due to platform restrictions, Core cannot currently support sharable public layers; however, you can deploy the appropriate version from the release page in two ways:

    Once you've deployed the layer, integrate the CMA layer with your Lambdas:

    • If using the cumulus module, set the cumulus_message_adapter_lambda_layer_version_arn in your .tfvars file to integrate the CMA layer with all core Cumulus lambdas.
    • If including your own Lambda or ECS task Terraform modules, specify the CMA layer ARN in the Terraform resource definitions. Also, make sure to set the CUMULUS_MESSAGE_ADAPTER_DIR environment variable for the task to /opt for the CMA integration to work properly.

    In the future if you wish to update/change the CMA version you will need to update the deployed CMA, and update the layer configuration for the impacted Lambdas as needed.

    Please Note: Updating/removing a layer does not change a deployed Lambda, so to update the CMA you should deploy a new version of the CMA layer, update the associated Lambda configuration to reference the new CMA version, and re-deploy your Lambdas.

    Manual Addition

You can include the CMA package in the Lambda code in the cumulus-message-adapter sub-directory of your Lambda .zip, for any Lambda runtime that includes a Python runtime. Python 2 is included in Lambda runtimes that use Amazon Linux; however, Amazon Linux 2 does not support this directly.

Please note: It is expected that upcoming Cumulus releases will update the CMA layer to include a Python runtime.

    If you are manually adding the message adapter to your source and utilizing the CMA, you should set the Lambda's CUMULUS_MESSAGE_ADAPTER_DIR environment variable to target the installation path for the CMA.

    CMA Input/Output

Input to the task application code is a JSON object with keys:

    • input: By default, the incoming payload is the payload output from the previous task, or it can be a portion of the payload as configured for the task in the corresponding .tf workflow definition file.
    • config: Task-specific configuration object with URL templates resolved.

Output from the task application code is placed in the payload key of the Cumulus message by default, but the task's config can also be used to return just a portion of the task output.
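
For illustration (the granule ID, bucket, and provider values below are placeholders drawn from other examples in these docs), a task handler might therefore receive an event shaped roughly like:

{
  "input": {
    "granules": [
      { "granuleId": "MOD09GQ.A2017025.h21v00.006.2017034065104" }
    ]
  },
  "config": {
    "bucket": "my-internal-bucket",
    "provider": {
      "id": "FOO_DAAC"
    }
  }
}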

    CMA configuration

    As of Cumulus > 1.15 and CMA > v1.1.1, configuration of the CMA is expected to be driven by AWS Step Function Parameters.

Using the CMA package with a Lambda by any of the above methods (Lambda layer, manual addition) requires configuring its various features via a specific Step Function Parameters format (see the sample workflows in the example cumulus-tf source for more examples):

    {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": "{some config}",
    "task_config": "{some config}"
    }
    }

    The "event.$": "$" parameter is required as it passes the entire incoming message to the CMA client library for parsing, and the CMA itself to convert the incoming message into a Cumulus message for use in the function.

    The following are the CMA's current configuration settings:

    ReplaceConfig (Cumulus Remote Message)

Because of the potential size of a Cumulus message, mainly the payload field, a task can be configured to store a portion of its output on S3, leaving in its place an empty JSON object {} and a replace key that defines how to retrieve it. If the portion of the message targeted exceeds the configured MaxSize (defaults to 0 bytes), it will be written to S3.

    The CMA remote message functionality can be configured using parameters in several ways:

    Partial Message

Setting the Path/TargetPath in the ReplaceConfig parameter (and optionally a non-default MaxSize):

    {
    "DiscoverGranules": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "MaxSize": 1,
    "Path": "$.payload",
    "TargetPath": "$.payload"
    }
    }
    }
    }
    }

will result in any payload output larger than the MaxSize (in bytes) being written to S3. The CMA will then mark that the key has been replaced via a replace key on the event. When the CMA picks up the replace key in future steps, it will attempt to retrieve the output from S3 and write it back to payload.

    Note that you can optionally use a different TargetPath than Path, however as the target is a JSON path there must be a key to target for replacement in the output of that step. Also note that the JSON path specified must target one node, otherwise the CMA will error, as it does not support multiple replacement targets.

    If TargetPath is omitted, it will default to the value for Path.

    Full Message

    Setting the following parameters for a lambda:

    DiscoverGranules:
    Parameters:
    cma:
    event.$: '$'
    ReplaceConfig:
    FullMessage: true

    will result in the CMA assuming the entire inbound message should be stored to S3 if it exceeds the default max size.

    This is effectively the same as doing:

    {
    "DiscoverGranules": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "MaxSize": 0,
    "Path": "$",
    "TargetPath": "$"
    }
    }
    }
    }
    }

    Cumulus Message example

    {
    "task_config": {
    "inlinestr": "prefix{meta.foo}suffix",
    "array": "{[$.meta.foo]}",
    "object": "{$.meta}"
    },
    "cumulus_meta": {
    "message_source": "sfn",
    "state_machine": "arn:aws:states:us-east-1:1234:stateMachine:MySfn",
    "execution_name": "MyExecution__id-1234",
    "id": "id-1234"
    },
    "meta": {
    "foo": "bar"
    },
    "payload": {
    "anykey": "anyvalue"
    }
    }

    Cumulus Remote Message example

    The message may contain a reference to an S3 Bucket, Key and TargetPath as follows:

    {
    "replace": {
    "Bucket": "cumulus-bucket",
    "Key": "my-large-event.json",
    "TargetPath": "$"
    },
    "cumulus_meta": {}
    }

    task_config

This configuration key contains the input/output configuration values for definition of inputs/outputs via URL paths. Important: These values are all relative to the JSON object configured for event.$.

    This configuration's behavior is outlined in the CMA step description below.

    The configuration should follow the format:

    {
    "FunctionName": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "other_cma_configuration": "<config object>",
    "task_config": "<task config>"
    }
    }
    }
    }

    Example:

    {
    "StepFunction": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "sfnEnd": true,
    "stack": "{$.meta.stack}",
    "bucket": "{$.meta.buckets.internal.name}",
    "stateMachine": "{$.cumulus_meta.state_machine}",
    "executionName": "{$.cumulus_meta.execution_name}",
    "cumulus_message": {
    "input": "{$}"
    }
    }
    }
    }
    }
    }

    Cumulus Message Adapter Steps

    1. Reformat AWS Step Function message into Cumulus Message

    Due to the way AWS handles Parameterized messages, when Parameters are used the CMA takes an inbound message:

    {
    "resource": "arn:aws:lambda:us-east-1:<lambda arn values>",
    "input": {
    "Other Parameter": {},
    "cma": {
    "ConfigKey": {
    "config values": "some config values"
    },
    "event": {
    "cumulus_meta": {},
    "payload": {},
    "meta": {},
    "exception": {}
    }
    }
    }
    }

    and takes the following actions:

    • Takes the object at input.cma.event and makes it the full input
    • Merges all of the keys except event under input.cma into the parent input object

This results in the incoming message (presumably a Cumulus message), with any cma configuration parameters merged in, being passed to the CMA. All other parameterized values defined outside of the cma key are ignored.

    2. Resolve Remote Messages

If the incoming Cumulus message has a replace key value, the CMA will attempt to pull the payload from S3.

For example, if the incoming message contains the following:

      "meta": {
    "foo": {}
    },
    "replace": {
    "TargetPath": "$.meta.foo",
    "Bucket": "some_bucket",
    "Key": "events/some-event-id"
    }

    The CMA will attempt to pull the file stored at Bucket/Key and replace the value at TargetPath, then remove the replace object entirely and continue.
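
Assuming, for illustration, that the object stored at some_bucket/events/some-event-id contains {"anykey": "anyvalue"}, the message passed on to the rest of the step would then look like:

{
  "meta": {
    "foo": {
      "anykey": "anyvalue"
    }
  }
}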

    3. Resolve URL templates in the task configuration

In the workflow configuration (defined under the task_config key), each task has its own configuration, and it can use URL templates as values to achieve simplicity or for values only available at execution time. The Cumulus Message Adapter resolves the URL templates (relative to the event configuration key) and then passes the message to the next task. For example, given a task which has the following configuration:

    {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "provider": "{$.meta.provider}",
    "inlinestr": "prefix{meta.foo}suffix",
    "array": "{[$.meta.foo]}",
    "object": "{$.meta}"
    }
    }
    }
    }

and an incoming message that contains:

    {
    "meta": {
    "foo": "bar",
    "provider": {
    "id": "FOO_DAAC",
    "anykey": "anyvalue"
    }
    }
    }

    The corresponding Cumulus Message would contain:

    "meta": {
    "foo": "bar",
    "provider": {
    "id": "FOO_DAAC",
    "anykey": "anyvalue"
    }
    },
    "task_config": {
    "provider": "{$.meta.provider}",
    "inlinestr": "prefix{meta.foo}suffix",
    "array": "{[$.meta.foo]}",
    "object": "{$.meta}"
    }

    The message sent to the task would be:

    "config" : {
    "provider": {
    "id": "FOO_DAAC",
    "anykey": "anyvalue"
    },
    "inlinestr": "prefixbarsuffix",
    "array": ["bar"],
    "object": {
    "foo": "bar",
    "provider": {
    "id": "FOO_DAAC",
    "anykey": "anyvalue"
    }
}
},
    "input": "{...}"

    URL template variables replace dotted paths inside curly brackets with their corresponding value. If the Cumulus Message Adapter cannot resolve a value, it will ignore the template, leaving it verbatim in the string. While seemingly complex, this allows significant decoupling of Tasks from one another and the data that drives them. Tasks are able to easily receive runtime configuration produced by previously run tasks and domain data.

    4. Resolve task input

By default, the incoming payload is the payload from the previous task. The task can also be configured to use a portion of the payload as its input message. For example, given that a task specifies cma.task_config.cumulus_message.input:

        ExampleTask:
    Parameters:
    cma:
    event.$: '$'
    task_config:
    cumulus_message:
    input: '{$.payload.foo}'

    The task configuration in the message would be:

        {
    "task_config": {
    "cumulus_message": {
    "input": "{$.payload.foo}"
    }
    },
    "payload": {
    "foo": {
    "anykey": "anyvalue"
    }
    }
    }

The Cumulus Message Adapter will resolve the task input; instead of sending the whole payload as task input, the task input would be:

        {
    "input" : {
    "anykey": "anyvalue"
    },
    "config": {...}
    }

    5. Resolve task output

By default, the task's return value is the next payload. However, the workflow task configuration can specify a portion of the return value as the next payload, and can also augment values to other fields. Based on the task configuration under cma.task_config.cumulus_message.outputs, the Message Adapter uses a task's return value to output a message as configured by the task-specific config defined under cma.task_config. The Message Adapter dispatches a "source" to a "destination" as defined by URL templates stored in the task-specific cumulus_message.outputs. The value of the task's return value at the "source" URL is used to create or replace the value of the task's return value at the "destination" URL. For example, given a task specifies cumulus_message.outputs in its workflow configuration as follows:

    {
    "ExampleTask": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$}",
    "destination": "{$.payload}"
    },
    {
    "source": "{$.output.anykey}",
    "destination": "{$.meta.baz}"
    }
    ]
    }
    }
    }
    }
    }
    }

    The corresponding Cumulus Message would be:

        {
    "task_config": {
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$}",
    "destination": "{$.payload}"
    },
    {
    "source": "{$.output.anykey}",
    "destination": "{$.meta.baz}"
    }
    ]
    }
    },
    "meta": {
    "foo": "bar"
    },
    "payload": {
    "anykey": "anyvalue"
    }
    }

    Given the response from the task is:

        {
    "output": {
    "anykey": "boo"
    }
    }

    The Cumulus Message Adapter would output the following Cumulus Message:

        {
    "task_config": {
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$}",
    "destination": "{$.payload}"
    },
    {
    "source": "{$.output.anykey}",
    "destination": "{$.meta.baz}"
    }
    ]
    }
    },
    "meta": {
    "foo": "bar",
    "baz": "boo"
    },
    "payload": {
    "output": {
    "anykey": "boo"
    }
    }
    }

    6. Apply Remote Message Configuration

    If the ReplaceConfig configuration parameter is defined, the CMA will evaluate the configuration options provided, and if required write a portion of the Cumulus Message to S3, and add a replace key to the message for future steps to utilize.

Please Note: the non-user-modifiable field cumulus_meta will always be retained, regardless of the configuration.

For example, if the output message (post output configuration) from a Cumulus task looks like:

        {
    "cumulus_meta": {
    "some_key": "some_value"
    },
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$}",
    "destination": "{$.payload}"
    },
    {
    "source": "{$.output.anykey}",
    "destination": "{$.meta.baz}"
    }
    ]
    }
    },
    "meta": {
    "foo": "bar",
    "baz": "boo"
    },
    "payload": {
    "output": {
    "anykey": "boo"
    }
    }
    }

    the resultant output would look like:

    {
    "cumulus_meta": {
    "some_key": "some_value"
    },
    "replace": {
    "TargetPath": "$",
    "Bucket": "some-internal-bucket",
    "Key": "events/some-event-id"
    }
    }

    Additional features

    Validate task input, output and configuration messages against the schemas provided

    The Cumulus Message Adapter has the capability to validate task input, output and configuration messages against their schemas. The default location of the schemas is the schemas folder in the top level of the task and the default filenames are input.json, output.json, and config.json. The task can also configure a different schema location. If no schema can be found, the Cumulus Message Adapter will not validate the messages.
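
As a sketch (the field names are illustrative and not tied to any particular task), a task's schemas/input.json might look like:

{
  "title": "ExampleTaskInput",
  "description": "Describes the input expected by the example task",
  "type": "object",
  "required": ["granules"],
  "properties": {
    "granules": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "granuleId": { "type": "string" }
        }
      }
    }
  }
}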

    Version: v12.0.0

    Develop Lambda Functions

    Develop a new Cumulus Lambda

AWS provides a great getting started guide for building Lambdas in the developer guide.

Cumulus currently supports Node.js, Java, and Python environments for Cumulus Message Adapter enabled functions (see the sections below).

Additionally, you may choose to include any of the other languages AWS supports as a resource, with reduced feature support.

    Deploy a Lambda

    Node.js Lambda

For a new Node.js Lambda, create a new function and add an aws_lambda_function resource to your Cumulus deployment (for examples, see example/lambdas.tf and ingest/lambda-functions.tf in the Cumulus source) as either a new .tf file, or added to an existing .tf file:

    resource "aws_lambda_function" "myfunction" {
    function_name = "${var.prefix}-function"
    filename = "/path/to/zip/lambda.zip"
    source_code_hash = filebase64sha256("/path/to/zip/lambda.zip")
    handler = "index.handler"
    role = module.cumulus.lambda_processing_role_arn
    runtime = "nodejs10.x"

    vpc_config {
    subnet_ids = var.subnet_ids
    security_group_ids = var.security_group_ids
    }
    }

    Please note: This example contains the minimum set of required configuration.

    Make sure to include a vpc_config that matches the information you've provided the cumulus module if intending to integrate the lambda with a Cumulus deployment.

    Java Lambda

    Java Lambdas are created in much the same way as the Node.js example above.

    The source points to a folder with the compiled .class files and dependency libraries in the Lambda Java zip folder structure (details here), not an uber-jar.

    The deploy folder referenced here would contain a folder 'test_task/task/' which contains Task.class and TaskLogic.class as well as a lib folder containing dependency jars.
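
Under those assumptions, the deploy folder layout might look roughly like this (jar names are placeholders):

deploy/
  test_task/
    task/
      Task.class
      TaskLogic.class
      lib/
        some-dependency.jar
        another-dependency.jar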

    Python Lambda

    Python Lambdas are created the same way as the Node.js example above.

    Cumulus Message Adapter

For Lambdas wishing to utilize the Cumulus Message Adapter (CMA), you should define a layers key on your Lambda resource with the CMA you wish to include. See the input_output docs for more on how to create/use the CMA.
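
For example (assuming the deployed CMA layer ARN is exposed via a Terraform variable such as cumulus_message_adapter_lambda_layer_version_arn), the aws_lambda_function example above could be extended with:

  # added inside the aws_lambda_function "myfunction" resource block above
  layers = [var.cumulus_message_adapter_lambda_layer_version_arn]

  environment {
    variables = {
      # the CMA layer is unpacked under /opt in the Lambda runtime
      CUMULUS_MESSAGE_ADAPTER_DIR = "/opt"
    }
  }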

    Other Lambda Options

    Cumulus supports all of the options available to you via the aws_lambda_function Terraform resource. For more information on what's available, check out the Terraform resource docs.

    Cloudwatch log groups

If you want to enable CloudWatch logging for your Lambda resource, you'll need to add an aws_cloudwatch_log_group resource to your Lambda definition:

    resource "aws_cloudwatch_log_group" "myfunction_log_group" {
    name = "/aws/lambda/${aws_lambda_function.myfunction.function_name}"
    retention_in_days = 30
    tags = { Deployment = var.prefix }
    }
    Version: v12.0.0

    Workflow Protocol

    Configuration and Message Use Diagram

    A diagram showing at which point in a workflow the Cumulus message is checked for conformity with the message schema and where the configuration is checked for conformity with the configuration schema

    • Configuration - The Cumulus workflow configuration defines everything needed to describe an instance of Cumulus.
    • Scheduler - This starts ingest of a collection on configured intervals.
    • Input to Step Functions - The Scheduler uses the Configuration as source data to construct the input to the Workflow.
    • AWS Step Functions - Run the workflows as kicked off by the scheduler or other processes.
    • Input to Task - The input for each task is a JSON document that conforms to the message schema.
    • Output from Task - The output of each task must conform to the message schemas as well and is used as the input for the subsequent task.

Version: v12.0.0

Workflow Configuration How To's

To take a subset of any given metadata, use the option substring.

    "url_path": "{cmrMetadata.Granule.Collection.ShortName}/{substring(file.fileName, 0, 3)}"

    This example will populate to "MOD09GQ/MOD"

    In addition to substring, several datetime-specific functions are available, which can parse a datetime string in the metadata and extract a certain part of it:

    "url_path": "{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}"

    or

     "url_path": "{dateFormat(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime, YYYY-MM-DD[T]HH[:]mm[:]ss)}"

    The following functions are implemented:

    • extractYear - returns the year, formatted as YYYY
    • extractMonth - returns the month, formatted as MM
    • extractDate - returns the day of the month, formatted as DD
    • extractHour - returns the hour in 24-hour format, with no leading zero
    • dateFormat - takes a second argument describing how to format the date, and passes the metadata date string and the format argument to moment().format()

    Note: the move-granules step needs to be in the workflow for this template to be populated and the file moved. This cmrMetadata or CMR granule XML needs to have been generated and stored on S3. From there any field could be retrieved and used for a url_path.

    Adding Metadata dates and times to the URL Path

    There are a number of options to pull dates from the CMR file metadata. With this metadata:

    <Granule>
    <Temporal>
    <RangeDateTime>
    <BeginningDateTime>2003-02-19T00:00:00Z</BeginningDateTime>
    <EndingDateTime>2003-02-19T23:59:59Z</EndingDateTime>
    </RangeDateTime>
    </Temporal>
    </Granule>

    The following examples of url_path could be used.

    {extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the year from the full date: 2003.

    {extractMonth(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the month: 2.

    {extractDate(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the day: 19.

    {extractHour(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the hour: 0.

    Different values can be combined to create the url_path. For example

    {
    "bucket": "sample-protected-bucket",
    "name": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}/{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)/extractDate(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}"
    }

    The final file location for the above would be s3://sample-protected-bucket/MOD09GQ/2003/19/MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.

    Version: v12.0.0

    Workflow Triggers

    For a workflow to run, it needs to be associated with a rule (see rule configuration). The rule configuration determines how and when a workflow execution is triggered. Rules can be triggered one time, on a schedule, or by new data written to a kinesis stream.

    There are three lambda functions in the API package responsible for scheduling and starting workflows: SF scheduler, message consumer, and SF starter. Each Cumulus instance comes with a Start SF SQS queue.

The SF scheduler lambda puts a message onto the Start SF queue. This message is picked up by the Start SF lambda and an execution is started with the body of the message as the input.

    When a one time rule is created, the schedule SF lambda is triggered. Rules that are not one time are associated with a CloudWatch event which will manage the trigger of the lambdas that trigger the workflows.

    For a scheduled rule, the Cloudwatch event is triggered on the given schedule which calls directly to the schedule SF lambda.

    For a kinesis rule, when data is added to the kinesis stream, the Cloudwatch event is triggered, which calls the message consumer lambda. The message consumer lambda parses the kinesis message and finds all of the rules associated with that message. For each rule (which corresponds to one workflow), the schedule SF lambda is triggered to queue a message to start the workflow.

    For an sns rule, when a message is published to the SNS topic, the message consumer receives the SNS message (JSON expected), parses it into an object, starts a new execution of the workflow associated with the rule and passes the object in the payload field of the Cumulus message.

    Diagram showing how workflows are scheduled via rules

    Version: v13.0.0

    Contributing a Task

    We're tracking reusable Cumulus tasks in this list and, if you've got one you'd like to share with others, you can add it!

    Right now we're focused on tasks distributed via npm, but are open to including others. For now the script that pulls all the data for each package only supports npm.

    The tasks.md file is generated in the build process

    The tasks list in docs/tasks.md is generated from the list of task package names from the tasks folder.

    Do not edit the docs/tasks.md file directly.

    Version: v13.0.0

    Architecture

    Architecture

    Below, find a diagram with the components that comprise an instance of Cumulus.

    Architecture diagram of a Cumulus deployment

    This diagram details all of the major architectural components of a Cumulus deployment.

    While the diagram can feel complex, it can easily be digested in several major components:

    Data Distribution

End Users can access data via Cumulus's distribution submodule, which includes ASF's thin egress application; this provides authenticated data egress, temporary S3 links, and other statistics features.

    End user exposure of Cumulus's holdings is expected to be provided by an external service.

    For NASA use, this is assumed to be CMR in this diagram.

    Data ingest

    Workflows

The core of the ingest and processing capabilities in Cumulus is built into the deployed AWS Step Function workflows. Cumulus rules trigger workflows via CloudWatch rules, Kinesis streams, SNS topics, or SQS queues. The workflows then run with a configured Cumulus message, utilizing built-in processes to report status of granules, PDRs, executions, etc. to the Data Persistence components.

    Workflows can optionally report granule metadata to CMR, and workflow steps can report metrics information to a shared SNS topic, which could be subscribed to for near real time granule, execution, and PDR status. This could be used for metrics reporting using an external ELK stack, for example.

    Data persistence

Cumulus entity state data is stored in a set of PostgreSQL compatible databases, and is exported to an Elasticsearch instance for non-authoritative querying/state data for the API and other applications that require more complex queries. Currently the entity state data is replicated in DynamoDB; this will be removed in a future release.

    Data discovery

    Discovering data for ingest is handled via workflow step components using Cumulus provider and collection configurations and various triggers. Data can be ingested from AWS S3, FTP, HTTPS and more.

    Database

    Cumulus utilizes a user-provided PostgreSQL database backend. For improved API search query efficiency Cumulus provides data replication to an Elasticsearch instance. For legacy reasons, Cumulus is currently also deploying a DynamoDB datastore, and writes are replicated in parallel with the PostgreSQL database writes. The DynamoDB replicated tables and parallel writes will be removed in future releases.

    PostgreSQL Database Schema Diagram

    ERD of the Cumulus Database

    Maintenance

    System maintenance personnel have access to manage ingest and various portions of Cumulus via an AWS API gateway, as well as the operator dashboard.

    Deployment Structure

    Cumulus is deployed via Terraform and is organized internally into two separate top-level modules, as well as several external modules.

    Cumulus

    The Cumulus module, which contains multiple internal submodules, deploys all of the Cumulus components that are not part of the Data Persistence portion of this diagram.

    Data persistence

    The data persistence module provides the Data Persistence portion of the diagram.

    Other modules

Other modules are provided as artifacts on the release page for use by users configuring their own deployment and contain extracted subcomponents of the cumulus module. For more on these components see the components documentation.

For more on the specific structure, examples of use, and how to deploy, please see the deployment docs as well as the cumulus-template-deploy repo.

    Version: v13.0.0

    Cloudwatch Retention

    Our lambdas dump logs to AWS CloudWatch. By default, these logs exist indefinitely. However, there are ways to specify a duration for log retention.

    aws-cli

    In addition to getting your aws-cli set-up, there are two values you'll need to acquire.

1. log-group-name: the name of the log group whose retention policy (retention time) you'd like to change. We'll use /aws/lambda/KinesisInboundLogger in our examples.
    2. retention-in-days: the number of days you'd like to retain the logs in the specified log group for. There is a list of possible values available in the aws logs documentation.

    For example, if we wanted to set log retention to 30 days on our KinesisInboundLogger lambda, we would write:

    aws logs put-retention-policy --log-group-name "/aws/lambda/KinesisInboundLogger" --retention-in-days 30

    Note: The aws-cli log command that we're using is explained in detail here.

    AWS Management Console

    Changing the log retention policy in the AWS Management Console is a fairly simple process:

    1. Navigate to the CloudWatch service in the AWS Management Console.
    2. Click on the Logs entry on the sidebar.
3. Find the Log Group whose retention policy you're interested in changing.
    4. Click on the value in the Expire Events After column.
    5. Enter/Select the number of days you'd like to retain logs in that log group for.

    Screenshot of AWS console showing how to configure the retention period for Cloudwatch logs

    Version: v13.0.0

    Collection Cost Tracking and Storage Best Practices

    Organizing your data is important for metrics you may want to collect. AWS S3 storage and cost metrics are calculated at the bucket level, so it is easy to get metrics by bucket. You can get storage metrics at the key prefix level, but that is done through the CLI, which can be very slow for large buckets. It is very difficult to estimate costs at the prefix level.

    Calculating Storage By Collection

    By bucket

    Usage by bucket can be obtained in your AWS Billing Dashboard via an S3 Usage Report. You can download your usage report for a period of time and review your storage and requests at the bucket level.

    Bucket metrics can also be found in the AWS CloudWatch Metrics Console (also see Using Amazon CloudWatch Metrics).

    Navigate to Storage Metrics and select the BucketName for all buckets you are interested in. The available metrics are BucketSizeInBytes and NumberOfObjects.

    In the Graphed metrics tab, you can select the type of statistic (i.e. average, minimum, maximum) and the period for the stats. At the top, it's useful to select from the dropdown to view the metrics as a number. You can also select the time period for which you want to see stats.

    Alternatively you can query CloudWatch using the CLI.

    This command will return the average number of bytes in the bucket test-bucket for 7/31/2019:

    aws cloudwatch get-metric-statistics --namespace AWS/S3 --start-time 2019-07-31T00:00:00 --end-time 2019-08-01T00:00:00 --period 86400 --statistics Average --region us-east-1 --metric-name BucketSizeBytes --dimensions Name=BucketName,Value=test-bucket Name=StorageType,Value=StandardStorage

    The result looks like:

    {
    "Datapoints": [
    {
    "Timestamp": "2019-07-31T00:00:00Z",
    "Average": 150996467959.0,
    "Unit": "Bytes"
    }
    ],
    "Label": "BucketSizeBytes"
    }

    By key prefix

    AWS does not offer storage and usage statistics at a key prefix level. Via the AWS CLI, you can get the total storage for a bucket or folder. The following command would get the storage for folder example-folder in bucket sample-bucket:

    aws s3 ls --summarize --human-readable --recursive s3://sample-bucket/example-folder | grep 'Total'

    Note that this can be a long-running operation for large buckets.

    Calculating Cost By Collection

    NASA NGAP Environment

    If using an NGAP account, the cost per bucket can be found in your CloudTamer console, in the Financials section of your account information. This is calculated on a monthly basis.

    There is no easy way to get the cost by folder in the buckets. You could calculate an estimate using the storage per prefix vs. the storage of the bucket.

    Outside of NGAP

You can enable S3 Cost Allocation Tags and tag your buckets. From there, you can view the cost breakdown in your AWS Billing Dashboard via the Cost Explorer. Cost Allocation Tagging is available at the bucket level.

    There is no easy way to get the cost by folder in the buckets. You could calculate an estimate using the storage per prefix vs. the storage of the bucket.

    Storage Configuration

    Cumulus allows for the configuration of many buckets for your files. Buckets are created and added to your deployment as part of the deployment process.

    In your Cumulus collection configuration, you specify where you want the files to be stored post-processing. This is done by matching a regular expression on the file with the configured bucket.

    Note that in the collection configuration, the bucket field is the key to the buckets variable in the deployment's .tfvars file.
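
For example (bucket names are illustrative), the buckets variable in your .tfvars file might contain entries like the following; a collection file configuration with "bucket": "protected" would then resolve to the my-prefix-protected bucket:

buckets = {
  internal = {
    name = "my-prefix-internal"
    type = "internal"
  }
  protected = {
    name = "my-prefix-protected"
    type = "protected"
  }
  public = {
    name = "my-prefix-public"
    type = "public"
  }
}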

    Organizing By Bucket

    You can specify separate groups of buckets for each collection, which could look like the example below.

    {
    "name": "MOD09GQ",
    "version": "006",
    "granuleId": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
    "files": [
    {
    "bucket": "MOD09GQ-006-protected",
    "regex": "^.*\\.hdf$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf"
    },
    {
    "bucket": "MOD09GQ-006-private",
    "regex": "^.*\\.hdf\\.met$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.met"
    },
    {
    "bucket": "MOD09GQ-006-protected",
    "regex": "^.*\\.cmr\\.xml$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.cmr.xml"
    },
    {
    "bucket": "MOD09GQ-006-public",
    "regex": "^*\\.jpg$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_ndvi.jpg"
    }
    ]
    }

    Additional collections would go to different buckets.

    Organizing by Key Prefix

    Different collections can be organized into different folders in the same bucket, using the key prefix, which is specified as the url_path in the collection configuration. In this simplified collection configuration example, the url_path field is set at the top level so that all files go to a path prefixed with the collection name and version.

    {
    "name": "MOD09GQ",
    "version": "006",
    "granuleId": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
    "files": [
    {
    "bucket": "protected",
    "regex": "^.*\\.hdf$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf"
    },
    {
    "bucket": "private",
    "regex": "^.*\\.hdf\\.met$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.met"
    },
    {
    "bucket": "protected",
    "regex": "^.*\\.cmr\\.xml$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.cmr.xml"
    },
    {
    "bucket": "public",
    "regex": "^*\\.jpg$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_ndvi.jpg"
    }
    ]
    }

    In this case, the path to all the files would be: MOD09GQ___006/<filename> in their respective buckets.

The url_path can be overridden directly on the file configuration. The example below produces the same result.

    {
    "name": "MOD09GQ",
    "version": "006",
    "granuleId": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
    "files": [
    {
    "bucket": "protected",
    "regex": "^.*\\.hdf$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
    },
    {
    "bucket": "private",
    "regex": "^.*\\.hdf\\.met$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.met",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
    },
    {
    "bucket": "protected-2",
    "regex": "^.*\\.cmr\\.xml$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.cmr.xml",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
    },
    {
    "bucket": "public",
    "regex": "^*\\.jpg$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_ndvi.jpg",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
    }
    ]
    }
    Version: v13.0.0

    Cumulus Data Management Types

    What Are The Cumulus Data Management Types

    • Collections: Collections are logical sets of data objects of the same data type and version. They provide contextual information used by Cumulus ingest.
    • Granules: Granules are the smallest aggregation of data that can be independently managed. They are always associated with a collection, which is a grouping of granules.
    • Providers: Providers generate and distribute input data that Cumulus obtains and sends to workflows.
    • Rules: Rules tell Cumulus how to associate providers and collections and when/how to start processing a workflow.
    • Workflows: Workflows are composed of one or more AWS Lambda Functions and ECS Activities to discover, ingest, process, manage, and archive data.
    • Executions: Executions are records of a workflow.
    • Reconciliation Reports: Reports are a comparison of data sets to check to see if they are in agreement and to help Cumulus users detect conflicts.

    Interaction

    • Providers tell Cumulus where to get new data - i.e. S3, HTTPS
    • Collections tell Cumulus where to store the data files
    • Rules tell Cumulus when to trigger a workflow execution and tie providers and collections together

    Managing Data Management Types

    The following are created via the dashboard or API:

    • Providers
    • Collections
    • Rules
    • Reconciliation reports

    Granules are created by workflow executions and then can be managed via the dashboard or API.

    An execution record is created for each workflow execution triggered and can be viewed in the dashboard or data can be retrieved via the API.

    Workflows are created and managed via the Cumulus deployment.

    Configuration Fields

    Schemas

Looking at our API schema definitions can provide us with some insight into collections, providers, rules, and their attributes (and whether those are required or not). The schema for different concepts will be referenced throughout this document.

    The schemas are extremely useful for understanding which attributes are configurable and which of those are required. Cumulus uses these schemas for validation.

    Providers

    Please note:

    • While connection configuration is defined here, things that are more specific to a specific ingest setup (e.g. 'What target directory should we be pulling from' or 'How is duplicate handling configured?') are generally defined in a Rule or Collection, not the Provider.
    • There is some provider behavior which is controlled by task-specific configuration and not the provider definition. This configuration has to be set on a per-workflow basis. For example, see the httpListTimeout configuration on the discover-granules task

    Provider Configuration

    The Provider configuration is defined by a JSON object that takes different configuration keys depending on the provider type. The following are definitions of typical configuration values relevant for the various providers:

Configuration by provider type

S3

Key | Type | Required | Description
id | string | Yes | Unique identifier for the provider
globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited
protocol | string | Yes | The protocol for this provider. Must be s3 for this provider type.
host | string | Yes | S3 Bucket to pull data from

http

Key | Type | Required | Description
id | string | Yes | Unique identifier for the provider
globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited
protocol | string | Yes | The protocol for this provider. Must be http for this provider type
host | string | Yes | The host to pull data from (e.g. nasa.gov)
username | string | No | Configured username for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication
password | string | Only if username is specified | Configured password for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication
port | integer | No | Port to connect to the provider on. Defaults to 80
allowedRedirects | string[] | No | Only hosts in this list will have the provider username/password forwarded for authentication. Entries should be specified as host.com or host.com:7000 if redirect port is different than the provider port.
certificateUri | string | No | SSL Certificate S3 URI for custom or self-signed SSL (TLS) certificate

https

Key | Type | Required | Description
id | string | Yes | Unique identifier for the provider
globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited
protocol | string | Yes | The protocol for this provider. Must be https for this provider type
host | string | Yes | The host to pull data from (e.g. nasa.gov)
username | string | No | Configured username for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication
password | string | Only if username is specified | Configured password for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication
port | integer | No | Port to connect to the provider on. Defaults to 443
allowedRedirects | string[] | No | Only hosts in this list will have the provider username/password forwarded for authentication. Entries should be specified as host.com or host.com:7000 if redirect port is different than the provider port.
certificateUri | string | No | SSL Certificate S3 URI for custom or self-signed SSL (TLS) certificate

ftp

Key | Type | Required | Description
id | string | Yes | Unique identifier for the provider
globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited
protocol | string | Yes | The protocol for this provider. Must be ftp for this provider type
host | string | Yes | The ftp host to pull data from (e.g. nasa.gov)
username | string | No | Username to use to connect to the ftp server. Cumulus encrypts this using KMS. Defaults to anonymous if not defined
password | string | No | Password to use to connect to the ftp server. Cumulus encrypts this using KMS. Defaults to password if not defined
port | integer | No | Port to connect to the provider on. Defaults to 21

sftp

Key | Type | Required | Description
id | string | Yes | Unique identifier for the provider
globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited
protocol | string | Yes | The protocol for this provider. Must be sftp for this provider type
host | string | Yes | The sftp host to pull data from (e.g. nasa.gov)
username | string | No | Username to use to connect to the sftp server.
password | string | No | Password to use to connect to the sftp server.
port | integer | No | Port to connect to the provider on. Defaults to 22
privateKey | string | No | filename assumed to be in s3://bucketInternal/stackName/crypto
cmKeyId | string | No | AWS KMS Customer Master Key arn or alias
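
For example, a minimal S3 provider record matching the table above might look like this (the id and host values are illustrative):

{
  "id": "MY_S3_PROVIDER",
  "protocol": "s3",
  "host": "my-provider-bucket",
  "globalConnectionLimit": 10
}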

    Collections

Break down of s3_MOD09GQ_006.json (https://github.com/nasa/cumulus/blob/master/example/data/collections/s3_MOD09GQ_006/s3_MOD09GQ_006.json)

Key | Value | Required | Description
name | "MOD09GQ" | Yes | The name attribute designates the name of the collection. This is the name under which the collection will be displayed on the dashboard
version | "006" | Yes | A version tag for the collection
granuleId | "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$" | Yes | The regular expression used to validate the granule ID extracted from filenames according to the granuleIdExtraction
granuleIdExtraction | "(MOD09GQ\..*)(\.hdf|\.cmr|_ndvi\.jpg)" | Yes | The regular expression used to extract the granule ID from filenames. The first capturing group extracted from the filename by the regex will be used as the granule ID.
sampleFileName | "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf" | Yes | An example filename belonging to this collection
files | <JSON Object> of files defined here | Yes | Describe the individual files that will exist for each granule in this collection (size, browse, meta, etc.)
dataType | "MOD09GQ" | No | Can be specified, but this value will default to the collection_name if not
duplicateHandling | "replace" | No | ("replace"|"version"|"skip") determines granule duplicate handling scheme
ignoreFilesConfigForDiscovery | false (default) | No | By default, during discovery only files that match one of the regular expressions in this collection's files attribute (see above) are ingested. Setting this to true will ignore the files attribute during discovery, meaning that all files for a granule (i.e., all files with filenames matching granuleIdExtraction) will be ingested even when they don't match a regular expression in the files attribute at discovery time. (NOTE: this attribute does not appear in the example file, but is listed here for completeness.)
process | "modis" | No | Example options for this are found in the ChooseProcess step definition in the IngestAndPublish workflow definition
meta | <JSON Object> of MetaData for the collection | No | MetaData for the collection. This metadata will be available to workflows for this collection via the Cumulus Message Adapter.
url_path | "{cmrMetadata.Granule.Collection.ShortName}/{substring(file.fileName, 0, 3)}" | No | Filename without extension

    files-object

Key | Value | Required | Description
regex | "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\.hdf$" | Yes | Regular expression used to identify the file
sampleFileName | "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf" | Yes | Filename used to validate the provided regex
type | "data" | No | Value to be assigned to the Granule File Type. CNM types are used by Cumulus CMR steps, non-CNM values will be treated as 'data' type. Currently only utilized in DiscoverGranules task
bucket | "internal" | Yes | Name of the bucket where the file will be stored
url_path | "${collectionShortName}/{substring(file.fileName, 0, 3)}" | No | Folder used to save the granule in the bucket. Defaults to the collection url_path
checksumFor | "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\.hdf$" | No | If this is a checksum file, set checksumFor to the regex of the target file.

    Rules

Rules are used to start processing workflows and the transformation process. Rules can be invoked manually, based on a schedule, or can be configured to be triggered by either events in Kinesis, SNS messages, or SQS messages.

Rule configuration

Key | Value | Required | Description
name | "L2_HR_PIXC_kinesisRule" | Yes | Name of the rule. This is the name under which the rule will be listed on the dashboard
workflow | "CNMExampleWorkflow" | Yes | Name of the workflow to be run. A list of available workflows can be found on the Workflows page
provider | "PODAAC_SWOT" | No | Configured provider's ID. This can be found on the Providers dashboard page
collection | <JSON Object> collection object shown below | Yes | Name and version of the collection this rule will moderate. Relates to a collection configured and found in the Collections page
payload | <JSON Object or Array> | No | The payload to be passed to the workflow
meta | <JSON Object> of MetaData for the rule | No | MetaData for the rule. This metadata will be available to workflows for this rule via the Cumulus Message Adapter.
rule | <JSON Object> rule type and associated values - discussed below | Yes | Object defining the type and subsequent attributes of the rule
state | "ENABLED" | No | ("ENABLED"|"DISABLED") whether or not the rule will be active. Defaults to "ENABLED".
queueUrl | https://sqs.us-east-1.amazonaws.com/1234567890/queue-name | No | URL for SQS queue that will be used to schedule workflows for this rule
tags | ["kinesis", "podaac"] | No | An array of strings that can be used to simplify search

    collection-object

Key | Value | Required | Description
name | "L2_HR_PIXC" | Yes | Name of a collection defined/configured in the Collections dashboard page
version | "000" | Yes | Version number of a collection defined/configured in the Collections dashboard page

    meta-object

Key | Value | Required | Description
retries | 3 | No | Number of retries on errors, for sqs-type rule only. Defaults to 3.
visibilityTimeout | 900 | No | VisibilityTimeout in seconds for the inflight messages, for sqs-type rule only. Defaults to the visibility timeout of the SQS queue when the rule is created.

    rule-object

Key | Value | Required | Description
type | "kinesis" | Yes | ("onetime"|"scheduled"|"kinesis"|"sns"|"sqs") type of scheduling/workflow kick-off desired
value | <String> Object | Depends | Discussion of valid values is below

    rule-value

The rule value entry depends on the type of rule:

    • If this is a onetime rule this can be left blank. Example
    • If this is a scheduled rule this field must hold a valid cron-type expression or rate expression.
    • If this is a kinesis rule, this must be a configured ${Kinesis_stream_ARN}. Example
    • If this is an sns rule, this must be an existing ${SNS_Topic_Arn}. Example
    • If this is an sqs rule, this must be an existing ${SQS_QueueUrl} that your account has permissions to access, and also you must configure a dead-letter queue for this SQS queue. Example
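
Putting the fields above together, a complete kinesis rule might look like the following sketch (the stream ARN is an illustrative placeholder):

{
  "name": "L2_HR_PIXC_kinesisRule",
  "workflow": "CNMExampleWorkflow",
  "provider": "PODAAC_SWOT",
  "collection": {
    "name": "L2_HR_PIXC",
    "version": "000"
  },
  "rule": {
    "type": "kinesis",
    "value": "arn:aws:kinesis:us-east-1:123456789012:stream/my-kinesis-stream"
  },
  "state": "ENABLED"
}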

    sqs-type rule features

    • When an SQS rule is triggered, the SQS message remains on the queue.
    • The SQS message is not processed multiple times in parallel when visibility timeout is properly set. You should set the visibility timeout to the maximum expected length of the workflow with padding. Longer is better to avoid parallel processing.
    • The SQS message visibility timeout can be overridden by the rule.
    • Upon successful workflow execution, the SQS message is removed from the queue.
• Upon failed execution(s), the workflow is run 3 times, or the configured number of times.
    • Upon failed execution(s), the visibility timeout will be set to 5s to allow retries.
    • After configured number of failed retries, the SQS message is moved to the dead-letter queue configured for the SQS queue.

    Configuration Via Cumulus Dashboard

    Create A Provider

    • In the Cumulus dashboard, go to the Provider page.

    Screenshot of Create Provider form

    • Click on Add Provider.
    • Fill in the form and then submit it.

    Screenshot of Create Provider form

    Create A Collection

    • Go to the Collections page.

    Screenshot of the Collections page

    • Click on Add Collection.
    • Copy and paste or fill in the collection JSON object form.

    Screenshot of Add Collection form

    • Once you submit the form, you should be able to verify that your new collection is in the list.

    Create A Rule

    1. Go To Rules Page
    • Go to the Cumulus dashboard, click on Rules in the navigation.
    • Click Add Rule.

    Screenshot of Rules page

2. Complete Form
    • Fill out the template form.

    Screenshot of a Rules template for adding a new rule

    For more details regarding the field definitions and required information go to Data Cookbooks.

    Note: If the state field is left blank, it defaults to false.

    Rule Examples

    • A rule form with completed required fields:

    Screenshot of a completed rule form

    • A successfully added Rule:

    Screenshot of created rule

    Version: v13.0.0

    Setting S3 Lifecycle Policies

    This document will outline, in brief, how to set data lifecycle policies so that you are more easily able to control data storage costs while keeping your data accessible. For more information on why you might want to do this, see the 'Additional Information' section at the end of the document.

    Requirements

    • The AWS CLI installed and configured (if you wish to run the CLI example). See AWS's guide to setting up the AWS CLI for more on this. Please ensure the AWS CLI is in your shell path.
• You will need an S3 bucket on AWS. You are strongly encouraged to use a bucket without voluminous amounts of data in it for experimenting/learning.
    • An AWS user with the appropriate roles to access the target bucket as well as modify bucket policies.

    Examples

    Walk-through on setting time-based S3 Infrequent Access (S3IA) bucket policy

    This example will give step-by-step instructions on updating a bucket's lifecycle policy to move all objects in the bucket from the default storage to S3 Infrequent Access (S3IA) after a period of 90 days. Below are instructions for walking through configuration via the command line and the management console.

    Command Line

    Please ensure you have the AWS CLI installed and configured for access prior to attempting this example.

    Create policy

From any directory you choose, open an editor and add the following to a file named exampleRule.json

    {
    "Rules": [
    {
    "Status": "Enabled",
    "Filter": {
    "Prefix": ""
    },
    "Transitions": [
    {
    "Days": 90,
    "StorageClass": "STANDARD_IA"
    }
    ],
    "NoncurrentVersionTransitions": [
    {
    "NoncurrentDays": 90,
    "StorageClass": "STANDARD_IA"
    }
],
    "ID": "90DayS3IAExample"
    }
    ]
    }

    Set policy

    On the command line run the following command (with the bucket you're working with substituted in place of yourBucketNameHere).

    aws s3api put-bucket-lifecycle-configuration --bucket yourBucketNameHere --lifecycle-configuration file://exampleRule.json

    Verify policy has been set

    To obtain all of the existing policies for a bucket, run the following command (again substituting the correct bucket name):

    $ aws s3api get-bucket-lifecycle-configuration --bucket yourBucketNameHere
    {
      "Rules": [
        {
          "Status": "Enabled",
          "Filter": {
            "Prefix": ""
          },
          "Transitions": [
            {
              "Days": 90,
              "StorageClass": "STANDARD_IA"
            }
          ],
          "NoncurrentVersionTransitions": [
            {
              "NoncurrentDays": 90,
              "StorageClass": "STANDARD_IA"
            }
          ],
          "ID": "90DayS3IAExample"
        }
      ]
    }

    You have set a policy that transitions any version of an object in the bucket to S3IA after each object version has not been modified for 90 days.

    Management Console

    Create Policy

    To create the example policy on a bucket via the management console, go to the following URL (replacing 'yourBucketHere' with the bucket you intend to update):

    https://s3.console.aws.amazon.com/s3/buckets/yourBucketHere/?tab=overview

    You should see a screen similar to:

    Screenshot of AWS console for an S3 bucket

    Click the "Management" Tab, then lifecycle button and press + Add lifecycle rule:

    Screenshot of &quot;Management&quot; tab of AWS console for an S3 bucket

    Give the rule a name (e.g. '90DayRule'), leaving the filter blank:

    Screenshot of window for configuring the name and scope of a lifecycle rule on an S3 bucket in the AWS console

    Click next, and mark Current Version and Previous Versions.

    Then for each, click + Add transition and select Transition to Standard-IA after for the Object creation field, and set 90 for the Days after creation/Days after objects become noncurrent field. Your screen should look similar to:

    Screenshot of window for configuring the storage class transitions of a lifecycle rule on an S3 bucket in the AWS console

    Click next, then next past the Configure expiration screen (we won't be setting this), and on the fourth page, click Save:

    Screenshot of window for reviewing the configuration of a lifecycle rule on an S3 bucket in the AWS console

    You should now see you have a rule configured for your bucket:

    Screenshot of lifecycle rule appearing in the &quot;Management&quot; tab of AWS console for an S3 bucket

    You have now set a policy that transitions any version of an object in the bucket to S3IA after each object has not been modified for 90 days.

    Additional Information

    This section lists information you may want prior to enacting lifecycle policies. It is not required content for working through the examples.

    Strategy Overview

    For a discussion of overall recommended strategy, please review the Methodology for Data Lifecycle Management on the EarthData wiki.

    AWS Documentation

    The examples shown in this document are fairly basic cases. By using object tags, filters, and other configuration options, you can enact far more complicated policies for various scenarios. For more reading on the topics presented on this page, see the AWS documentation on S3 object lifecycle management.

    Version: v13.0.0

    Monitoring Best Practices

    This document intends to provide a set of recommendations and best practices for monitoring the state of a deployed Cumulus and diagnosing any issues.

    Cumulus-provided resources and integrations for monitoring

    Cumulus provides a number of resources that are useful for monitoring the system and its operation.

    Cumulus Dashboard

    The primary tool for monitoring the Cumulus system is the Cumulus Dashboard. The dashboard is hosted on GitHub and includes instructions on how to deploy and link it into your core Cumulus deployment.

    The dashboard displays workflow executions, their status, inputs, outputs, and some diagnostic information such as logs. For further information on the dashboard, its usage, and the information it provides, see the documentation.

    Cumulus-provided AWS resources

    Cumulus sets up CloudWatch log groups for all Core-provided tasks.

    Monitoring Lambda Functions

    Logging for each Lambda Function is available in Lambda-specific CloudWatch log groups.

    Monitoring ECS services

    Each deployed cumulus_ecs_service module also includes a CloudWatch log group for the processes running on ECS.

    Monitoring workflows

    For advanced debugging, we also configure dead letter queues on critical system functions. These will allow you to monitor and debug invalid inputs to the functions we use to start workflows, which can be helpful if you find that you are not seeing workflows being started as expected. More information on these can be found in the dead letter queue documentation.

    AWS recommendations

    AWS has a number of recommendations on system monitoring. Rather than reproduce those here and risk providing outdated guidance, we recommend reviewing the available AWS docs on monitoring recommendations and best practices for the services used in Cumulus.

    Example: Setting up email notifications for CloudWatch logs

    Cumulus does not provide out-of-the-box support for email notifications at this time. However, setting up email notifications on AWS is fairly straightforward: the operative components are an AWS SNS topic and a subscribed email address.
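
    As a rough sketch of those two components via the AWS CLI (the topic name, topic ARN, and email address below are placeholders, not values Cumulus expects):

    # Create the SNS topic that will receive the error reports (topic name is an example)
    aws sns create-topic --name example-email-error-reports

    # Subscribe an email address to the topic; the recipient must confirm the subscription email
    aws sns subscribe \
      --topic-arn arn:aws:sns:us-east-1:123456789012:example-email-error-reports \
      --protocol email \
      --notification-endpoint operator@example.com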

    In terms of Cumulus integration, forwarding CloudWatch logs requires creating a mechanism, most likely a Lambda Function subscribed to the log group that will receive, filter and forward these messages to the SNS topic.

    As a very simple example, we could create a function that filters CloudWatch logs created by the @cumulus/logger package and sends email notifications for error and fatal log levels, adapting the example linked above:

    const zlib = require('zlib');
    const aws = require('aws-sdk');
    const { promisify } = require('util');

    const gunzip = promisify(zlib.gunzip);
    const sns = new aws.SNS();

    exports.handler = async (event) => {
      const payload = Buffer.from(event.awslogs.data, 'base64');
      const decompressedData = await gunzip(payload);
      const logData = JSON.parse(decompressedData.toString('ascii'));
      return await Promise.all(logData.logEvents.map(async (logEvent) => {
        const logMessage = JSON.parse(logEvent.message);
        if (['error', 'fatal'].includes(logMessage.level)) {
          return sns.publish({
            TopicArn: process.env.EmailReportingTopicArn,
            Message: logEvent.message
          }).promise();
        }
        return Promise.resolve();
      }));
    };

    After creating the SNS topic, we can deploy this code as a Lambda function, following the setup steps from Amazon. Make sure to include your SNS topic ARN as an environment variable on the Lambda function by using the --environment option on aws lambda create-function.
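
    For example, a sketch of that create-function call might look like the following; the function name, runtime, zip path, role ARN, and topic ARN are all placeholders for your own values.

    aws lambda create-function \
      --function-name log-email-forwarder \
      --runtime nodejs16.x \
      --handler index.handler \
      --zip-file fileb://log-email-forwarder.zip \
      --role arn:aws:iam::123456789012:role/your-lambda-execution-role \
      --environment "Variables={EmailReportingTopicArn=arn:aws:sns:us-east-1:123456789012:example-email-error-reports}"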

    You will need to create subscription filters for each log group you want to receive emails for. We recommend automating this as much as possible, and you could very well handle this via Terraform, such as using a module to deploy filters alongside log groups, or exporting the log group names to an all-in-one email notification module.
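
    Whichever tool you use, the per-log-group operation boils down to a subscription filter pointing at the forwarding Lambda. A CLI sketch for a single log group is shown below; the log group name, filter name, and function name/ARN are placeholders.

    # Allow CloudWatch Logs to invoke the forwarding Lambda
    aws lambda add-permission \
      --function-name log-email-forwarder \
      --statement-id cloudwatch-logs-invoke \
      --principal logs.amazonaws.com \
      --action lambda:InvokeFunction

    # Subscribe the log group to the Lambda; an empty filter pattern forwards all events,
    # since filtering on log level happens inside the Lambda itself
    aws logs put-subscription-filter \
      --log-group-name /aws/lambda/example-cumulus-task \
      --filter-name email-error-forwarding \
      --filter-pattern "" \
      --destination-arn arn:aws:lambda:us-east-1:123456789012:function:log-email-forwarder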

    Version: v13.0.0

    S3 Server Access Logging

    Via AWS Console

    Enable server access logging for an S3 bucket

    Via AWS Command Line Interface

    1. Create a logging.json file with these contents, replacing <stack-internal-bucket> with your stack's internal bucket name, and <stack> with the name of your cumulus stack.

      {
        "LoggingEnabled": {
          "TargetBucket": "<stack-internal-bucket>",
          "TargetPrefix": "<stack>/ems-distribution/s3-server-access-logs/"
        }
      }
    2. Add the logging policy to each of your protected and public buckets by calling this command on each bucket.

      aws s3api put-bucket-logging --bucket <protected/public-bucket-name> --bucket-logging-status file://logging.json
    3. Verify the logging policy exists on your buckets.

      aws s3api get-bucket-logging --bucket <protected/public-bucket-name>
    Version: v13.0.0

    Configuration of Tasks

    The cumulus module exposes values for configuration for some of the provided archive and ingest tasks. Currently the following are available as configurable variables:

    cmr_search_client_config

    Configuration parameters for CMR search client for cumulus archive module tasks in the form:

    <lambda_identifier>_report_cmr_limit = <maximum number of records that can be returned from a cmr-client search; this should be greater than cmr_page_size>
    <lambda_identifier>_report_cmr_page_size = <number of records for each page returned from CMR>
    type = map(string)

    More information about cmr_limit and cmr_page_size can be found in @cumulus/cmr-client and the CMR Search API documentation.

    Currently the following values are supported:

    • create_reconciliation_report_cmr_limit
    • create_reconciliation_report_cmr_page_size

    Example

    cmr_search_client_config = {
    create_reconciliation_report_cmr_limit = 2500
    create_reconciliation_report_cmr_page_size = 250
    }

    elasticsearch_client_config

    Configuration parameters for Elasticsearch client for cumulus archive module tasks in the form:

    <lambda_identifier>_es_scroll_duration = <duration>
    <lambda_identifier>_es_scroll_size = <size>
    type = map(string)

    Currently the following values are supported:

    • create_reconciliation_report_es_scroll_duration
    • create_reconciliation_report_es_scroll_size

    Example

    elasticsearch_client_config = {
    create_reconciliation_report_es_scroll_duration = "15m"
    create_reconciliation_report_es_scroll_size = 2000
    }

    lambda_timeouts

    A configurable map of timeouts (in seconds) for cumulus ingest module task lambdas in the form:

    <lambda_identifier>_timeout: <timeout>
    type = map(string)

    Currently the following values are supported:

    • discover_granules_task_timeout
    • discover_pdrs_task_timeout
    • hyrax_metadata_update_tasks_timeout
    • lzards_backup_task_timeout
    • move_granules_task_timeout
    • parse_pdr_task_timeout
    • pdr_status_check_task_timeout
    • post_to_cmr_task_timeout
    • queue_granules_task_timeout
    • queue_pdrs_task_timeout
    • queue_workflow_task_timeout
    • sync_granule_task_timeout
    • update_granules_cmr_metadata_file_links_task_timeout

    Example

    lambda_timeouts = {
    discover_granules_task_timeout = 300
    }
    Version: v13.0.0

    About Cookbooks

    Introduction

    The following data cookbooks are documents containing examples and explanations of workflows in the Cumulus framework. Additionally, they should serve to help unify an institution/user group on a set of terms.

    Setup

    The data cookbooks assume you can configure providers, collections, and rules to run workflows. Visit Cumulus data management types for information on how to configure Cumulus data management types.

    Adding a page

    As shown in detail in the "Add a New Page and Sidebars" section in Cumulus Docs: How To's, you can add a new page to the data cookbook by creating a markdown (.md) file in the docs/data-cookbooks directory. The new page can then be linked to the sidebar by adding it to the Data-Cookbooks object in the website/sidebar.json file as data-cookbooks/${id}.

    More about workflows

    Workflow general information

    Input & Output

    Developing Workflow Tasks

    Workflow Configuration How-to's

    Version: v13.0.0

    Ingest Browse Generation

    ... provider keys with the previously entered values.) Note that you need to set the "provider_path" to the path on your bucket (e.g. "/data") where you've staged your mock/test data:

    {
    "name": "TestBrowseGeneration",
    "workflow": "DiscoverGranulesBrowseExample",
    "provider": "{{provider_from_previous_step}}",
    "collection": {
    "name": "MOD09GQ",
    "version": "006"
    },
    "meta": {
    "provider_path": "{{path_to_data}}"
    },
    "rule": {
    "type": "onetime"
    },
    "state": "ENABLED",
    "updatedAt": 1553053438767
    }

    Run Workflows

    Once you've configured the Collection and Provider and added a onetime rule, you're ready to trigger your rule, and watch the ingest workflows process.

    Go to the Rules tab, click the rule you just created:

    Screenshot of the Rules overview page with a list of rules in the Cumulus dashboard

    Then click the gear in the upper right corner and click "Rerun":

    Screenshot of clicking the button to rerun a workflow rule from the rule edit page in the Cumulus dashboard

    Tab over to executions and you should see the DiscoverGranulesBrowseExample workflow run, succeed, and then moments later the CookbookBrowseExample should run and succeed.

    Screenshot of page listing executions in the Cumulus dashboard

    Results

    You can verify your data has ingested by clicking the successful workflow entry:

    Screenshot of individual entry from table listing executions in the Cumulus dashboard

    Select "Show Output" on the next page

    Screenshot of &quot;Show output&quot; button from individual execution page in the Cumulus dashboard

    and you should see in the payload from the workflow something similar to:

    "payload": {
    "process": "modis",
    "granules": [
    {
    "files": [
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "key": "MOD09GQ___006/2017/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "type": "data",
    "bucket": "cumulus-test-sandbox-protected",
    "path": "data",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}/{substring(file.fileName, 0, 3)}",
    "size": 1908635
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "key": "MOD09GQ___006/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "type": "metadata",
    "bucket": "cumulus-test-sandbox-private",
    "path": "data",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{substring(file.fileName, 0, 3)}",
    "size": 21708
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
    "key": "MOD09GQ___006/2017/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
    "type": "browse",
    "bucket": "cumulus-test-sandbox-protected",
    "path": "data",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}/{substring(file.fileName, 0, 3)}",
    "size": 1908635
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.cmr.xml",
    "key": "MOD09GQ___006/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.cmr.xml",
    "type": "metadata",
    "bucket": "cumulus-test-sandbox-protected-2",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{substring(file.fileName, 0, 3)}"
    }
    ],
    "cmrLink": "https://cmr.uat.earthdata.nasa.gov/search/granules.json?concept_id=G1222231611-CUMULUS",
    "cmrConceptId": "G1222231611-CUMULUS",
    "granuleId": "MOD09GQ.A2016358.h13v04.006.2016360104606",
    "cmrMetadataFormat": "echo10",
    "dataType": "MOD09GQ",
    "version": "006",
    "published": true
    }
    ]
    }

    You can verify the granules exist within your Cumulus instance (search using the Granules interface, check the S3 buckets, etc.) and validate the CMR entry shown in the output above.


    Build Processing Lambda

    This section discusses the construction of a custom processing lambda to replace the contrived example from this entry for a real dataset processing task.

    To ingest your own data using this example, you will need to construct your own lambda to replace the source in ProcessingStep that will generate browse imagery and provide or update a CMR metadata export file.

    You will then need to add the lambda to your Cumulus deployment as an aws_lambda_function Terraform resource.

    The discussion below outlines requirements for this lambda.

    Inputs

    The incoming message to the task defined in the ProcessingStep as configured will have the following configuration values (accessible inside event.config courtesy of the message adapter):

    Configuration

    • event.config.bucket -- the name of the bucket configured in terraform.tfvars as your internal bucket.

    • event.config.collection -- The full collection object we will configure in the Configure Ingest section. You can view the expected collection schema in the docs here or in the source code on github. You need this as available input and output so you can update as needed.

    event.config.additionalUrls, generateFakeBrowse, and event.config.cmrMetadataFormat from the example can be ignored, as they're configuration flags for the provided example script.

    Payload

    The 'payload' from the previous task is accessible via event.input. The expected payload output schema from SyncGranules can be viewed here.

    In our example, the payload would look like the following. Note: The types are set per-file based on what we configured in our collection, and were initially added as part of the DiscoverGranules step in the DiscoverGranulesBrowseExample workflow.

     "payload": {
    "process": "modis",
    "granules": [
    {
    "granuleId": "MOD09GQ.A2016358.h13v04.006.2016360104606",
    "dataType": "MOD09GQ",
    "version": "006",
    "files": [
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "bucket": "cumulus-test-sandbox-internal",
    "key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "size": 1908635
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "bucket": "cumulus-test-sandbox-internal",
    "key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "size": 21708
    }
    ]
    }
    ]
    }

    Generating Browse Imagery

    The provided example script goes through all granules and adds a 'fake' .jpg browse file to the same staging location as the data staged by prior ingest tasks.

    The processing lambda you construct will need to do the following:

    • Create a browse image file based on the input data, and stage it to a location accessible to both this task and the FilesToGranules and MoveGranules tasks in a S3 bucket.
    • Add the browse file to the input granule files, making sure to set the granule file's type to browse.
    • Update meta.input_granules with the updated granules list, as well as provide the files to be integrated by FilesToGranules as output from the task.

    Generating/updating CMR metadata

    If you do not already have a CMR file in the granules list, you will need to generate one for valid export. This example's processing script generates one and adds it to the FilesToGranules file list via the payload, but it can also be present in the InputGranules from the DiscoverGranules task if you'd prefer to pre-generate it.

    The downstream tasks MoveGranules, UpdateGranulesCmrMetadataFileLinks, and PostToCmr all expect a valid CMR file to be available if you want to export to CMR.

    Expected Outputs for processing task/tasks

    In the above example, the critical portion of the output to FilesToGranules is the payload and meta.input_granules.

    In the example provided, the processing task is set up to return an object with the keys "files" and "granules". In the cumulus_message configuration, files are mapped to the payload and granules to meta.input_granules:

              "task_config": {
    "inputGranules": "{$.meta.input_granules}",
    "granuleIdExtraction": "{$.meta.collection.granuleIdExtraction}"
    }

    Their expected values from the example above may be useful in constructing a processing task:

    payload

    The payload includes a full list of files to be 'moved' into the cumulus archive. The FilesToGranules task will take this list, merge it with the information from InputGranules, then pass that list to the MoveGranules task. The MoveGranules task will then move the files to their targets. If a CMR metadata file is present, the UpdateGranulesCmrMetadataFileLinks task will update it with the new granule locations and update the CMR file etags.

    In the provided example, a payload being passed to the FilesToGranules task should be expected to look like:

      "payload": [
    "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
    "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.cmr.xml"
    ]

    This is the list of files FilesToGranules will act upon to add/merge with the input_granules object.

    The paths shown are generated by SyncGranule, but in principle the files can be staged wherever you like, so long as the processing/MoveGranules tasks' roles have access and the filenames match the collection configuration.

    input_granules

    The FilesToGranules task utilizes the incoming payload to choose which files to move, but pulls all other metadata from meta.input_granules. As such, the output payload in the example would look like:

    "input_granules": [
    {
    "granuleId": "MOD09GQ.A2016358.h13v04.006.2016360104606",
    "dataType": "MOD09GQ",
    "version": "006",
    "files": [
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "bucket": "cumulus-test-sandbox-internal",
    "key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "size": 1908635
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "bucket": "cumulus-test-sandbox-internal",
    "key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "size": 21708
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
    "bucket": "cumulus-test-sandbox-internal",
    "key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.jpg"
    }
    ]
    }
    ],
    Version: v13.0.0

    Choice States

    Cumulus supports AWS Step Function Choice states. A Choice state enables branching logic in Cumulus workflows.

    Choice state definitions include a list of Choice Rules. Each Choice Rule defines a logical operation which compares an input value against a value using a comparison operator. For available comparison operators, review the AWS docs.

    If the comparison evaluates to true, the Next state is followed.

    Example

    In examples/cumulus-tf/parse_pdr_workflow.tf the ParsePdr workflow uses a Choice state, CheckAgainChoice, to terminate the workflow once meta.isPdrFinished: true is returned by the CheckStatus state.

    The CheckAgainChoice state definition requires an input object of the following structure:

    {
    "meta": {
    "isPdrFinished": false
    }
    }

    Given the above input to the CheckAgainChoice state, the workflow would transition to the PdrStatusReport state.

    "CheckAgainChoice": {
    "Type": "Choice",
    "Choices": [
    {
    "Variable": "$.meta.isPdrFinished",
    "BooleanEquals": false,
    "Next": "PdrStatusReport"
    },
    {
    "Variable": "$.meta.isPdrFinished",
    "BooleanEquals": true,
    "Next": "WorkflowSucceeded"
    }
    ],
    "Default": "WorkflowSucceeded"
    }

    Advanced: Loops in Cumulus Workflows

    Understanding the complete ParsePdr workflow is not necessary to understanding how Choice states work, but ParsePdr provides an example of how Choice states can be used to create a loop in a Cumulus workflow.

    In the complete ParsePdr workflow definition, the state QueueGranules is followed by CheckStatus. From CheckStatus a loop starts: as long as CheckStatus returns meta.isPdrFinished: false, CheckStatus is followed by CheckAgainChoice, which is followed by PdrStatusReport, which is followed by WaitForSomeTime, which returns to CheckStatus. Once CheckStatus returns meta.isPdrFinished: true, CheckAgainChoice proceeds to WorkflowSucceeded.

    Execution graph of SIPS ParsePdr workflow in AWS Step Functions console

    Further documentation

    For complete details on Choice state configuration options, see the Choice state documentation.

    Version: v13.0.0

    CNM Workflow

    This entry documents how to set up a workflow that utilizes the built-in CNM/Kinesis functionality in Cumulus.

    Prior to working through this entry you should be familiar with the Cloud Notification Mechanism.

    Sections


    Prerequisites

    Cumulus

    This entry assumes you have a deployed instance of Cumulus (version >= 1.16.0). The entry assumes you are deploying Cumulus via the cumulus terraform module sourced from the release page.

    AWS CLI

    This entry assumes you have the AWS CLI installed and configured. If you do not, please take a moment to review the documentation - particularly the examples relevant to Kinesis - and install it now.

    Kinesis

    This entry assumes you already have two Kinesis data streams created for use as the CNM notification and response data streams.

    If you do not have two streams set up, please take a moment to review the Kinesis documentation and set up two basic single-shard streams for this example.

    Using the "Create Data Stream" button on the Kinesis Dashboard, you should be able to quickly work through the dialogue and set up streams similar to the following example:
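
    If you prefer the command line, a minimal sketch for creating the two single-shard streams via the AWS CLI is shown below; the stream names are examples only, not names Cumulus requires.

    aws kinesis create-stream --stream-name example-cnm-notification --shard-count 1
    aws kinesis create-stream --stream-name example-cnm-response --shard-count 1

    # create-stream is asynchronous; wait for both streams to become ACTIVE before using them
    aws kinesis wait stream-exists --stream-name example-cnm-notification
    aws kinesis wait stream-exists --stream-name example-cnm-response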

    Screenshot of AWS console page for creating a Kinesis stream

    Please bear in mind that your {{prefix}}-lambda-processing IAM role will need permissions to write to the response stream for this workflow to succeed if you create the Kinesis stream with a dashboard user. If you are using the cumulus top-level module for your deployment this should be set properly.

    If not, the most straightforward approach is to attach the AmazonKinesisFullAccess policy for the stream resource to whatever role your Lambdas are using; however, your environment/security policies may require an approach specific to your deployment environment.
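
    If your security posture allows the managed-policy approach described above, a sketch of attaching it via the AWS CLI is below; the role name is a placeholder for the role your Lambdas actually use, while the policy ARN is AWS's managed AmazonKinesisFullAccess policy.

    # Attach the AWS-managed Kinesis policy to the Lambda processing role (role name is an example)
    aws iam attach-role-policy \
      --role-name <prefix>-lambda-processing \
      --policy-arn arn:aws:iam::aws:policy/AmazonKinesisFullAccess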

    In operational environments it's likely science data providers would typically be responsible for providing a Kinesis stream with the appropriate permissions.

    For more information on how this process works and how to develop a process that will add records to a stream, read the Kinesis documentation and the developer guide.

    Source Data

    This entry will run the SyncGranule task against a single target data file. To that end it will require a single data file to be present in an S3 bucket matching the Provider configured in the next section.

    Collection and Provider

    Cumulus will need to be configured with a Collection and Provider entry of your choosing. The provider should match the location of the source data from the Ingest Source Data section.

    This can be done via the Cumulus dashboard (if installed) or via the API. It is strongly recommended to use the dashboard if possible.


    Configure the Workflow

    Provided the prerequisites have been fulfilled, you can begin adding the needed values to your Cumulus configuration to configure the example workflow.

    The following steps are required to set up your Cumulus instance to run the example workflow:

    Example CNM Workflow

    In this example, we're going to trigger a workflow by creating a Kinesis rule and sending a record to a Kinesis stream.

    The following workflow definition should be added to a new .tf workflow resource (e.g. cnm_workflow.tf) in your deployment directory. For the complete CNM workflow example, see examples/cumulus-tf/kinesis_trigger_test_workflow.tf.

    Add the following to the new terraform file in your deployment directory, updating the following:

    • Set the response-endpoint key in the CnmResponse task in the workflow JSON to match the name of the Kinesis response stream you configured in the prerequisites section
    • Update the source key to the workflow module to match the Cumulus release associated with your deployment.
    module "cnm_workflow" {
    source = "https://github.com/nasa/cumulus/releases/download/{version}/terraform-aws-cumulus-workflow.zip"

    prefix = var.prefix
    name = "CNMExampleWorkflow"
    workflow_config = module.cumulus.workflow_config
    system_bucket = var.system_bucket

    state_machine_definition = <<JSON
    {
    "Comment": "CNMExampleWorkflow",
    "StartAt": "TranslateMessage",
    "States": {
    "TranslateMessage": {
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_to_cma_task.arn}",
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "collection": "{$.meta.collection}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.cnm}",
    "destination": "{$.meta.cnm}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "CnmResponse"
    }
    ],
    "Next": "SyncGranule"
    },
    "SyncGranule": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "provider": "{$.meta.provider}",
    "buckets": "{$.meta.buckets}",
    "collection": "{$.meta.collection}",
    "downloadBucket": "{$.meta.buckets.private.name}",
    "stack": "{$.meta.stack}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.granules}",
    "destination": "{$.meta.input_granules}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${module.cumulus.sync_granule_task.task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "IntervalSeconds": 10,
    "MaxAttempts": 3
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "CnmResponse"
    }
    ],
    "Next": "CnmResponse"
    },
    "CnmResponse": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "OriginalCNM": "{$.meta.cnm}",
    "distribution_endpoint": "{$.meta.distribution_endpoint}",
    "response-endpoint": "ADD YOUR RESPONSE STREAM NAME HERE",
    "region": "us-east-1",
    "type": "kinesis",
    "WorkflowException": "{$.exception}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.cnm}",
    "destination": "{$.meta.cnmResponse}"
    },
    {
    "source": "{$.input.input}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_response_task.arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "IntervalSeconds": 5,
    "MaxAttempts": 3
    }
    ],
    "End": true
    }
    }
    }
    JSON
    }

    Again, please make sure to modify the response-endpoint value to match the stream name (not the ARN) of your Kinesis response stream.

    Lambda Configuration

    To execute this workflow, you're required to include several Lambda resources in your deployment. To do this, add the following task (Lambda) definitions to your deployment along with the workflow you created above:

    Please note: To utilize these tasks you need to ensure you have a compatible CMA layer. See the deployment instructions for more details on how to deploy a CMA layer.

    Below is a description of each of these tasks:

    CNMToCMA

    CNMToCMA is meant for the beginning of a workflow: it maps CNM granule information to a payload for downstream tasks. For other CNM workflows, you would need to ensure that downstream tasks in your workflow either understand the CNM message or include a translation task like this one.

    You can also manipulate the data sent to downstream tasks using task_config for various states in your workflow resource configuration. Read more about how to configure data on the Workflow Input & Output page.

    CnmResponse

    The CnmResponse Lambda generates a CNM response message and puts it on the response-endpoint Kinesis stream.

    You can read more about the expected schema of a CnmResponse record in the Cloud Notification Mechanism schema repository.

    Additional Tasks

    Lastly, this entry also makes use of the SyncGranule task from the cumulus module.

    Redeploy

    Once the above configuration changes have been made, redeploy your stack.

    Please refer to Update Cumulus resources in the deployment documentation if you are unfamiliar with redeployment.
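
    For reference, and assuming a standard Terraform-based Cumulus deployment as described in the deployment documentation, redeployment typically amounts to the following from your deployment directory:

    # Review the planned changes, then apply them
    terraform plan
    terraform apply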

    Rule Configuration

    Cumulus includes a messageConsumer Lambda function (message-consumer). Cumulus kinesis-type rules create the event source mappings between Kinesis streams and the messageConsumer Lambda. The messageConsumer Lambda consumes records from one or more Kinesis streams, as defined by enabled kinesis-type rules. When new records are pushed to one of these streams, the messageConsumer triggers workflows associated with the enabled kinesis-type rules.

    To add a rule via the dashboard (if you'd like to use the API, see the docs here), navigate to the Rules page and click Add a rule, then configure the new rule using the following template (substituting correct values for parameters denoted by ${}):

    {
    "collection": {
    "name": "L2_HR_PIXC",
    "version": "000"
    },
    "name": "L2_HR_PIXC_kinesisRule",
    "provider": "PODAAC_SWOT",
    "rule": {
    "type": "kinesis",
    "value": "arn:aws:kinesis:{{awsRegion}}:{{awsAccountId}}:stream/{{streamName}}"
    },
    "state": "ENABLED",
    "workflow": "CNMExampleWorkflow"
    }

    Please Note:

    • The rule's value attribute must match the Amazon Resource Name (ARN) of the Kinesis data stream you've preconfigured. You should be able to obtain this ARN from the Kinesis Dashboard entry for the selected stream.
    • The collection and provider should match the collection and provider you setup in the Prerequisites section.

    Once you've clicked 'submit', a new rule should appear in the dashboard's Rule Overview.


    Execute the Workflow

    Once Cumulus has been redeployed and a rule has been added, we're ready to trigger the workflow and watch it execute.

    How to Trigger the Workflow

    To trigger matching workflows, you will need to put a record on the Kinesis stream that the message-consumer Lambda will recognize as a matching event. Most importantly, it should include a collection name that matches a valid collection.

    For the purpose of this example, the easiest way to accomplish this is using the AWS CLI.

    Create Record JSON

    Construct a JSON file containing an object that matches the values that have been previously setup. This JSON object should be a valid Cloud Notification Mechanism message.

    Please note: this example is somewhat contrived, as the downstream tasks don't care about most of these fields. A 'real' data ingest workflow would.

    The following values (denoted by ${} in the sample below) should be replaced to match values we've previously configured:

    • TEST_DATA_FILE_NAME: The filename of the test data that is available in the S3 (or other) provider we created earlier.
    • TEST_DATA_URI: The full S3 path to the test data (e.g. s3://bucket-name/path/granule)
    • COLLECTION: The collection name defined in the prerequisites for this product
    {
    "product": {
    "files": [
    {
    "checksumType": "md5",
    "name": "${TEST_DATA_FILE_NAME}",
    "checksum": "bogus_checksum_value",
    "uri": "${TEST_DATA_URI}",
    "type": "data",
    "size": 12345678
    }
    ],
    "name": "${TEST_DATA_FILE_NAME}",
    "dataVersion": "006"
    },
    "identifier ": "testIdentifier123456",
    "collection": "${COLLECTION}",
    "provider": "TestProvider",
    "version": "001",
    "submissionTime": "2017-09-30T03:42:29.791198"
    }

    Add Record to Kinesis Data Stream

    Using the JSON file you created, push it to the Kinesis notification stream:

    aws kinesis put-record --stream-name YOUR_KINESIS_NOTIFICATION_STREAM_NAME_HERE --partition-key 1 --data file:///path/to/file.json

    Please note: The above command uses the stream name, not the ARN.

    The command should return output similar to:

    {
    "ShardId": "shardId-000000000000",
    "SequenceNumber": "42356659532578640215890215117033555573986830588739321858"
    }

    This command will put a record containing the JSON from the --data flag onto the Kinesis data stream. The messageConsumer Lambda will consume the record and construct a valid CMA payload to trigger workflows. For this example, the record will trigger the CNMExampleWorkflow workflow as defined by the rule previously configured.

    You can view the current running executions on the Executions dashboard page which presents a list of all executions, their status (running, failed, or completed), to which workflow the execution belongs, along with other information.

    Verify Workflow Execution

    As detailed above, once the record is added to the Kinesis data stream, the messageConsumer Lambda will trigger the CNMExampleWorkflow.

    TranslateMessage

    TranslateMessage (which corresponds to the CNMToCMA Lambda) will take the CNM object payload and add a granules object to the CMA payload that's consistent with other Cumulus ingest tasks, and add a meta.cnm key (as well as the payload) to store the original message.

    For more on the Message Adapter, please see the Message Flow documentation.

    An example of what is happening in the CNMToCMA Lambda is as follows:

    Example Input Payload:

    "payload": {
    "identifier ": "testIdentifier123456",
    "product": {
    "files": [
    {
    "checksumType": "md5",
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "checksum": "bogus_checksum_value",
    "uri": "s3://some_bucket/cumulus-test-data/pdrs/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "type": "data",
    "size": 12345678
    }
    ],
    "name": "TestGranuleUR",
    "dataVersion": "006"
    },
    "version": "123456",
    "collection": "MOD09GQ",
    "provider": "TestProvider",
    "submissionTime": "2017-09-30T03:42:29.791198"
    }

    Example Output Payload:

      "payload": {
    "cnm": {
    "identifier ": "testIdentifier123456",
    "product": {
    "files": [
    {
    "checksumType": "md5",
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "checksum": "bogus_checksum_value",
    "uri": "s3://some-bucket/cumulus-test-data/data/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "type": "data",
    "size": 12345678
    }
    ],
    "name": "TestGranuleUR",
    "dataVersion": "006"
    },
    "version": "123456",
    "collection": "MOD09GQ",
    "provider": "TestProvider",
    "submissionTime": "2017-09-30T03:42:29.791198",
    "receivedTime": "2017-09-30T03:42:31.634552"
    },
    "output": {
    "granules": [
    {
    "granuleId": "TestGranuleUR",
    "files": [
    {
    "path": "some-bucket/data",
    "url_path": "s3://some-bucket/cumulus-test-data/data/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "bucket": "some-bucket",
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "size": 12345678
    }
    ]
    }
    ]
    }
    }

    SyncGranules

    This Lambda will take the files listed in the payload and move them to s3://{deployment-private-bucket}/file-staging/{deployment-name}/{COLLECTION}/{file_name}.

    CnmResponse

    Assuming a successful execution of the workflow, this task will recover the meta.cnm key from the CMA output, and add a "SUCCESS" record to the notification Kinesis stream.

    If a prior step in the workflow has failed, this will add a "FAILURE" record to the stream instead.

    The data written to the response-endpoint should adhere to the Response Message Fields schema.

    Example CNM Success Response:

    {
    "provider": "PODAAC_SWOT",
    "collection": "SWOT_Prod_l2:1",
    "processCompleteTime": "2017-09-30T03:45:29.791198",
    "submissionTime": "2017-09-30T03:42:29.791198",
    "receivedTime": "2017-09-30T03:42:31.634552",
    "identifier": "1234-abcd-efg0-9876",
    "response": {
    "status": "SUCCESS"
    }
    }

    Example CNM Error Response:

    {
    "provider": "PODAAC_SWOT",
    "collection": "SWOT_Prod_l2:1",
    "processCompleteTime": "2017-09-30T03:45:29.791198",
    "submissionTime": "2017-09-30T03:42:29.791198",
    "receivedTime": "2017-09-30T03:42:31.634552",
    "identifier": "1234-abcd-efg0-9876",
    "response": {
    "status": "FAILURE",
    "errorCode": "PROCESSING_ERROR",
    "errorMessage": "File [cumulus-dev-a4d38f59-5e57-590c-a2be-58640db02d91/prod_20170926T11:30:36/production_file.nc] did not match gve checksum value."
    }
    }

    Note the CnmResponse state defined in the .tf workflow definition above configures $.exception to be passed to the CnmResponse Lambda keyed under config.WorkflowException. This is required for the CnmResponse code to deliver a failure response.

    To test the failure scenario, send a record missing the product.name key.
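
    For example, assuming you save a copy of the earlier record JSON to a new file with the product.name key removed, you can push it the same way as before:

    # Uses the stream name, not the ARN, just like the success-path example
    aws kinesis put-record \
      --stream-name YOUR_KINESIS_NOTIFICATION_STREAM_NAME_HERE \
      --partition-key 1 \
      --data file:///path/to/file-missing-product-name.json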


    Verify results

    Check for successful execution on the dashboard

    Following the successful execution of this workflow, you should expect to see the workflow complete successfully on the dashboard:

    Screenshot of a successful CNM workflow appearing on the executions page of the Cumulus dashboard

    Check the test granule has been delivered to S3 staging

    The test granule identified in the Kinesis record should be moved to the deployment's private staging area.

    Check for Kinesis records

    A SUCCESS notification should be present on the response-endpoint Kinesis stream.

    You should be able to validate the notification and response streams have the expected records with the following steps (the AWS CLI Kinesis Basic Stream Operations is useful to review before proceeding):

    Get a shard iterator (substituting your stream name as appropriate):

    aws kinesis get-shard-iterator \
    --shard-id shardId-000000000000 \
    --shard-iterator-type LATEST \
    --stream-name NOTIFICATION_OR_RESPONSE_STREAM_NAME

    which should return output similar to:

    {
    "ShardIterator": "VeryLongString=="
    }
    • Re-trigger the workflow by using the put-record command from the Add Record to Kinesis Data Stream step above.
    • As the workflow completes, use the output from the get-shard-iterator command to request data from the stream:
    aws kinesis get-records --shard-iterator SHARD_ITERATOR_VALUE

    This should result in output similar to:

    {
    "Records": [
    {
    "SequenceNumber": "49586720336541656798369548102057798835250389930873978882",
    "ApproximateArrivalTimestamp": 1532664689.128,
    "Data": "eyJpZGVudGlmaWVyICI6InRlc3RJZGVudGlmaWVyMTIzNDU2IiwidmVyc2lvbiI6IjAwNiIsImNvbGxlY3Rpb24iOiJNT0QwOUdRIiwicHJvdmlkZXIiOiJUZXN0UHJvdmlkZXIiLCJwcm9kdWN0U2l6ZSI6MTkwODYzNS4wLCJyZXNwb25zZSI6eyJzdGF0dXMiOiJTVUNDRVNTIn0sInByb2Nlc3NDb21wbGV0ZVRpbWUiOiIyMDE4LTA3LTI3VDA0OjExOjI4LjkxOSJ9",
    "PartitionKey": "1"
    },
    {
    "SequenceNumber": "49586720336541656798369548102059007761070005796999266306",
    "ApproximateArrivalTimestamp": 1532664707.149,
    "Data": "eyJpZGVudGlmaWVyICI6InRlc3RJZGVudGlmaWVyMTIzNDU2IiwidmVyc2lvbiI6IjAwNiIsImNvbGxlY3Rpb24iOiJNT0QwOUdRIiwicHJvdmlkZXIiOiJUZXN0UHJvdmlkZXIiLCJwcm9kdWN0U2l6ZSI6MTkwODYzNS4wLCJyZXNwb25zZSI6eyJzdGF0dXMiOiJTVUNDRVNTIn0sInByb2Nlc3NDb21wbGV0ZVRpbWUiOiIyMDE4LTA3LTI3VDA0OjExOjQ2Ljk1OCJ9",
    "PartitionKey": "1"
    }
    ],
    "NextShardIterator": "AAAAAAAAAAFo9SkF8RzVYIEmIsTN+1PYuyRRdlj4Gmy3dBzsLEBxLo4OU+2Xj1AFYr8DVBodtAiXbs3KD7tGkOFsilD9R5tA+5w9SkGJZ+DRRXWWCywh+yDPVE0KtzeI0andAXDh9yTvs7fLfHH6R4MN9Gutb82k3lD8ugFUCeBVo0xwJULVqFZEFh3KXWruo6KOG79cz2EF7vFApx+skanQPveIMz/80V72KQvb6XNmg6WBhdjqAA==",
    "MillisBehindLatest": 0
    }

    Note that the Data field is base64-encoded and would need to be decoded/parsed to be interpretable. There are many options for building a Kinesis consumer, such as the KCL.
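
    For a quick look without building a consumer, you can decode a single record's Data field on the command line; this assumes the jq and base64 utilities are available in your shell.

    aws kinesis get-records --shard-iterator SHARD_ITERATOR_VALUE \
      | jq -r '.Records[0].Data' \
      | base64 --decode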

    For purposes of validating the workflow, it may be simpler to locate the workflow in the Step Function Management Console and assert the expected output is similar to the below examples.

    Successful CNM Response Object Example:

    {
    "cnmResponse": {
    "provider": "TestProvider",
    "collection": "MOD09GQ",
    "version": "123456",
    "processCompleteTime": "2017-09-30T03:45:29.791198",
    "submissionTime": "2017-09-30T03:42:29.791198",
    "receivedTime": "2017-09-30T03:42:31.634552",
    "identifier ": "testIdentifier123456",
    "response": {
    "status": "SUCCESS"
    }
    }
    }

    Kinesis Record Error Handling

    messageConsumer

    The default Kinesis stream processing in the Cumulus system is configured for record error tolerance.

    When the messageConsumer fails to process a record, the failure is captured and the record is published to the kinesisFallback SNS Topic. The kinesisFallback SNS topic broadcasts the record and a subscribed copy of the messageConsumer Lambda named kinesisFallback consumes these failures.

    At this point, the normal Lambda asynchronous invocation retry behavior will attempt to process the record 3 more times. After this, if the record cannot successfully be processed, it is written to a dead letter queue. Cumulus' dead letter queue is an SQS Queue named kinesisFailure. Operators can use this queue to inspect failed records.

    This system ensures that when the messageConsumer fails to process a record and trigger a workflow, the record is retried 3 times. This retry behavior improves system reliability in case of any external service failure outside of Cumulus control.

    The Kinesis error handling system - the kinesisFallback SNS topic, messageConsumer Lambda, and kinesisFailure SQS queue - comes with the API package and does not need to be configured by the operator.

    To examine records that could not be processed at any step, look at the dead letter queue {{prefix}}-kinesisFailure in the Simple Queue Service (SQS) console. Select your queue, and under the Queue Actions tab, choose View/Delete Messages. Start polling for messages and you will see records that failed to process through the messageConsumer.
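
    If you prefer the CLI to the console, a sketch of the same inspection is below (substituting your deployment's prefix and the queue URL returned by the first command):

    # Look up the queue URL, then pull a batch of failed records for inspection
    aws sqs get-queue-url --queue-name <prefix>-kinesisFailure
    aws sqs receive-message \
      --queue-url <queue-url-from-previous-command> \
      --max-number-of-messages 10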

    Note that these are only records of failures that occurred when processing records from Kinesis streams. Workflow failures are handled differently.

    Kinesis Stream logging

    Notification Stream messages

    Cumulus includes two Lambdas (KinesisInboundEventLogger and KinesisOutboundEventLogger) that utilize the same code to take a Kinesis record event as input, deserialize the data field and output the modified event to the logs.

    When a kinesis rule is created, in addition to the messageConsumer event mapping, an event mapping is created to trigger KinesisInboundEventLogger to record a log of the inbound record, to allow for analysis in case of unexpected failure.

    Response Stream messages

    Cumulus also supports this feature for all outbound messages. To take advantage of this feature, you will need to set an event mapping on the KinesisOutboundEventLogger Lambda that targets your response-endpoint. You can do this in the Lambda management page for KinesisOutboundEventLogger. Add a Kinesis trigger, and configure it to target the cnmResponseStream for your workflow:

    Screenshot of the AWS console showing configuration for Kinesis stream trigger on KinesisOutboundEventLogger Lambda

    Once this is done, all records sent to the response-endpoint will also be logged in CloudWatch. For more on configuring Lambdas to trigger on Kinesis events, please see creating an event source mapping.

    Version: v13.0.0

    Error Handling in Workflows

    ... Service Exception. See this documentation on configuring your workflow to handle transient lambda errors.

    Example state machine definition:

    {
    "Comment": "Tests Workflow from Kinesis Stream",
    "StartAt": "TranslateMessage",
    "States": {
    "TranslateMessage": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.cnm}",
    "destination": "{$.meta.cnm}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_to_cma_task.arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": ["States.ALL"],
    "ResultPath": "$.exception",
    "Next": "CnmResponseFail"
    }
    ],
    "Next": "SyncGranule"
    },
    "SyncGranule": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "Path": "$.payload",
    "TargetPath": "$.payload"
    },
    "task_config": {
    "provider": "{$.meta.provider}",
    "buckets": "{$.meta.buckets}",
    "collection": "{$.meta.collection}",
    "downloadBucket": "{$.meta.buckets.private.name}",
    "stack": "{$.meta.stack}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.granules}",
    "destination": "{$.meta.input_granules}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${module.cumulus.sync_granule_task.task_arn}",
    "Retry": [
    {
    "ErrorEquals": ["States.ALL"],
    "IntervalSeconds": 10,
    "MaxAttempts": 3
    }
    ],
    "Catch": [
    {
    "ErrorEquals": ["States.ALL"],
    "ResultPath": "$.exception",
    "Next": "CnmResponseFail"
    }
    ],
    "Next": "CnmResponse"
    },
    "CnmResponse": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "OriginalCNM": "{$.meta.cnm}",
    "CNMResponseStream": "{$.meta.cnmResponseStream}",
    "region": "us-east-1",
    "WorkflowException": "{$.exception}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$}",
    "destination": "{$.meta.cnmResponse}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_response_task.arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": ["States.ALL"],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "Next": "WorkflowSucceeded"
    },
    "CnmResponseFail": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "OriginalCNM": "{$.meta.cnm}",
    "CNMResponseStream": "{$.meta.cnmResponseStream}",
    "region": "us-east-1",
    "WorkflowException": "{$.exception}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$}",
    "destination": "{$.meta.cnmResponse}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_response_task.arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": ["States.ALL"],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "Next": "WorkflowFailed"
    },
    "WorkflowSucceeded": {
    "Type": "Succeed"
    },
    "WorkflowFailed": {
    "Type": "Fail",
    "Cause": "Workflow failed"
    }
    }
    }

    The above results in a workflow which is visualized in the diagram below:

    Screenshot of a visualization of an AWS Step Function workflow definition with branching logic for failures

    Summary

    Error handling should (mostly) be the domain of workflow configuration.

    Version: v13.0.0

    HelloWorld Workflow

    Example task meant to be a sanity check/introduction to the Cumulus workflows.

    Pre-Deployment Configuration

    Workflow Configuration

    A workflow definition can be found in the template repository hello_world_workflow module.

    {
    "Comment": "Returns Hello World",
    "StartAt": "HelloWorld",
    "States": {
    "HelloWorld": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "buckets": "{$.meta.buckets}",
    "provider": "{$.meta.provider}",
    "collection": "{$.meta.collection}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${module.cumulus.hello_world_task.task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "End": true
    }
    }
    }

    Workflow error-handling can be configured as discussed in the Error-Handling cookbook.

    Task Configuration

The HelloWorld task is provided for you as part of the cumulus Terraform module; no configuration is needed.

If you want to manually deploy your own version of this Lambda for testing, you can copy the Lambda resource definition located in the Cumulus source code at cumulus/tf-modules/ingest/hello-world-task.tf. The Lambda source code is located at cumulus/tasks/hello-world.
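
For reference, a minimal standalone resource definition might look like the following sketch. The function name, zip path, handler, and runtime shown here are hypothetical placeholders; the definition in the Cumulus source noted above is authoritative.

resource "aws_lambda_function" "my_hello_world" {
  function_name    = "${var.prefix}-MyHelloWorld"                   # hypothetical name
  filename         = "./hello-world-lambda.zip"                     # zip assumed to be built from cumulus/tasks/hello-world
  source_code_hash = filebase64sha256("./hello-world-lambda.zip")
  handler          = "index.handler"                                # assumed handler
  runtime          = "nodejs14.x"                                   # use a runtime supported by your Cumulus version
  role             = module.cumulus.lambda_processing_role_arn
}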

    Execution

    We will focus on using the Cumulus dashboard to schedule the execution of a HelloWorld workflow.

    Our goal here is to create a rule through the Cumulus dashboard that will define the scheduling and execution of our HelloWorld workflow. Let's navigate to the Rules page and click Add a rule.

{
  "collection": {                      # collection values can be configured and found on the Collections page
    "name": "${collection_name}",
    "version": "${collection_version}"
  },
  "name": "helloworld_rule",
  "provider": "${provider}",           # found on the Providers page
  "rule": {
    "type": "onetime"
  },
  "state": "ENABLED",
  "workflow": "HelloWorldWorkflow"     # This can be found on the Workflows page
}

    Screenshot of AWS Step Function execution graph for the HelloWorld workflow Executed workflow as seen in AWS Console

    Output/Results

    The Executions page presents a list of all executions, their status (running, failed, or completed), to which workflow the execution belongs, along with other information. The rule defined in the previous section should start an execution of its own accord, and the status of that execution can be tracked here.

    To get some deeper information on the execution, click on the value in the Name column of your execution of interest. This should bring up a visual representation of the workflow similar to that shown above, execution details, and a list of events.

    Summary

    Setting up the HelloWorld workflow on the Cumulus dashboard is the tip of the iceberg, so to speak. The task and step-function need to be configured before Cumulus deployment. A compatible collection and provider must be configured and applied to the rule. Finally, workflow execution status can be viewed via the workflows tab on the dashboard.

    - + \ No newline at end of file diff --git a/docs/v13.0.0/data-cookbooks/ingest-notifications/index.html b/docs/v13.0.0/data-cookbooks/ingest-notifications/index.html index a1d22354a99..504018b1daa 100644 --- a/docs/v13.0.0/data-cookbooks/ingest-notifications/index.html +++ b/docs/v13.0.0/data-cookbooks/ingest-notifications/index.html @@ -5,13 +5,13 @@ Ingest Notification in Workflows | Cumulus Documentation - +
    Version: v13.0.0

    Ingest Notification in Workflows

On deployment, an SQS queue and three SNS topics (one each for executions, granules, and PDRs) are created and used for handling notification messages related to the workflow.

The ingest notification reporting SQS queue is populated via a Cloudwatch rule for any Step Function execution state transitions. The sfEventSqsToDbRecords Lambda consumes this queue. The queue and Lambda are defined in the cumulus module, the Cloudwatch rule in the workflow module, and all are included by default in a Cumulus deployment.

    The sfEventSqsToDbRecords Lambda function reads from the sfEventSqsToDbRecordsInputQueue queue and updates the RDS database records for granules, executions, and PDRs. When the records are updated, messages are posted to the three SNS topics. This Lambda is invoked both when the workflow starts and when it reaches a terminal state (completion or failure).

    Diagram of architecture for reporting workflow ingest notifications from AWS Step Functions

    Sending SQS messages to report status

    Publishing granule/PDR reports directly to the SQS queue

If you have a non-Cumulus workflow or process ingesting data and would like to update the status of your granules or PDRs, you can publish directly to the reporting SQS queue. Publishing messages to this queue will result in those messages being stored as granule/PDR records in the Cumulus database, making the status of those granules/PDRs visible on the Cumulus dashboard. Note that the queue expects a Cumulus Message nested within a Cloudwatch Step Function Event object (see the expected format below).

Posting directly to the queue requires knowing the queue URL. Assuming that you are using the cumulus module for your deployment, you can get the queue URL by adding the following outputs to outputs.tf for your Terraform deployment, as in our example deployment:

    output "stepfunction_event_reporter_queue_url" {
    value = module.cumulus.stepfunction_event_reporter_queue_url
    }

    output "report_executions_sns_topic_arn" {
    value = module.cumulus.report_executions_sns_topic_arn
    }
    output "report_granules_sns_topic_arn" {
    value = module.cumulus.report_executions_sns_topic_arn
    }
    output "report_pdrs_sns_topic_arn" {
    value = module.cumulus.report_pdrs_sns_topic_arn
    }

Then, when you run terraform apply, you should see the queue URL and topic ARNs printed to your console:

    Outputs:
    ...
    stepfunction_event_reporter_queue_url = https://sqs.us-east-1.amazonaws.com/xxxxxxxxx/<prefix>-sfEventSqsToDbRecordsInputQueue
    report_executions_sns_topic_arn = arn:aws:sns:us-east-1:xxxxxxxxx:<prefix>-report-executions-topic
report_granules_sns_topic_arn = arn:aws:sns:us-east-1:xxxxxxxxx:<prefix>-report-granules-topic
    report_pdrs_sns_topic_arn = arn:aws:sns:us-east-1:xxxxxxxxx:<prefix>-report-pdrs-topic

Once you have the queue URL, you can use the AWS SDK for your language of choice to publish messages to the queue. The expected format of these messages is that of a Cloudwatch Step Function event containing a Cumulus message. For SUCCEEDED events, the Cumulus message is expected to be in detail.output. For all other event statuses, a Cumulus message is expected in detail.input. The Cumulus message populating these fields MUST be a JSON string, not an object. Messages that do not conform to the schemas will fail to be created as records.
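
As a hedged illustration, a minimal SUCCEEDED status message body might look like the sketch below; the exact set of required fields is governed by the record schemas, and the detail.output value must be the JSON-stringified Cumulus message (abbreviated here):

{
  "source": "aws.states",
  "detail-type": "Step Functions Execution Status Change",
  "detail": {
    "status": "SUCCEEDED",
    "output": "{\"cumulus_meta\":{\"execution_name\":\"my-execution\"},\"meta\":{},\"payload\":{}}"
  }
}

Assuming the message body is saved as status-event.json, it could then be sent with the AWS CLI:

aws sqs send-message \
  --queue-url "<stepfunction_event_reporter_queue_url>" \
  --message-body file://status-event.json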

    If you are not seeing records persist to the database or show up in the Cumulus dashboard, you can investigate the Cloudwatch logs of the SQS consumer Lambda:

    • /aws/lambda/<prefix>-sfEventSqsToDbRecords

    In a workflow

    As described above, ingest notifications will automatically be published to the SNS topics on workflow start and completion/failure, so you should not include a workflow step to publish the initial or final status of your workflows.

    However, if you want to report your ingest status at any point during a workflow execution, you can add a workflow step using the SfSqsReport Lambda. In the following example from cumulus-tf/parse_pdr_workflow.tf, the ParsePdr workflow is configured to use the SfSqsReport Lambda, primarily to update the PDR ingestion status.

    Note: ${sf_sqs_report_task_arn} is an interpolated value referring to a Terraform resource. See the example deployment code for the ParsePdr workflow.

      "PdrStatusReport": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "cumulus_message": {
    "input": "{$}"
    }
    }
    }
    },
    "ResultPath": null,
    "Type": "Task",
    "Resource": "${sf_sqs_report_task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "Next": "WaitForSomeTime"
    },

    Subscribing additional listeners to SNS topics

    Additional listeners to SNS topics can be configured in a .tf file for your Cumulus deployment. Shown below is configuration that subscribes an additional Lambda function (test_lambda) to receive messages from the report_executions SNS topic. To subscribe to the report_granules or report_pdrs SNS topics instead, simply replace report_executions in the code block below with either of those values.

    resource "aws_lambda_function" "test_lambda" {
    function_name = "${var.prefix}-testLambda"
    filename = "./testLambda.zip"
    source_code_hash = filebase64sha256("./testLambda.zip")
    handler = "index.handler"
    role = module.cumulus.lambda_processing_role_arn
    runtime = "nodejs10.x"
    }

    resource "aws_sns_topic_subscription" "test_lambda" {
    topic_arn = module.cumulus.report_executions_sns_topic_arn
    protocol = "lambda"
    endpoint = aws_lambda_function.test_lambda.arn
    }

    resource "aws_lambda_permission" "test_lambda" {
    action = "lambda:InvokeFunction"
    function_name = aws_lambda_function.test_lambda.arn
    principal = "sns.amazonaws.com"
    source_arn = module.cumulus.report_executions_sns_topic_arn
    }

    SNS message format

Subscribers to the SNS topics can expect to find the published message in the SNS event at Records[0].Sns.Message. The message will be a JSON stringified version of the ingest notification record for an execution or a PDR. For granules, the message will be a JSON stringified object with the ingest notification record in the record property and the event type in the event property.
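
For illustration only, a parsed granule message might look roughly like the sketch below; the event value and the subset of record fields shown are assumptions, and the actual record must conform to the granule data model schema:

{
  "event": "Update",
  "record": {
    "granuleId": "MOD09GQ.A2017025.h21v00.006.2017034065104",
    "status": "completed",
    "collectionId": "MOD09GQ___006"
  }
}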

    The ingest notification record of the execution, granule, or PDR should conform to the data model schema for the given record type.

    Summary

    Workflows can be configured to send SQS messages at any point using the sf-sqs-report task.

    Additional listeners can be easily configured to trigger when messages are sent to the SNS topics.

    - + \ No newline at end of file diff --git a/docs/v13.0.0/data-cookbooks/queue-post-to-cmr/index.html b/docs/v13.0.0/data-cookbooks/queue-post-to-cmr/index.html index 7382bcc473b..67811d0f02f 100644 --- a/docs/v13.0.0/data-cookbooks/queue-post-to-cmr/index.html +++ b/docs/v13.0.0/data-cookbooks/queue-post-to-cmr/index.html @@ -5,13 +5,13 @@ Queue PostToCmr | Cumulus Documentation - +
    Version: v13.0.0

    Queue PostToCmr

In this document, we walk through handling CMR errors in workflows by queueing PostToCmr. We assume that the user already has an ingest workflow set up.

    Overview

    The general concept is that the last task of the ingest workflow will be QueueWorkflow, which queues the publish workflow. The publish workflow contains the PostToCmr task and if a CMR error occurs during PostToCmr, the publish workflow will add itself back onto the queue so that it can be executed when CMR is back online. This is achieved by leveraging the QueueWorkflow task again in the publish workflow. The following diagram demonstrates this queueing process.

    Diagram of workflow queueing

    Ingest Workflow

The last step should be the QueuePublishWorkflow step. It should be configured with a queueUrl and workflow. In this case, the queueUrl is a throttled queue. Any queueUrl can be specified here, which is useful if you would like to use a lower-priority queue. The workflow is the unprefixed workflow name that you would like to queue (e.g. PublishWorkflow).

      "QueuePublishWorkflowStep": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "workflow": "{$.meta.workflow}",
    "queueUrl": "${start_sf_queue_url}",
    "provider": "{$.meta.provider}",
    "collection": "{$.meta.collection}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${queue_workflow_task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "End": true
    },

    Publish Workflow

    Configure the Catch section of your PostToCmr task to proceed to QueueWorkflow if a CMRInternalError is caught. Any other error will cause the workflow to fail.

      "Catch": [
    {
    "ErrorEquals": [
    "CMRInternalError"
    ],
    "Next": "RequeueWorkflow"
    },
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "Next": "WorkflowFailed",
    "ResultPath": "$.exception"
    }
    ],

    Then, configure the QueueWorkflow task similarly to its configuration in the ingest workflow. This time, pass the current publish workflow to the task config. This allows for the publish workflow to be requeued when there is a CMR error.

    {
    "RequeueWorkflow": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "buckets": "{$.meta.buckets}",
    "distribution_endpoint": "{$.meta.distribution_endpoint}",
    "workflow": "PublishGranuleQueue",
    "queueUrl": "${start_sf_queue_url}",
    "provider": "{$.meta.provider}",
    "collection": "{$.meta.collection}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${queue_workflow_task_arn}",
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "Next": "WorkflowFailed",
    "ResultPath": "$.exception"
    }
    ],
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "End": true
    }
    }
    - + \ No newline at end of file diff --git a/docs/v13.0.0/data-cookbooks/run-tasks-in-lambda-or-docker/index.html b/docs/v13.0.0/data-cookbooks/run-tasks-in-lambda-or-docker/index.html index c8142b6ff5d..1dcbbb0aa73 100644 --- a/docs/v13.0.0/data-cookbooks/run-tasks-in-lambda-or-docker/index.html +++ b/docs/v13.0.0/data-cookbooks/run-tasks-in-lambda-or-docker/index.html @@ -5,13 +5,13 @@ Run Step Function Tasks in AWS Lambda or Docker | Cumulus Documentation - +
    Version: v13.0.0

    Run Step Function Tasks in AWS Lambda or Docker

    Overview

    AWS Step Function Tasks can run tasks on AWS Lambda or on AWS Elastic Container Service (ECS) as a Docker container.

Lambda provides a serverless architecture and is the best option for minimizing cost and server management. ECS offers the full flexibility of AWS EC2 resources, allowing you to execute arbitrary code on any EC2 instance type.

    When to use Lambda

    You should use AWS Lambda whenever all of the following are true:

• The task runs on one of the supported Lambda Runtimes. At the time of this writing, supported runtimes include versions of Python, Java, Ruby, Node.js, Go, and .NET.
    • The Lambda package is less than 50 MB in size when zipped.
    • The task consumes less than each of the following resources:
      • 3008 MB memory allocation
      • 512 MB disk storage (must be written to /tmp)
      • 15 minutes of execution time

    See this page for a complete and up-to-date list of AWS Lambda limits.

    If your task requires more than any of these resources or an unsupported runtime, creating a Docker image which can be run on ECS is the way to go. Cumulus supports running any lambda package (and its configured layers) as a Docker container with cumulus-ecs-task.

    Step Function Activities and cumulus-ecs-task

    Step Function Activities enable a state machine task to "publish" an activity task which can be picked up by any activity worker. Activity workers can run pretty much anywhere, but Cumulus workflows support the cumulus-ecs-task activity worker. The cumulus-ecs-task worker runs as a Docker container on the Cumulus ECS cluster.

    The cumulus-ecs-task container takes an AWS Lambda Amazon Resource Name (ARN) as an argument (see --lambdaArn in the example below). This ARN argument is defined at deployment time. The cumulus-ecs-task worker polls for new Step Function Activity Tasks. When a Step Function executes, the worker (container) picks up the activity task and runs the code contained in the lambda package defined on deployment.

    Example: Replacing AWS Lambda with a Docker container run on ECS

    This example will use an already-defined workflow from the cumulus module that includes the QueueGranules task in its configuration.

    The following example is an excerpt from the Discover Granules workflow containing the step definition for the QueueGranules step:

    Note: ${ingest_granule_workflow_name} and ${queue_granules_task_arn} are interpolated values that refer to Terraform resources. See the example deployment code for the Discover Granules workflow.

      "QueueGranules": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "provider": "{$.meta.provider}",
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "granuleIngestWorkflow": "${ingest_granule_workflow_name}",
    "queueUrl": "{$.meta.queues.startSF}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${queue_granules_task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "End": true
    },

Suppose you discover that this task can no longer run in AWS Lambda. You can instead run it on the Cumulus ECS cluster by adding the following resources to your Terraform deployment (either in a new .tf file or in an existing one):

• An aws_sfn_activity resource:

resource "aws_sfn_activity" "queue_granules" {
  name = "${var.prefix}-QueueGranules"
}

• An instance of the cumulus_ecs_service module (found on the Cumulus releases page), configured to provide the QueueGranules task:

    module "queue_granules_service" {
    source = "https://github.com/nasa/cumulus/releases/download/{version}/terraform-aws-cumulus-ecs-service.zip"

    prefix = var.prefix
    name = "QueueGranules"

    cluster_arn = module.cumulus.ecs_cluster_arn
    desired_count = 1
    image = "cumuluss/cumulus-ecs-task:1.7.0"

    cpu = 400
    memory_reservation = 700

    environment = {
    AWS_DEFAULT_REGION = data.aws_region.current.name
    }
    command = [
    "cumulus-ecs-task",
    "--activityArn",
    aws_sfn_activity.queue_granules.id,
    "--lambdaArn",
    module.cumulus.queue_granules_task.task_arn,
    "--lastModified",
    module.cumulus.queue_granules_task.last_modified_date
    ]
    alarms = {
    MemoryUtilizationHigh = {
    comparison_operator = "GreaterThanThreshold"
    evaluation_periods = 1
    metric_name = "MemoryUtilization"
    statistic = "SampleCount"
    threshold = 75
    }
    }
    }

    Please note: If you have updated the code for the Lambda specified by --lambdaArn, you will have to manually restart the tasks in your ECS service before invocation of the Step Function activity will use the updated Lambda code.

• An updated Discover Granules workflow to utilize the new resource (the Resource key in the QueueGranules step has been updated to the following):

"Resource": "${aws_sfn_activity.queue_granules.id}"

    If you then run this workflow in place of the DiscoverGranules workflow, the QueueGranules step would run as an ECS task instead of a lambda.

    Final note

    Step Function Activities and AWS Lambda are not the only ways to run tasks in an AWS Step Function. Learn more about other service integrations, including direct ECS integration via the AWS Service Integrations page.

    - + \ No newline at end of file diff --git a/docs/v13.0.0/data-cookbooks/sips-workflow/index.html b/docs/v13.0.0/data-cookbooks/sips-workflow/index.html index d31b67a467f..e4101b00a8e 100644 --- a/docs/v13.0.0/data-cookbooks/sips-workflow/index.html +++ b/docs/v13.0.0/data-cookbooks/sips-workflow/index.html @@ -5,7 +5,7 @@ Science Investigator-led Processing Systems (SIPS) | Cumulus Documentation - + @@ -16,7 +16,7 @@ we're just going to create a onetime throw-away rule that will be easy to test with. This rule will kick off the DiscoverAndQueuePdrs workflow, which is the beginning of a Cumulus SIPS workflow:

    Screenshot of a Cumulus rule configuration

    Note: A list of configured workflows exists under the "Workflows" in the navigation bar on the Cumulus dashboard. Additionally, one can find a list of executions and their respective status in the "Executions" tab in the navigation bar.

    DiscoverAndQueuePdrs Workflow

    This workflow will discover PDRs and queue them to be processed. Duplicate PDRs will be dealt with according to the configured duplicate handling setting in the collection. The lambdas below are included in the cumulus terraform module for use in your workflows:

    1. DiscoverPdrs - source
    2. QueuePdrs - source

    Screenshot of execution graph for discover and queue PDRs workflow in the AWS Step Functions console

    An example workflow module configuration can be viewed in the Cumulus source for the discover_and_queue_pdrs_workflow.

Please note: To use this example workflow module as a template for a new workflow in your deployment, the source key for the workflow module would need to point to a release of the cumulus-workflow (terraform-aws-cumulus-workflow.zip) module on our release page, as all of the provided Cumulus workflows are internally self-referential.

    ParsePdr Workflow

    The ParsePdr workflow will parse a PDR, queue the specified granules (duplicates are handled according to the duplicate handling setting) and periodically check the status of those queued granules. This workflow will not succeed until all the granules included in the PDR are successfully ingested. If one of those fails, the ParsePdr workflow will fail. NOTE that ParsePdr may spin up multiple IngestGranule workflows in parallel, depending on the granules included in the PDR.

    The lambdas below are included in the cumulus terraform module for use in your workflows:

    1. ParsePdr - source
    2. QueueGranules - source
    3. CheckStatus - source

    Screenshot of execution graph for SIPS Parse PDR workflow in AWS Step Functions console

    An example workflow module configuration can be viewed in the Cumulus source for the parse_pdr_workflow.

Please note: To use this example workflow module as a template for a new workflow in your deployment, the source key for the workflow module would need to point to a release of the cumulus-workflow (terraform-aws-cumulus-workflow.zip) module on our release page, as all of the provided Cumulus workflows are internally self-referential.

    IngestGranule Workflow

    The IngestGranule workflow processes and ingests a granule and posts the granule metadata to CMR.

    The lambdas below are included in the cumulus terraform module for use in your workflows:

    1. SyncGranule - source.
    2. CmrStep - source

    Additionally this workflow requires a processing step you must provide. The ProcessingStep step in the workflow picture below is an example of a custom processing step.

    Note: Using the CmrStep is not required and can be left out of the processing trajectory if desired (for example, in testing situations).

    Screenshot of execution graph for SIPS IngestGranule workflow in AWS Step Functions console

    An example workflow module configuration can be viewed in the Cumulus source for the ingest_and_publish_granule_workflow.

Please note: To use this example workflow module as a template for a new workflow in your deployment, the source key for the workflow module would need to point to a release of the cumulus-workflow (terraform-aws-cumulus-workflow.zip) module on our release page, as all of the provided Cumulus workflows are internally self-referential.

    Summary

    In this cookbook we went over setting up a collection, rule, and provider for a SIPS workflow. Once we had the setup completed, we looked over the Cumulus workflows that participate in parsing PDRs, ingesting and processing granules, and updating CMR.

    - + \ No newline at end of file diff --git a/docs/v13.0.0/data-cookbooks/throttling-queued-executions/index.html b/docs/v13.0.0/data-cookbooks/throttling-queued-executions/index.html index 0df34b1afc6..02d740f5ddc 100644 --- a/docs/v13.0.0/data-cookbooks/throttling-queued-executions/index.html +++ b/docs/v13.0.0/data-cookbooks/throttling-queued-executions/index.html @@ -5,13 +5,13 @@ Throttling queued executions | Cumulus Documentation - +
    Version: v13.0.0

    Throttling queued executions

In this entry, we will walk through how to create an SQS queue for scheduling executions, which will be used to limit those executions to a maximum concurrency, and how to configure our Cumulus workflows/rules to use this queue.

    We will also review the architecture of this feature and highlight some implementation notes.

    Limiting the number of executions that can be running from a given queue is useful for controlling the cloud resource usage of workflows that may be lower priority, such as granule reingestion or reprocessing campaigns. It could also be useful for preventing workflows from exceeding known resource limits, such as a maximum number of open connections to a data provider.

    Implementing the queue

    Create and deploy the queue

    Add a new queue

    In a .tf file for your Cumulus deployment, add a new SQS queue:

    resource "aws_sqs_queue" "background_job_queue" {
    name = "${var.prefix}-backgroundJobQueue"
    receive_wait_time_seconds = 20
    visibility_timeout_seconds = 60
    }

    Set maximum executions for the queue

    Define the throttled_queues variable for the cumulus module in your Cumulus deployment to specify the maximum concurrent executions for the queue.

    module "cumulus" {
    # ... other variables

    throttled_queues = [{
    url = aws_sqs_queue.background_job_queue.id,
    execution_limit = 5
    }]
    }

    Setup consumer for the queue

    Add the sqs2sfThrottle Lambda as the consumer for the queue and add a Cloudwatch event rule/target to read from the queue on a scheduled basis.

    Please note: You must use the sqs2sfThrottle Lambda as the consumer for any queue with a queue execution limit or else the execution throttling will not work correctly. Additionally, please allow at least 60 seconds after creation before using the queue while associated infrastructure and triggers are set up and made ready.

    aws_sqs_queue.background_job_queue.id refers to the queue resource defined above.

    resource "aws_cloudwatch_event_rule" "background_job_queue_watcher" {
    schedule_expression = "rate(1 minute)"
    }

    resource "aws_cloudwatch_event_target" "background_job_queue_watcher" {
    rule = aws_cloudwatch_event_rule.background_job_queue_watcher.name
    arn = module.cumulus.sqs2sfThrottle_lambda_function_arn
    input = jsonencode({
    messageLimit = 500
    queueUrl = aws_sqs_queue.background_job_queue.id
    timeLimit = 60
    })
    }

    resource "aws_lambda_permission" "background_job_queue_watcher" {
    action = "lambda:InvokeFunction"
    function_name = module.cumulus.sqs2sfThrottle_lambda_function_arn
    principal = "events.amazonaws.com"
    source_arn = aws_cloudwatch_event_rule.background_job_queue_watcher.arn
    }

    Re-deploy your Cumulus application

Follow the instructions to re-deploy your Cumulus application. After you have re-deployed, your workflow template will be updated to include information about the queue (the output below is a partial excerpt from an expected workflow template):

{
  "cumulus_meta": {
    "queueExecutionLimits": {
      "<backgroundJobQueue_SQS_URL>": 5
    }
  }
}

    Integrate your queue with workflows and/or rules

    Integrate queue with queuing steps in workflows

    For any workflows using QueueGranules or QueuePdrs that you want to use your new queue, update the Cumulus configuration of those steps in your workflows.

    As seen in this partial configuration for a QueueGranules step, update the queueUrl to reference the new throttled queue:

    Note: ${ingest_granule_workflow_name} is an interpolated value referring to a Terraform resource. See the example deployment code for the DiscoverGranules workflow.

    {
    "QueueGranules": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "queueUrl": "${aws_sqs_queue.background_job_queue.id}",
    "provider": "{$.meta.provider}",
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "granuleIngestWorkflow": "${ingest_granule_workflow_name}"
    }
    }
    }
    }
    }

    Similarly, for a QueuePdrs step:

    Note: ${parse_pdr_workflow_name} is an interpolated value referring to a Terraform resource. See the example deployment code for the DiscoverPdrs workflow.

    {
    "QueuePdrs": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "queueUrl": "${aws_sqs_queue.background_job_queue.id}",
    "provider": "{$.meta.provider}",
    "collection": "{$.meta.collection}",
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "parsePdrWorkflow": "${parse_pdr_workflow_name}"
    }
    }
    }
    }
    }

    After making these changes, re-deploy your Cumulus application for the execution throttling to take effect on workflow executions queued by these workflows.

    Create/update a rule to use your new queue

    Create or update a rule definition to include a queueUrl property that refers to your new queue:

{
  "name": "s3_provider_rule",
  "workflow": "DiscoverAndQueuePdrs",
  "provider": "s3_provider",
  "collection": {
    "name": "MOD09GQ",
    "version": "006"
  },
  "rule": {
    "type": "onetime"
  },
  "state": "ENABLED",
  "queueUrl": "<backgroundJobQueue_SQS_URL>" // configure rule to use your queue URL
}

    After creating/updating the rule, any subsequent invocations of the rule should respect the maximum number of executions when starting workflows from the queue.
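
If you manage rules through the Cumulus archive API rather than the dashboard, a create request might look like the hedged sketch below, assuming your archive API is deployed at <archive_api_uri>, you have a valid access token, and the rule JSON above is saved as rule-with-queue-url.json:

curl -X POST "<archive_api_uri>rules" \
  -H "Authorization: Bearer <token>" \
  -H "Content-Type: application/json" \
  -d @rule-with-queue-url.json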

    Architecture

    Architecture diagram showing how executions started from a queue are throttled to a maximum concurrent limit

Execution throttling based on the queue works by manually keeping a count (semaphore) of how many executions are running for the queue at a time. The key operation that prevents the number of executions from exceeding the maximum for the queue is that, before starting new executions, the sqs2sfThrottle Lambda attempts to increment the semaphore and responds as follows (a conceptual sketch of this conditional increment is shown after the list below):

    • If the increment operation is successful, then the count was not at the maximum and an execution is started
    • If the increment operation fails, then the count was already at the maximum so no execution is started
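
The sketch below is purely conceptual and uses hypothetical table and attribute names; it is not the actual Cumulus implementation, but it illustrates the kind of conditional increment that keeps the semaphore safe: the update only succeeds while the count is below the configured limit.

# Hypothetical conditional increment of a per-queue semaphore (illustration only)
aws dynamodb update-item \
  --table-name "<prefix>-semaphores" \
  --key '{"key": {"S": "<backgroundJobQueue_SQS_URL>"}}' \
  --update-expression "ADD semvalue :one" \
  --condition-expression "attribute_not_exists(semvalue) OR semvalue < :max" \
  --expression-attribute-values '{":one": {"N": "1"}, ":max": {"N": "5"}}'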

    Final notes

    Limiting the number of concurrent executions for work scheduled via a queue has several consequences worth noting:

    • The number of executions that are running for a given queue will be limited to the maximum for that queue regardless of which workflow(s) are started.
    • If you use the same queue to schedule executions across multiple workflows/rules, then the limit on the total number of executions running concurrently will be applied to all of the executions scheduled across all of those workflows/rules.
    • If you are scheduling the same workflow both via a queue with a maxExecutions value and a queue without a maxExecutions value, only the executions scheduled via the queue with the maxExecutions value will be limited to the maximum.
    - + \ No newline at end of file diff --git a/docs/v13.0.0/data-cookbooks/tracking-files/index.html b/docs/v13.0.0/data-cookbooks/tracking-files/index.html index e7eb016489b..4c88a78d533 100644 --- a/docs/v13.0.0/data-cookbooks/tracking-files/index.html +++ b/docs/v13.0.0/data-cookbooks/tracking-files/index.html @@ -5,7 +5,7 @@ Tracking Ancillary Files | Cumulus Documentation - + @@ -19,7 +19,7 @@ The UMM-G column reflects the RelatedURL's Type derived from the CNM type, whereas the ECHO10 column shows how the CNM type affects the destination element.

CNM Type   | UMM-G RelatedUrl.Type                                            | ECHO10 Location
ancillary  | 'VIEW RELATED INFORMATION'                                       | OnlineResource
data       | 'GET DATA' (HTTPS URL) or 'GET DATA VIA DIRECT ACCESS' (S3 URI)  | OnlineAccessURL
browse     | 'GET RELATED VISUALIZATION'                                      | AssociatedBrowseImage
linkage    | 'EXTENDED METADATA'                                              | OnlineResource
metadata   | 'EXTENDED METADATA'                                              | OnlineResource
qa         | 'EXTENDED METADATA'                                              | OnlineResource

    Common Use Cases

    This section briefly documents some common use cases and the recommended configuration for the file. The examples shown here are for the DiscoverGranules use case, which allows configuration at the Cumulus dashboard level. The other two cases covered in the ancillary metadata documentation require configuration at the provider notification level (either CNM message or PDR) and are not covered here.

    Configuring browse imagery:

    {
    "bucket": "public",
    "regex": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\_[\\d]{1}.jpg$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_1.jpg",
    "type": "browse"
    }

    Configuring a documentation entry:

    {
    "bucket": "protected",
    "regex": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\_README.pdf$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_README.pdf",
    "type": "metadata"
    }

    Configuring other associated files (use types metadata or qa as appropriate):

    {
    "bucket": "protected",
    "regex": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\_QA.txt$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_QA.txt",
    "type": "qa"
    }
    - + \ No newline at end of file diff --git a/docs/v13.0.0/deployment/api-gateway-logging/index.html b/docs/v13.0.0/deployment/api-gateway-logging/index.html index 286b0106fb6..5bd37929fa0 100644 --- a/docs/v13.0.0/deployment/api-gateway-logging/index.html +++ b/docs/v13.0.0/deployment/api-gateway-logging/index.html @@ -5,13 +5,13 @@ API Gateway Logging | Cumulus Documentation - +
    Version: v13.0.0

    API Gateway Logging

    Enabling API Gateway logging

In order to enable distribution API access and execution logging, configure the TEA deployment by setting log_api_gateway_to_cloudwatch on the thin_egress_app module:

    log_api_gateway_to_cloudwatch = true

    This enables the distribution API to send its logs to the default CloudWatch location: API-Gateway-Execution-Logs_<RESTAPI_ID>/<STAGE>
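
For context, a minimal sketch of where this setting lives, assuming your deployment follows the Cumulus deployment template's thin_egress_app module block:

module "thin_egress_app" {
  # ... other TEA settings from the deployment template ...

  log_api_gateway_to_cloudwatch = true
}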

    Configure Permissions for API Gateway Logging to CloudWatch

    Instructions for enabling account level logging from API Gateway to CloudWatch

    This is a one time operation that must be performed on each AWS account to allow API Gateway to push logs to CloudWatch.

    Create a policy document

    The AmazonAPIGatewayPushToCloudWatchLogs managed policy, with an ARN of arn:aws:iam::aws:policy/service-role/AmazonAPIGatewayPushToCloudWatchLogs, has all the required permissions to enable API Gateway logging to CloudWatch. To grant these permissions to your account, first create an IAM role with apigateway.amazonaws.com as its trusted entity.

    Save this snippet as apigateway-policy.json.

{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Sid": "",
      "Effect": "Allow",
      "Principal": {
        "Service": "apigateway.amazonaws.com"
      },
      "Action": "sts:AssumeRole"
    }
  ]
}

    Create an account role to act as ApiGateway and write to CloudWatchLogs

    NASA users in NGAP: be sure to use your account's permission boundary.

    aws iam create-role \
    --role-name ApiGatewayToCloudWatchLogs \
    [--permissions-boundary <permissionBoundaryArn>] \
    --assume-role-policy-document file://apigateway-policy.json

    Note the ARN of the returned role for the last step.

    Attach correct permissions to role

    Next attach the AmazonAPIGatewayPushToCloudWatchLogs policy to the IAM role.

    aws iam attach-role-policy \
    --role-name ApiGatewayToCloudWatchLogs \
    --policy-arn "arn:aws:iam::aws:policy/service-role/AmazonAPIGatewayPushToCloudWatchLogs"

    Update Account API Gateway settings with correct permissions

    Finally, set the IAM role ARN on the cloudWatchRoleArn property on your API Gateway Account settings.

    aws apigateway update-account \
    --patch-operations op='replace',path='/cloudwatchRoleArn',value='<ApiGatewayToCloudWatchLogs ARN>'
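
If you want to confirm the setting took effect, the account configuration (including cloudwatchRoleArn) can be inspected with:

aws apigateway get-account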

    Configure API Gateway CloudWatch Logs Delivery

    See Configure Cloudwatch Logs Delivery

    - + \ No newline at end of file diff --git a/docs/v13.0.0/deployment/choosing_configuring_rds/index.html b/docs/v13.0.0/deployment/choosing_configuring_rds/index.html index 5503479b286..32510a3c544 100644 --- a/docs/v13.0.0/deployment/choosing_configuring_rds/index.html +++ b/docs/v13.0.0/deployment/choosing_configuring_rds/index.html @@ -5,7 +5,7 @@ Choosing and configuration your RDS database | Cumulus Documentation - + @@ -37,7 +37,7 @@ using this module to create your RDS cluster, you can configure the autoscaling timeout action, the cluster minimum and maximum capacity, and more as seen in the supported variables for the module.

    Unfortunately, Terraform currently doesn't allow specifying the autoscaling timeout itself, so that value will have to be manually configured in the AWS console or CLI.

    - + \ No newline at end of file diff --git a/docs/v13.0.0/deployment/cloudwatch-logs-delivery/index.html b/docs/v13.0.0/deployment/cloudwatch-logs-delivery/index.html index e7d06c54f42..4f9c20f26dc 100644 --- a/docs/v13.0.0/deployment/cloudwatch-logs-delivery/index.html +++ b/docs/v13.0.0/deployment/cloudwatch-logs-delivery/index.html @@ -5,13 +5,13 @@ Configure Cloudwatch Logs Delivery | Cumulus Documentation - +
    Version: v13.0.0

    Configure Cloudwatch Logs Delivery

    As an optional configuration step, it is possible to deliver CloudWatch logs to a cross-account shared AWS::Logs::Destination. An operator does this by configuring the cumulus module for your deployment as shown below. The value of the log_destination_arn variable is the ARN of a writeable log destination.

    The value can be either an AWS::Logs::Destination or a Kinesis Stream ARN to which your account can write.

    log_destination_arn           = arn:aws:[kinesis|logs]:us-east-1:123456789012:[streamName|destination:logDestinationName]

    Logs Sent

By default, the following logs will be sent to the destination when one is given.

    • Ingest logs
    • Async Operation logs
    • Thin Egress App API Gateway logs (if configured)

    Additional Logs

    If additional logs are needed, you can configure additional_log_groups_to_elk with the Cloudwatch log groups you want to send to the destination. additional_log_groups_to_elk is a map with the key as a descriptor and the value with the Cloudwatch log group name.

additional_log_groups_to_elk = {
  "HelloWorldTask" = "/aws/lambda/cumulus-example-HelloWorld"
  "MyCustomTask"   = "my-custom-task-log-group"
}
    - + \ No newline at end of file diff --git a/docs/v13.0.0/deployment/components/index.html b/docs/v13.0.0/deployment/components/index.html index 07e2561d56a..651849f0a36 100644 --- a/docs/v13.0.0/deployment/components/index.html +++ b/docs/v13.0.0/deployment/components/index.html @@ -5,7 +5,7 @@ Component-based Cumulus Deployment | Cumulus Documentation - + @@ -39,7 +39,7 @@ Terraform at the same time.

    With remote state, Terraform writes the state data to a remote data store, which can then be shared between all members of a team.

    The recommended approach for handling remote state with Cumulus is to use the S3 backend. This backend stores state in S3 and uses a DynamoDB table for locking.

    See the deployment documentation for a walk-through of creating resources for your remote state using an S3 backend.
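
As a minimal sketch, an S3 backend configuration in your terraform block might look like the following; the bucket, key, and DynamoDB table names are hypothetical placeholders for the remote state resources you create in the deployment documentation:

terraform {
  backend "s3" {
    region         = "us-east-1"
    bucket         = "<your-terraform-state-bucket>"   # hypothetical bucket name
    key            = "cumulus/terraform.tfstate"       # hypothetical state key
    dynamodb_table = "<your-terraform-locks-table>"    # hypothetical lock table
  }
}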

    - + \ No newline at end of file diff --git a/docs/v13.0.0/deployment/create_bucket/index.html b/docs/v13.0.0/deployment/create_bucket/index.html index 6d9a0f589e2..0136772bb49 100644 --- a/docs/v13.0.0/deployment/create_bucket/index.html +++ b/docs/v13.0.0/deployment/create_bucket/index.html @@ -5,13 +5,13 @@ Creating an S3 Bucket | Cumulus Documentation - +
    Version: v13.0.0

    Creating an S3 Bucket

    Buckets can be created on the command line with AWS CLI or via the web interface on the AWS console.

    When creating a protected bucket (a bucket containing data which will be served through the distribution API), make sure to enable S3 server access logging. See S3 Server Access Logging for more details.
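
One way to enable server access logging from the command line is shown below; the protected bucket name and the logging target bucket/prefix are placeholders and assume the target bucket already exists and permits S3 log delivery:

aws s3api put-bucket-logging \
  --bucket foobar-protected \
  --bucket-logging-status '{
    "LoggingEnabled": {
      "TargetBucket": "foobar-internal",
      "TargetPrefix": "s3-access-logs/foobar-protected/"
    }
  }'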

    Command line

    Using the AWS command line tool create-bucket s3api subcommand:

    $ aws s3api create-bucket \
    --bucket foobar-internal \
    --region us-west-2 \
    --create-bucket-configuration LocationConstraint=us-west-2
    {
    "Location": "/foobar-internal"
    }

    Note: The region and create-bucket-configuration arguments are only necessary if you are creating a bucket outside of the us-east-1 region.

    Please note security settings and other bucket options can be set via the options listed in the s3api documentation.

    Repeat the above step for each bucket to be created.

    Web interface

    See: AWS "Creating a Bucket" documentation

    - + \ No newline at end of file diff --git a/docs/v13.0.0/deployment/cumulus_distribution/index.html b/docs/v13.0.0/deployment/cumulus_distribution/index.html index 9ce1b8cc27f..36130f86aca 100644 --- a/docs/v13.0.0/deployment/cumulus_distribution/index.html +++ b/docs/v13.0.0/deployment/cumulus_distribution/index.html @@ -5,14 +5,14 @@ Using the Cumulus Distribution API | Cumulus Documentation - +
    Version: v13.0.0

    Using the Cumulus Distribution API

    The Cumulus Distribution API is a set of endpoints that can be used to enable AWS Cognito authentication when downloading data from S3.

    Configuring a Cumulus Distribution deployment

    The Cumulus Distribution API is included in the main Cumulus repo. It is available as part of the terraform-aws-cumulus.zip archive in the latest release.

    These steps assume you're using the Cumulus Deployment Template but can also be used for custom deployments.

    To configure a deployment to use Cumulus Distribution:

    1. Remove or comment the "Thin Egress App Settings" in the Cumulus Template Deploy and enable the Cumulus Distribution settings.
    2. Delete or comment the contents of thin_egress_app.tf and the corresponding Thin Egress App outputs in outputs.tf. These are not necessary for a Cumulus Distribution deployment.
    3. Uncomment the Cumulus Distribution outputs in outputs.tf.
    4. Rename cumulus-template-deploy/cumulus-tf/cumulus_distribution.tf.example to cumulus-template-deploy/cumulus-tf/cumulus_distribution.tf.

    Cognito Application and User Credentials

    The major prerequisite for using the Cumulus Distribution API is to set up Cognito. If operating within NGAP, this should already be done for you. If operating outside of NGAP, you must set up Cognito yourself, which is beyond the scope of this documentation.

    Given that Cognito is set up, in order to be able to download granule files via the Cumulus Distribution API, you must obtain Cognito user credentials, because any attempt to download such files (that will be, or have been, published to the CMR via your Cumulus deployment) will result in a prompt for you to supply Cognito user credentials. To obtain your own user credentials, talk to your product owner or scrum master for additional information. They should either know how to create the credentials, know who can create them for the team, or be the liaison to the Cognito team.

    Further, whoever helps to obtain your Cognito user credentials should also be able to supply you with the values for the following new variables that you must add to your cumulus-tf/terraform.tfvars file:

    • csdap_host_url: The URL of the Cognito service to which your Cumulus deployment will make Cognito API calls during a distribution (download) event
    • csdap_client_id: The client ID for the Cumulus application registered within the Cognito service
    • csdap_client_password: The client password for the Cumulus application registered within the Cognito service

    Although you might have to wait a bit for your Cognito user credentials, the remaining instructions do not depend upon having them, so you may continue with these instructions while waiting for your credentials.

    Cumulus Distribution URL

    Your Cumulus Distribution URL is used by Cumulus to generate download URLs as part of the granule metadata generated and published to the CMR. For example, a granule download URL will be of the form <distribution url>/<protected bucket>/<key> (or <distribution url>/path/to/file, if using a custom bucket map, as explained further below).

    By default, the value of your distribution URL is the URL of your private Cumulus Distribution API Gateway (the API Gateway named <prefix>-distribution, once you deploy the Cumulus Distribution module). Therefore, by default, the generated download URLs are private, and thus inaccessible directly, but there are 2 ways to address this issue (both of which are detailed below): (a) use tunneling (typically in development) or (b) put a CloudFront URL in front of your API Gateway (typically in production, and perhaps UAT and/or SIT).

    In either case, you must first know the default URL (i.e., the URL for the private Cumulus Distribution API Gateway). In order to obtain this default URL, you must first deploy your cumulus-tf module with the new Cumulus Distribution module, and once your initial deployment is complete, one of the Terraform outputs will be cumulus_distribution_api_uri, which is the URL for the private API Gateway.
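
Assuming you run Terraform from your cumulus-tf directory, the output value can be read back at any time with:

terraform output cumulus_distribution_api_uri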

    You may override this default URL by adding a cumulus_distribution_url variable to your cumulus-tf/terraform.tfvars file, and setting it to one of the following values (both of which are explained below):

    1. The default URL, but with a port added to it, in order to allow you to configure tunneling (typically only in development)
    2. A CloudFront URL placed in front of your Cumulus Distribution API Gateway (typically only for Production, but perhaps also for a UAT or SIT environment)

    The following subsections explain these approaches, in turn.

    Using your Cumulus Distribution API Gateway URL as your distribution URL

    Since your Cumulus Distribution API Gateway URL is private, the only way you can use it to confirm that your integration with Cognito is working is by using tunneling (again, generally for development), as described here. Here is an outline of the required steps, with details provided further below:

    1. Create/import a key pair into your AWS EC2 service (if you haven't already done so)
    2. Add a reference to the name of the key pair to your Terraform variables (we'll set the key_name Terraform variable)
    3. Choose an open local port on your machine (we'll use 9000 in the following details)
    4. Add a reference to the value of your cumulus_distribution_api_uri (mentioned earlier), including your chosen port (we'll set the cumulus_distribution_url Terraform variable)
    5. Redeploy Cumulus
    6. Add an entry to your /etc/hosts file
    7. Add a redirect URI to Cognito, via the Cognito API
    8. Install the Session Manager Plugin for the AWS CLI (if you haven't already done so; assuming you have already installed the AWS CLI)
    9. Add a sample file to S3 to test downloading via Cognito

    To create or import an existing key pair, you can use the AWS CLI (see aws ec2 import-key-pair), or the AWS Console (see Amazon EC2 key pairs and Linux instances).

    Once your key pair is added to AWS, add the following to your cumulus-tf/terraform.tfvars file:

    key_name = "<name>"
    cumulus_distribution_url = "https://<id>.execute-api.<region>.amazonaws.com:<port>/dev/"

    where:

    • <name> is the name of the key pair you just added to AWS
    • <id> and <region> are the corresponding parts from your cumulus_distribution_api_uri output variable
    • <port> is your open local port of choice (9000 is typically a good choice)

    Once you save your variable changes, redeploy your cumulus-tf module.

    While your deployment runs, add the following entry to your /etc/hosts file, replacing <hostname> with the host name of the cumulus_distribution_url Terraform variable you just added above:

    localhost <hostname>

    Next, you'll need to use the Cognito API to add the value of your cumulus_distribution_url Terraform variable as a Cognito redirect URI. To do so, use your favorite tool (e.g., curl, wget, Postman, etc.) to make a BasicAuth request to the Cognito API, using the following details:

    • method: POST
    • base URL: the value of your csdap_host_url Terraform variable
    • path: /authclient/updateRedirectUri
    • username: the value of your csdap_client_id Terraform variable
    • password: the value of your csdap_client_password Terraform variable
    • headers: Content-Type='application/x-www-form-urlencoded'
    • body: redirect_uri=<cumulus_distribution_url>/login

    where <cumulus_distribution_url> is the value of your cumulus_distribution_url Terraform variable. Note the /login path at the end of the redirect_uri value.
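
For example, this request could be made with curl as follows, substituting the values of your Terraform variables (and adjusting the URL if your csdap_host_url already ends with a slash):

curl -X POST "<csdap_host_url>/authclient/updateRedirectUri" \
  --user "<csdap_client_id>:<csdap_client_password>" \
  -H "Content-Type: application/x-www-form-urlencoded" \
  --data-urlencode "redirect_uri=<cumulus_distribution_url>/login"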

    For reference, see the Cognito Authentication Service API.

    Next, install the Session Manager Plugin for the AWS CLI. If running on macOS, and you use Homebrew, you can install it simply as follows:

    brew install --cask session-manager-plugin --no-quarantine

    As your final setup step, add a sample file to one of the protected buckets listed in your buckets Terraform variable in your cumulus-tf/terraform.tfvars file. The key for the S3 object doesn't matter, nor does it matter what file you use. All that matters is that the file is an S3 object in one of your protected buckets, because Cognito is triggered when attempting to download from one of those buckets.
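
For example, a placeholder file could be uploaded with the AWS CLI, where <protected-bucket> is one of the protected buckets from your buckets variable:

aws s3 cp ./sample.txt s3://<protected-bucket>/sample.txt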

    At this point, you should be ready to open a tunnel and attempt to download your sample file via your browser, summarized as follows:

    1. Determine your ec2 instance ID
    2. Connect to the NASA VPN
    3. Start an AWS SSM session
    4. Open an ssh tunnel
    5. Use a browser to navigate to your file

    To determine your ec2 instance ID for your Cumulus deployment, run the follow command, where <profile> is the name of the appropriate AWS profile to use, and <prefix> is the value of your prefix Terraform variable:

    aws --profile <profile> ec2 describe-instances --filters Name=tag:Deployment,Values=<prefix> Name=instance-state-name,Values=running --query "Reservations[0].Instances[].InstanceId" --output text

    IMPORTANT: Before proceeding with the remaining steps, make sure you're connected to the NASA VPN.

    Use the value output from the command above in place of <id> in the following command, which will start an SSM session:

    aws ssm start-session --target <id> --document-name AWS-StartPortForwardingSession --parameters portNumber=22,localPortNumber=6000

    If successful, you should see output similar to the following:

    Starting session with SessionId: NGAPShApplicationDeveloper-***
    Port 6000 opened for sessionId NGAPShApplicationDeveloper-***.
    Waiting for connections...

    Open another terminal window, and open a tunnel with port forwarding, using your chosen port from above (e.g., 9000):

    ssh -4 -p 6000 -N -L <port>:<api-gateway-host>:443 ec2-user@127.0.0.1

    where:

    • <port> is the open local port you chose earlier (e.g., 9000)
    • <api-gateway-host> is the hostname of your private API Gateway (i.e., the host portion of the URL you used as the value of your cumulus_distribution_url Terraform variable above)

    Finally, use your chosen browser to navigate to <cumulus_distribution_url>/<bucket>/<key>, where <bucket> and <key> reference the sample file you added to S3 above.

    If all goes well, you should be prompted for your Cognito username and password. If you have obtained your Cognito user credentials, enter them, followed by entering a code generated by the authenticator application you registered at the time you completed your Cognito registration process. Once your credentials and auth code are correctly supplied, after a few moments, the download process will begin.

    Once you're finished testing, clean up as follows:

    1. Kill your ssh tunnel (Ctrl-C)
    2. Kill your AWS SSM session (Ctrl-C)
3. If you like, disconnect from the NASA VPN

    While this is a relatively lengthy process, things are much easier when using CloudFront, such as in Production (OPS), SIT, or UAT, as explained next.

    Using a CloudFront URL as your distribution URL

    In Production (OPS), and perhaps in other environments, such as UAT and SIT, you'll need to provide a publicly accessible URL for users to use for downloading (distributing) granule files.

    This is generally done by placing a CloudFront URL in front of your private Cumulus Distribution API Gateway. In order to create such a CloudFront URL, contact the person who helped you obtain your Cognito credentials, and request a CloudFront URL with the following details:

    • The private, backing URL, which is the value of your cumulus_distribution_api_uri Terraform output value
    • A request to add the AWS account's VPC to the whitelist

    Once this request is completed, and you obtain the new CloudFront URL, override your default distribution URL with the CloudFront URL by adding the following to your cumulus-tf/terraform.tfvars file:

    cumulus_distribution_url = <cloudfront_url>

    In addition, add a Cognito redirect URI, as detailed in the previous section. Note that in this case, the value you'll use for redirect_uri is <cloudfront_url>/login since the value of your cumulus_distribution_url is now your CloudFront URL.

    At this point, it is assumed that you have added the appropriate values for this environment for the variables described at the top (csdap_host_url, csdap_client_id, and csdap_client_password).

    Redeploy Cumulus with your new/updated Terraform variables.

    As your final setup step, add a sample file to one of the protected buckets listed in your buckets Terraform variable in your cumulus-tf/terraform.tfvars file. The key for the S3 object doesn't matter, nor does it matter what file you use. All that matters is that the file is an S3 object in one of your protected buckets, because Cognito is triggered when attempting to download from one of those buckets.

    Finally, use your chosen browser to navigate to <cumulus_distribution_url>/<bucket>/<key>, where <bucket> and <key> reference the sample file you added to S3.

    If all goes well, you should be prompted for your Cognito username and password. If you have obtained your Cognito user credentials, enter them, followed by entering a code generated by the authenticator application you registered at the time you completed your Cognito registration process. Once your credentials and auth code are correctly supplied, after a few moments, the download process will begin.

    S3 Bucket Mapping

    An S3 Bucket map allows users to abstract bucket names. If the bucket names change at any point, only the bucket map would need to be updated instead of every S3 link.

    The Cumulus Distribution API uses a bucket_map.yaml or bucket_map.yaml.tmpl file to determine which buckets to serve. See the examples.

    The default Cumulus module generates a file at s3://${system_bucket}/distribution_bucket_map.json.

The configuration file is a simple JSON mapping of the form:

    {
    "daac-public-data-bucket": "/path/to/this/kind/of/data"
    }

    Note: Cumulus only supports a one-to-one mapping of bucket -> Cumulus Distribution path for 'distribution' buckets. Also, the bucket map must include mappings for all of the protected and public buckets specified in the buckets variable in cumulus-tf/terraform.tfvars, otherwise Cumulus may not be able to determine the correct distribution URL for ingested files and you may encounter errors.

    Switching from the Thin Egress App to Cumulus Distribution

    If you have previously deployed the Thin Egress App (TEA) as your distribution app, you can switch to Cumulus Distribution by following the steps above.

    Note, however, that the cumulus_distribution module will generate a bucket map cache and overwrite any existing bucket map caches created by TEA.

    There will also be downtime while your API gateway is updated.

    - + \ No newline at end of file diff --git a/docs/v13.0.0/deployment/index.html b/docs/v13.0.0/deployment/index.html index 14c15559a77..9f38ebf04c6 100644 --- a/docs/v13.0.0/deployment/index.html +++ b/docs/v13.0.0/deployment/index.html @@ -5,7 +5,7 @@ How to Deploy Cumulus | Cumulus Documentation - + @@ -21,7 +21,7 @@ for deployment's EC2 instances and allows you to connect to them via SSH/SSM.

    Consider the sizing of your Cumulus instance when configuring your variables.

    Choose a distribution API

    Cumulus can be configured to use either the Thin Egress App (TEA) or the Cumulus Distribution API. The default selection is the Thin Egress App if you're using the Deployment Template.

    IMPORTANT! If you already have a deployment using the TEA distribution and want to switch to Cumulus Distribution, there will be an API Gateway change. This means that there will be downtime while you update your CloudFront endpoint to use the new API gateway.

    Configure the Thin Egress App

    The Thin Egress App can be used for Cumulus distribution and is the default selection. It allows authentication using Earthdata Login. Follow the steps in the documentation to configure distribution in your cumulus-tf deployment.

    Configure the Cumulus Distribution API (optional)

    If you would prefer to use the Cumulus Distribution API, which supports AWS Cognito authentication, follow these steps to configure distribution in your cumulus-tf deployment.

    Initialize Terraform

    Follow the above instructions to initialize Terraform using terraform init [1].

    Deploy

    Run terraform apply to deploy the resources. Type yes when prompted to confirm that you want to create the resources. Assuming the operation is successful, you should see output like this:

    Apply complete! Resources: 292 added, 0 changed, 0 destroyed.

    Outputs:

    archive_api_redirect_uri = https://abc123.execute-api.us-east-1.amazonaws.com/dev/token
    archive_api_uri = https://abc123.execute-api.us-east-1.amazonaws.com/dev/
    distribution_redirect_uri = https://abc123.execute-api.us-east-1.amazonaws.com/DEV/login
    distribution_url = https://abc123.execute-api.us-east-1.amazonaws.com/DEV/

    Note: Be sure to copy the redirect URLs, as you will use them to update your Earthdata application.

    Update Earthdata Application

    You will need to add two redirect URLs to your EarthData login application.

    1. Login to URS.
    2. Under My Applications -> Application Administration -> use the edit icon of your application.
    3. Under Manage -> redirect URIs, add the Archive API url returned from the stack deployment
      • e.g. archive_api_redirect_uri = https://<czbbkscuy6>.execute-api.us-east-1.amazonaws.com/dev/token.
    4. Also add the Distribution url
      • e.g. distribution_redirect_uri = https://<kido2r7kji>.execute-api.us-east-1.amazonaws.com/dev/login [2].
    5. You may delete the placeholder url you used to create the application.

    If you've lost track of the needed redirect URIs, they can be located on the API Gateway. Once there, select <prefix>-archive and/or <prefix>-thin-egress-app-EgressGateway, open the Dashboard, and use the base URL at the top of the page that is accompanied by the text Invoke this API at:. Make sure to append /token for the archive URL and /login for the thin egress app URL.


    Deploy Cumulus dashboard

    Dashboard Requirements

    Please note that the requirements are similar to the Cumulus stack deployment requirements. The installation instructions below include a step that will install/use the required node version referenced in the .nvmrc file in the dashboard repository.

    Prepare AWS

    Create S3 bucket for dashboard:

    • Create it, e.g. <prefix>-dashboard. Use the command line or console as you did when preparing AWS configuration (a consolidated CLI sketch follows this list).
    • Configure the bucket to host a website:
      • AWS S3 console: Select <prefix>-dashboard bucket then, "Properties" -> "Static Website Hosting", point to index.html
      • CLI: aws s3 website s3://<prefix>-dashboard --index-document index.html
    • The bucket's url will be http://<prefix>-dashboard.s3-website-<region>.amazonaws.com or you can find it on the AWS console via "Properties" -> "Static website hosting" -> "Endpoint"
    • Ensure the bucket's access permissions allow your deployment user access to write to the bucket
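
    A consolidated CLI sketch of the bucket creation and website configuration steps above (the bucket name is a placeholder, and your region or account settings may require additional flags):

    aws s3 mb s3://<prefix>-dashboard
    aws s3 website s3://<prefix>-dashboard --index-document index.html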

    Install dashboard

    To install the dashboard, clone the Cumulus dashboard repository into the root deploy directory and install dependencies with npm install:

    git clone https://github.com/nasa/cumulus-dashboard
    cd cumulus-dashboard
    nvm use
    npm install

    If you do not have the correct version of node installed, replace nvm use with nvm install $(cat .nvmrc) in the above example.

    Dashboard versioning

    By default, the master branch will be used for dashboard deployments. The master branch of the dashboard repo contains the most recent stable release of the dashboard.

    If you want to test unreleased changes to the dashboard, use the develop branch.

    Each release/version of the dashboard will have a tag in the dashboard repo. Release/version numbers will use semantic versioning (major/minor/patch).

    To checkout and install a specific version of the dashboard:

    git fetch --tags
    git checkout <version-number> # e.g. v1.2.0
    nvm use
    npm install

    If you do not have the correct version of node installed, replace nvm use with nvm install $(cat .nvmrc) in the above example.

    Building the dashboard

    Note: These environment variables are available during the build: APIROOT, DAAC_NAME, STAGE, HIDE_PDR. Any of these can be set on the command line to override the values contained in config.js when running the build below.

    To configure your dashboard for deployment, set the APIROOT environment variable to your app's API root [3].

    Build the dashboard from the dashboard repository root directory, cumulus-dashboard:

      APIROOT=<your_api_root> npm run build

    Dashboard deployment

    Deploy the dashboard to the S3 bucket from the cumulus-dashboard directory:

    Using AWS CLI:

      aws s3 sync dist s3://<prefix>-dashboard --acl public-read

    From the S3 Console:

    • Open the <prefix>-dashboard bucket and click 'Upload'. Add the contents of the 'dist' subdirectory to the upload, then select 'Next'. On the permissions window, allow the public to view, then select 'Upload'.

    You should be able to visit the dashboard website at http://<prefix>-dashboard.s3-website-<region>.amazonaws.com (or find the URL via <prefix>-dashboard -> "Properties" -> "Static website hosting" -> "Endpoint") and log in with a user that you configured for access in the Configure and Deploy the Cumulus Stack step.


    Cumulus Instance Sizing

    The Cumulus deployment's default sizing for Elasticsearch instances, EC2 instances, and Autoscaling Groups is small and designed for testing and cost savings. The default settings are likely not suitable for production workloads. Sizing is highly individual and dependent on expected load and archive size.

    Please be cognizant of costs as any change in size will affect your AWS bill. AWS provides a pricing calculator for estimating costs.

    Elasticsearch

    The mappings file contains all of the data types that will be indexed into Elasticsearch. Elasticsearch sizing is tied to your archive size, including your collections, granules, and workflow executions that will be stored.

    AWS provides documentation on calculating and configuring for sizing.

    In addition to size, you'll want to consider the number of nodes, which determines how the system reacts in the event of a failure.

    Configuration can be done in the data persistence module in elasticsearch_config and the cumulus module in es_index_shards.
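
    As a hypothetical illustration only (the exact keys should be confirmed against the data-persistence and cumulus module variable definitions), the relevant settings might look like:

    # data-persistence-tf/terraform.tfvars -- illustrative values, not recommendations
    elasticsearch_config = {
      domain_name    = "es"
      instance_count = 2
      instance_type  = "t3.small.elasticsearch"
      version        = "5.3"
      volume_size    = 10
    }

    # cumulus-tf/terraform.tfvars
    es_index_shards = 2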

    If you make changes to your Elasticsearch configuration, you will need to reindex for those changes to take effect.

    EC2 instances and autoscaling groups

    EC2 instances are used for long-running operations (e.g. generating a reconciliation report) and long-running workflow tasks. Configuration for your ECS cluster is achieved via Cumulus deployment variables.

    When configuring your ECS cluster, consider the following (an illustrative configuration sketch follows this list):

    • The EC2 instance type and EBS volume size needed to accommodate your workloads. Configured as ecs_cluster_instance_type and ecs_cluster_instance_docker_volume_size.
    • The minimum and desired number of instances on hand to accommodate your workloads. Configured as ecs_cluster_min_size and ecs_cluster_desired_size.
    • The maximum number of instances you will need and are willing to pay for to accommodate your heaviest workloads. Configured as ecs_cluster_max_size.
    • Your autoscaling parameters: ecs_cluster_scale_in_adjustment_percent, ecs_cluster_scale_out_adjustment_percent, ecs_cluster_scale_in_threshold_percent, and ecs_cluster_scale_out_threshold_percent.
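
    The following cumulus-tf/terraform.tfvars sketch uses the variable names above with purely illustrative values; appropriate numbers depend entirely on your workloads and budget:

    ecs_cluster_instance_type               = "t3.medium"
    ecs_cluster_instance_docker_volume_size = 50
    ecs_cluster_min_size                    = 1
    ecs_cluster_desired_size                = 2
    ecs_cluster_max_size                    = 4
    ecs_cluster_scale_in_adjustment_percent  = -5
    ecs_cluster_scale_in_threshold_percent   = 25
    ecs_cluster_scale_out_adjustment_percent = 10
    ecs_cluster_scale_out_threshold_percent  = 75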

    Footnotes


    1. Run terraform init if:

      • This is the first time deploying the module
      • You have added any additional child modules, including Cumulus components
      • You have updated the source for any of the child modules

    2. To add another redirect URI to your application: on the Earthdata home page, select "My Applications", scroll down to "Application Administration", and use the edit icon for your application. Then go to Manage -> Redirect URIs.

    3. The API root can be found a number of ways. The easiest is to note it in the output of the app deployment step, but you can also find it from the AWS console -> Amazon API Gateway -> APIs -> <prefix>-archive -> Dashboard, and read the URL at the top after "Invoke this API at".

    - + \ No newline at end of file diff --git a/docs/v13.0.0/deployment/postgres_database_deployment/index.html b/docs/v13.0.0/deployment/postgres_database_deployment/index.html index f75ce1b5b2d..32feca0e0fe 100644 --- a/docs/v13.0.0/deployment/postgres_database_deployment/index.html +++ b/docs/v13.0.0/deployment/postgres_database_deployment/index.html @@ -5,7 +5,7 @@ PostgreSQL Database Deployment | Cumulus Documentation - + @@ -16,7 +16,7 @@ cumulus-rds-tf that will deploy an AWS RDS Aurora Serverless PostgreSQL 10.2 compatible database cluster, and optionally provision a single deployment database with credentialed secrets for use with Cumulus.

    We have provided an example Terraform deployment using this module in the Cumulus template-deploy repository on GitHub.

    Use of this example involves:

    • Creating/configuring a Terraform module directory
    • Using Terraform to deploy resources to AWS

    Requirements

    Configuration/installation of this module requires the following:

    • Terraform
    • git
    • A VPC configured for use with Cumulus Core. This should match the subnets you provide when Deploying Cumulus to allow Core's lambdas to properly access the database.
    • At least two subnets across multiple AZs. These should match the subnets you provide as configuration when Deploying Cumulus, and should be within the same VPC.

    Needed Git Repositories

    Assumptions

    OS/Environment

    The instructions in this module require Linux/MacOS. While deployment via Windows is possible, it is unsupported.

    Terraform

    This document assumes knowledge of Terraform. If you are not comfortable working with Terraform, the introductory material on the Terraform website should bring you up to speed.

    For Cumulus specific instructions on installation of Terraform, refer to the main Cumulus Installation Documentation

    Aurora/RDS

    This document also assumes some basic familiarity with PostgreSQL databases and Amazon Aurora/RDS. If you're unfamiliar, consider perusing the AWS docs and the Aurora Serverless V1 docs.

    Prepare deployment repository

    If you are already working with an existing repository that has a configured rds-cluster-tf deployment for the version of Cumulus you intend to deploy or update, or you just need to configure this module for your repository, skip to Prepare AWS configuration.

    Clone the cumulus-template-deploy repo and name appropriately for your organization:

      git clone https://github.com/nasa/cumulus-template-deploy <repository-name>

    We will return to configuring this repo and using it for deployment below.

    Optional: Create a new repository

    Create a new repository on GitHub so that you can add your workflows and other modules to source control:

      git remote set-url origin https://github.com/<org>/<repository-name>
    git push origin master

    You can then add/commit changes as needed.

    Note: If you are pushing your deployment code to a git repo, make sure to add terraform.tf and terraform.tfvars to .gitignore, as these files will contain sensitive data related to your AWS account.


    Prepare AWS configuration

    To deploy this module, make sure that you have completed the following steps from the Cumulus deployment instructions, applied in similar fashion for this module:

    --

    Configure and deploy the module

    When configuring this module, please keep in mind that unlike a Cumulus deployment, this module should be deployed once to create the database cluster and re-deployed only to make changes to that configuration, upgrade the cluster, etc. This module does not need to be re-deployed for each Core update.

    These steps should be executed in the rds-cluster-tf directory of the template deploy repo that you previously cloned. Run the following to copy the example files:

    cd rds-cluster-tf/
    cp terraform.tf.example terraform.tf
    cp terraform.tfvars.example terraform.tfvars

    In terraform.tf, configure the remote state settings by substituting the appropriate values for the following (a backend configuration sketch follows this list):

    • bucket
    • dynamodb_table
    • PREFIX (whatever prefix you've chosen for your deployment)
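
    A sketch of what the resulting backend configuration might look like (bucket, key, and table names here are placeholders; follow the layout of terraform.tf.example):

    terraform {
      backend "s3" {
        region         = "us-east-1"
        bucket         = "PREFIX-state"
        key            = "PREFIX/rds-cluster/terraform.tfstate"
        dynamodb_table = "PREFIX-tf-locks"
      }
    }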

    Fill in the appropriate values in terraform.tfvars. See the rds-cluster-tf module variable definitions for more detail on all of the configuration options. A few notable configuration options are documented in the next section.

    Configuration Options

    • deletion_protection -- defaults to true. Set it to false if you want to be able to delete your cluster with a terraform destroy without manually updating the cluster.
    • db_admin_username -- cluster database administration username. Defaults to postgres.
    • db_admin_password -- required variable that specifies the admin user password for the cluster. To randomize this on each deployment, consider using a random_string resource as input (see the sketch after this list).
    • region -- defaults to us-east-1.
    • subnets -- requires at least 2 across different AZs. For use with Cumulus, these AZs should match the values you configure for your lambda_subnet_ids.
    • max_capacity -- the max ACUs the cluster is allowed to use. Carefully consider cost/performance concerns when setting this value.
    • min_capacity -- the minimum ACUs the cluster will scale to
    • provision_user_database -- Optional flag to allow module to provision a user database in addition to creating the cluster. Described in the next section.
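
    One way to randomize the admin password, sketched here under the assumption that you wire the value into the module call in your main.tf rather than terraform.tfvars, is the random_string resource from the hashicorp/random provider:

    resource "random_string" "db_admin_pass" {
      length  = 50
      upper   = true
      special = false
    }

    # then pass it to the module, e.g.: db_admin_password = random_string.db_admin_pass.result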

    Provision user and user database

    If you wish for the module to provision a PostgreSQL database on your new cluster and provide a secret for access in the module output, in addition to managing the cluster itself, the following configuration keys are required (an example sketch follows this list):

    • provision_user_database -- must be set to true, this configures the module to deploy a lambda that will create the user database, and update the provided configuration on deploy.
    • permissions_boundary_arn -- the permissions boundary to use when creating the roles the provisioning lambda will need for access. In most use cases this should be the same one used for the Cumulus Core deployment.
    • rds_user_password -- the value to set the user password to
    • prefix -- this value will be used to set a unique identifier for the ProvisionDatabase lambda, as well as to name the provisioned user/database.
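
    An illustrative terraform.tfvars sketch for these keys (all values are placeholders):

    provision_user_database  = true
    permissions_boundary_arn = "arn:aws:iam::<account-id>:policy/<permissions-boundary>"
    rds_user_password        = "<user-database-password>"
    prefix                   = "<prefix>"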

    Once configured, the module will deploy the lambda, and run it on each provision, creating the configured database if it does not exist, updating the user password if that value has been changed, and updating the output user database secret.

    Setting provision_user_database to false after provisioning will not result in removal of the configured database, as the lambda is non-destructive as configured in this module.

    Please Note: This functionality is limited in that it will only provision a single database/user and configure a basic database, and should not be used in scenarios where more complex configuration is required.

    Initialize Terraform

    Run terraform init

    You should see output like:

    * provider.aws: version = "~> 2.32"

    Terraform has been successfully initialized!

    Deploy

    Run terraform apply to deploy the resources.

    If re-applying this module, variables (e.g. engine_version, snapshot_identifier ) that force a recreation of the database cluster may result in data loss if deletion protection is disabled. Examine the changeset carefully for resources that will be re-created/destroyed before applying.

    Review the changeset, and assuming it looks correct, type yes when prompted to confirm that you want to create all of the resources.

    Assuming the operation is successful, you should see output similar to the following (this example omits the creation of a user database/lambdas/security groups):

    terraform apply

    An execution plan has been generated and is shown below.
    Resource actions are indicated with the following symbols:
    + create

    Terraform will perform the following actions:

    # module.rds_cluster.aws_db_subnet_group.default will be created
    + resource "aws_db_subnet_group" "default" {
    + arn = (known after apply)
    + description = "Managed by Terraform"
    + id = (known after apply)
    + name = (known after apply)
    + name_prefix = "xxxxxxxxx"
    + subnet_ids = [
    + "subnet-xxxxxxxxx",
    + "subnet-xxxxxxxxx",
    ]
    + tags = {
    + "Deployment" = "xxxxxxxxx"
    }
    }

    # module.rds_cluster.aws_rds_cluster.cumulus will be created
    + resource "aws_rds_cluster" "cumulus" {
    + apply_immediately = true
    + arn = (known after apply)
    + availability_zones = (known after apply)
    + backup_retention_period = 1
    + cluster_identifier = "xxxxxxxxx"
    + cluster_identifier_prefix = (known after apply)
    + cluster_members = (known after apply)
    + cluster_resource_id = (known after apply)
    + copy_tags_to_snapshot = false
    + database_name = "xxxxxxxxx"
    + db_cluster_parameter_group_name = (known after apply)
    + db_subnet_group_name = (known after apply)
    + deletion_protection = true
    + enable_http_endpoint = true
    + endpoint = (known after apply)
    + engine = "aurora-postgresql"
    + engine_mode = "serverless"
    + engine_version = "10.12"
    + final_snapshot_identifier = "xxxxxxxxx"
    + hosted_zone_id = (known after apply)
    + id = (known after apply)
    + kms_key_id = (known after apply)
    + master_password = (sensitive value)
    + master_username = "xxxxxxxxx"
    + port = (known after apply)
    + preferred_backup_window = "07:00-09:00"
    + preferred_maintenance_window = (known after apply)
    + reader_endpoint = (known after apply)
    + skip_final_snapshot = false
    + storage_encrypted = (known after apply)
    + tags = {
    + "Deployment" = "xxxxxxxxx"
    }
    + vpc_security_group_ids = (known after apply)

    + scaling_configuration {
    + auto_pause = true
    + max_capacity = 4
    + min_capacity = 2
    + seconds_until_auto_pause = 300
    + timeout_action = "RollbackCapacityChange"
    }
    }

    # module.rds_cluster.aws_secretsmanager_secret.rds_login will be created
    + resource "aws_secretsmanager_secret" "rds_login" {
    + arn = (known after apply)
    + id = (known after apply)
    + name = (known after apply)
    + name_prefix = "xxxxxxxxx"
    + policy = (known after apply)
    + recovery_window_in_days = 30
    + rotation_enabled = (known after apply)
    + rotation_lambda_arn = (known after apply)
    + tags = {
    + "Deployment" = "xxxxxxxxx"
    }

    + rotation_rules {
    + automatically_after_days = (known after apply)
    }
    }

    # module.rds_cluster.aws_secretsmanager_secret_version.rds_login will be created
    + resource "aws_secretsmanager_secret_version" "rds_login" {
    + arn = (known after apply)
    + id = (known after apply)
    + secret_id = (known after apply)
    + secret_string = (sensitive value)
    + version_id = (known after apply)
    + version_stages = (known after apply)
    }

    # module.rds_cluster.aws_security_group.rds_cluster_access will be created
    + resource "aws_security_group" "rds_cluster_access" {
    + arn = (known after apply)
    + description = "Managed by Terraform"
    + egress = (known after apply)
    + id = (known after apply)
    + ingress = (known after apply)
    + name = (known after apply)
    + name_prefix = "cumulus_rds_cluster_access_ingress"
    + owner_id = (known after apply)
    + revoke_rules_on_delete = false
    + tags = {
    + "Deployment" = "xxxxxxxxx"
    }
    + vpc_id = "vpc-xxxxxxxxx"
    }

    # module.rds_cluster.aws_security_group_rule.rds_security_group_allow_PostgreSQL will be created
    + resource "aws_security_group_rule" "rds_security_group_allow_postgres" {
    + from_port = 5432
    + id = (known after apply)
    + protocol = "tcp"
    + security_group_id = (known after apply)
    + self = true
    + source_security_group_id = (known after apply)
    + to_port = 5432
    + type = "ingress"
    }

    Plan: 6 to add, 0 to change, 0 to destroy.

    Do you want to perform these actions?
    Terraform will perform the actions described above.
    Only 'yes' will be accepted to approve.

    Enter a value: yes

    module.rds_cluster.aws_db_subnet_group.default: Creating...
    module.rds_cluster.aws_security_group.rds_cluster_access: Creating...
    module.rds_cluster.aws_secretsmanager_secret.rds_login: Creating...

    Then, after the resources are created:

    Apply complete! Resources: X added, 0 changed, 0 destroyed.
    Releasing state lock. This may take a few moments...

    Outputs:

    admin_db_login_secret_arn = arn:aws:secretsmanager:us-east-1:xxxxxxxxx:secret:xxxxxxxxxx20210407182709367700000002-dpmdR
    admin_db_login_secret_version = xxxxxxxxx
    rds_endpoint = xxxxxxxxx.us-east-1.rds.amazonaws.com
    security_group_id = xxxxxxxxx
    user_credentials_secret_arn = arn:aws:secretsmanager:us-east-1:xxxxx:secret:xxxxxxxxxx20210407182709367700000002-dpmpXA

    Note the output values for admin_db_login_secret_arn (and, optionally, user_credentials_secret_arn). These provide the AWS Secrets Manager secrets required to access the database as the administrative user and, optionally, the user database credentials Cumulus requires (an example retrieval command follows the field list below).

    The content of each of these secrets is of the form:

    {
      "database": "postgres",
      "dbClusterIdentifier": "clusterName",
      "engine": "postgres",
      "host": "xxx",
      "password": "defaultPassword",
      "port": 5432,
      "username": "xxx"
    }

    • database -- the PostgreSQL database used by the configured user
    • dbClusterIdentifier -- the value set by the cluster_identifier variable in the terraform module
    • engine -- the Aurora/RDS database engine
    • host -- the RDS service host for the database in the form (dbClusterIdentifier)-(AWS ID string).(region).rds.amazonaws.com
    • password -- the database password
    • username -- the account username
    • port -- The database connection port, should always be 5432
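
    For example, you can retrieve and inspect one of these secrets with the AWS CLI, substituting the ARN from your Terraform outputs:

    aws secretsmanager get-secret-value \
        --secret-id <user_credentials_secret_arn> \
        --query SecretString \
        --output text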

    Next Steps

    The database cluster has been created/updated! From here you can continue to add additional user accounts, databases and other database configuration.

    - + \ No newline at end of file diff --git a/docs/v13.0.0/deployment/share-s3-access-logs/index.html b/docs/v13.0.0/deployment/share-s3-access-logs/index.html index fe04988d2bb..e3f6f8c5e05 100644 --- a/docs/v13.0.0/deployment/share-s3-access-logs/index.html +++ b/docs/v13.0.0/deployment/share-s3-access-logs/index.html @@ -5,14 +5,14 @@ Share S3 Access Logs | Cumulus Documentation - +
    Version: v13.0.0

    Share S3 Access Logs

    It is possible through Cumulus to share S3 access logs across multiple S3 packages using the S3 replicator package.

    S3 Replicator

    The S3 Replicator is a node package that contains a simple lambda function, associated permissions, and the Terraform instructions to replicate create-object events from one S3 bucket to another.

    First ensure that you have enabled S3 Server Access Logging.

    Next configure your config.tfvars as described in the s3-replicator/README.md to correspond to your deployment. The source_bucket and source_prefix are determined by how you enabled the S3 Server Access Logging.

    To deploy the s3-replicator with Cumulus, you will need to add the module to your Terraform main.tf definition, e.g.:

    module "s3-replicator" {
    source = "<path to s3-replicator.zip>"
    prefix = var.prefix
    vpc_id = var.vpc_id
    subnet_ids = var.subnet_ids
    permissions_boundary = var.permissions_boundary_arn
    source_bucket = var.s3_replicator_config.source_bucket
    source_prefix = var.s3_replicator_config.source_prefix
    target_bucket = var.s3_replicator_config.target_bucket
    target_prefix = var.s3_replicator_config.target_prefix
    }

    The Terraform source package can be found on the Cumulus GitHub release page under the asset tab as terraform-aws-cumulus-s3-replicator.zip.

    ESDIS Metrics

    In the NGAP environment, the ESDIS Metrics team has set up an ELK stack to process logs from Cumulus instances. To use this system, you must deliver any S3 Server Access logs that Cumulus creates.

    Configure the S3 replicator as described above using the target_bucket and target_prefix provided by the metrics team.

    The metrics team has taken care of setting up Logstash to ingest the files that get delivered to their bucket into their Elasticsearch instance.

    - + \ No newline at end of file diff --git a/docs/v13.0.0/deployment/terraform-best-practices/index.html b/docs/v13.0.0/deployment/terraform-best-practices/index.html index 6088bf360c2..8fad2f15d4c 100644 --- a/docs/v13.0.0/deployment/terraform-best-practices/index.html +++ b/docs/v13.0.0/deployment/terraform-best-practices/index.html @@ -5,7 +5,7 @@ Terraform Best Practices | Cumulus Documentation - + @@ -88,7 +88,7 @@ AWS CLI command, replacing PREFIX with your deployment prefix name:

    aws resourcegroupstaggingapi get-resources \
    --query "ResourceTagMappingList[].ResourceARN" \
    --tag-filters Key=Deployment,Values=PREFIX

    Ideally, the output should be an empty list, but if it is not, then you may need to manually delete the listed resources.

    Configuring the Cumulus deployment: link
    Restoring a previous version: link

    - + \ No newline at end of file diff --git a/docs/v13.0.0/deployment/thin_egress_app/index.html b/docs/v13.0.0/deployment/thin_egress_app/index.html index e15907fb0cb..458e29d43ab 100644 --- a/docs/v13.0.0/deployment/thin_egress_app/index.html +++ b/docs/v13.0.0/deployment/thin_egress_app/index.html @@ -5,7 +5,7 @@ Using the Thin Egress App for Cumulus distribution | Cumulus Documentation - + @@ -13,7 +13,7 @@
    Version: v13.0.0

    Using the Thin Egress App for Cumulus distribution

    The Thin Egress App (TEA) is an app running in Lambda that allows retrieving data from S3 using temporary links and provides URS integration.

    Configuring a TEA deployment

    TEA is deployed using Terraform modules. Refer to these instructions for guidance on how to integrate new components with your deployment.

    The cumulus-template-deploy repository cumulus-tf/main.tf contains a thin_egress_app for distribution.

    The TEA module provides these instructions showing how to add it to your deployment; the following are instructions to configure the thin_egress_app module in your Cumulus deployment.

    Create a secret for signing Thin Egress App JWTs

    The Thin Egress App uses JWTs internally to authenticate requests and requires a secret stored in AWS Secrets Manager containing SSH keys that are used to sign the JWTs.

    See the Thin Egress App documentation on how to create this secret with the correct values. It will be used later to set the thin_egress_jwt_secret_name variable when deploying the Cumulus module.

    bucket_map.yaml

    The Thin Egress App uses a bucket_map.yaml file to determine which buckets to serve. Documentation of the file format is available here.

    The default Cumulus module generates a file at s3://${system_bucket}/distribution_bucket_map.json.

    The configuration file is a simple json mapping of the form:

    {
      "daac-public-data-bucket": "/path/to/this/kind/of/data"
    }

    Please note: Cumulus only supports a one-to-one mapping of bucket->TEA path for 'distribution' buckets.

    Optionally configure a custom bucket map

    A simple config would look something like this:

    bucket_map.yaml

    MAP:
      my-protected: my-protected
      my-public: my-public

    PUBLIC_BUCKETS:
      - my-public

    Please note: your custom bucket map must include mappings for all of the protected and public buckets specified in the buckets variable in cumulus-tf/terraform.tfvars, otherwise Cumulus may not be able to determine the correct distribution URL for ingested files and you may encounter errors.

    Optionally configure shared variables

    The cumulus module deploys certain components that interact with TEA. As a result, the cumulus module requires that if you are specifying a value for the stage_name variable to the TEA module, you must use the same value for the tea_api_gateway_stage variable to the cumulus module.

    One way to keep these variable values in sync across the modules is to use Terraform local values to define values to use for the variables for both modules. This approach is shown in the Cumulus core example deployment code.
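
    A minimal sketch of that approach, assuming module names of thin_egress_app and cumulus as in the template deployment and showing only the relevant arguments:

    locals {
      tea_stage_name = "DEV"
    }

    module "thin_egress_app" {
      # ... other TEA configuration ...
      stage_name = local.tea_stage_name
    }

    module "cumulus" {
      # ... other Cumulus configuration ...
      tea_api_gateway_stage = local.tea_stage_name
    }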

    - + \ No newline at end of file diff --git a/docs/v13.0.0/deployment/upgrade-readme/index.html b/docs/v13.0.0/deployment/upgrade-readme/index.html index 9669405f4a9..d1eab5a3709 100644 --- a/docs/v13.0.0/deployment/upgrade-readme/index.html +++ b/docs/v13.0.0/deployment/upgrade-readme/index.html @@ -5,7 +5,7 @@ Upgrading Cumulus | Cumulus Documentation - + @@ -15,7 +15,7 @@ deployment functions correctly. Please refer to some recommended smoke tests given above, and consider additional tests appropriate for your particular deployment and environment.

    Update Cumulus Dashboard

    If there are breaking (or otherwise significant) changes to the Cumulus API, you should also upgrade your Cumulus Dashboard deployment to use the version of the Cumulus API matching the version of Cumulus to which you are migrating.

    - + \ No newline at end of file diff --git a/docs/v13.0.0/development/forked-pr/index.html b/docs/v13.0.0/development/forked-pr/index.html index 2dd653111c4..6d3cc7b2160 100644 --- a/docs/v13.0.0/development/forked-pr/index.html +++ b/docs/v13.0.0/development/forked-pr/index.html @@ -5,13 +5,13 @@ Issuing PR From Forked Repos | Cumulus Documentation - +
    Version: v13.0.0

    Issuing PR From Forked Repos

    Fork the Repo

    • Fork the Cumulus repo
    • Create a new branch from the branch you'd like to contribute to
    • If an issue doesn't already exist, submit one (see above)

    Create a Pull Request

    Reviewing PRs from Forked Repos

    Upon submission of a pull request, the Cumulus development team will review the code.

    Once the code passes an initial review, the team will run the CI tests against the proposed update.

    The request will then be merged or declined, or an adjustment to the code will be requested via the issue opened with the original PR request.

    PRs from forked repos cannot be directly merged to master. Cumulus reviewers must follow these steps before completing the review process:

    1. Create a new branch:

        git checkout -b from-<name-of-the-branch> master
    2. Push the new branch to GitHub

    3. Change the destination of the forked PR to the new branch that was just pushed

      Screenshot of Github interface showing how to change the base branch of a pull request

    4. After code review and approval, merge the forked PR to the new branch.

    5. Create a PR for the new branch to master.

    6. If the CI tests pass, merge the new branch to master and close the issue. If the CI tests do not pass, request an amended PR from the original author, or resolve failures as appropriate.

    - + \ No newline at end of file diff --git a/docs/v13.0.0/development/integration-tests/index.html b/docs/v13.0.0/development/integration-tests/index.html index 9e17652bffa..074e53c93d3 100644 --- a/docs/v13.0.0/development/integration-tests/index.html +++ b/docs/v13.0.0/development/integration-tests/index.html @@ -5,7 +5,7 @@ Integration Tests | Cumulus Documentation - + @@ -19,7 +19,7 @@ in the commit message.

    If you create a new stack and want to be able to run integration tests against it in CI, you will need to add it to bamboo/select-stack.js.

    - + \ No newline at end of file diff --git a/docs/v13.0.0/development/quality-and-coverage/index.html b/docs/v13.0.0/development/quality-and-coverage/index.html index 53cccf6db57..251df7c9703 100644 --- a/docs/v13.0.0/development/quality-and-coverage/index.html +++ b/docs/v13.0.0/development/quality-and-coverage/index.html @@ -5,7 +5,7 @@ Code Coverage and Quality | Cumulus Documentation - + @@ -23,7 +23,7 @@ here.

    To run linting on the markdown files, run npm run lint-md.

    Audit

    This project uses audit-ci to run a security audit on the package dependency tree. This must pass prior to merge. The configured rules for audit-ci can be found here.

    To execute an audit, run npm run audit.

    - + \ No newline at end of file diff --git a/docs/v13.0.0/development/release/index.html b/docs/v13.0.0/development/release/index.html index a014dfeed7b..0273519365e 100644 --- a/docs/v13.0.0/development/release/index.html +++ b/docs/v13.0.0/development/release/index.html @@ -5,7 +5,7 @@ Versioning and Releases | Cumulus Documentation - + @@ -15,7 +15,7 @@ It's useful to use the search feature of your code editor or grep to see if there any references to the old package versions. In bash shell you can run

    find . -name package.json -exec grep -nH "@cumulus/.*MAJOR\.MINOR\.PATCH.*" {} \;

    Verify that each of those is updated to the new MAJOR.MINOR.PATCH version you are trying to release.

    A similar search for alpha and beta versions should be run on the release version and any problems should be fixed.

    find . -name package.json -exec grep -nHE "MAJOR\.MINOR\.PATCH.*(alpha|beta)" {} \;

    3. Check Cumulus Dashboard PRs for Version Bump

    There may be unreleased changes in the Cumulus Dashboard project that rely on this unreleased Cumulus Core version.

    If there exists a PR in the cumulus-dashboard repo with a name containing "Version Bump for Next Cumulus API Release":

    • There will be a placeholder change-me value that should be replaced with the Cumulus Core to-be-released-version.
    • Mark that PR as ready to be reviewed.

    4. Update CHANGELOG.md

    Update the CHANGELOG.md. Put a header under the Unreleased section with the new version number and the date.

    Add a link reference for the github "compare" view at the bottom of the CHANGELOG.md, following the existing pattern. This link reference should create a link in the CHANGELOG's release header to changes in the corresponding release.

    5. Update DATA_MODEL_CHANGELOG.md

    Similar to #4, make sure the DATA_MODEL_CHANGELOG is updated if there are data model changes in the release, and the link reference at the end of the document is updated as appropriate.

    6. Update CONTRIBUTORS.md

    ./bin/update-contributors.sh
    git add CONTRIBUTORS.md

    Commit and push these changes, if any.

    7. Update Cumulus package API documentation

    Update auto-generated API documentation for any Cumulus packages that have it:

    npm run docs-build-packages

    Commit and push these changes, if any.

    8. Cut new version of Cumulus Documentation

    If this is a backport, do not create a new version of the documentation. For various reasons, we do not merge backports back to master, other than changelog notes. Documentation changes for backports will not be published to our documentation website.

    cd website
    npm run version ${release_version}
    git add .

    Where ${release_version} corresponds to the version tag v1.2.3, for example.

    Commit and push these changes.

    9. Create a pull request against the minor version branch

    1. Push the release branch (e.g. release-1.2.3) to GitHub.

    2. Create a PR against the minor version base branch (e.g. release-1.2.x).

    3. Configure Bamboo to run automated tests against this PR by finding the branch plan for the release branch (release-1.2.3) and setting only these variables:

      • GIT_PR: true
      • SKIP_AUDIT: true

      IMPORTANT: Do NOT set the PUBLISH_FLAG variable to true for this branch plan. The actual publishing of the release will be handled by a separate, manually triggered branch plan.

      Screenshot of Bamboo CI interface showing the configuration of the GIT_PR branch variable to have a value of &quot;true&quot;

    4. Verify that the Bamboo build for the PR succeeds and then merge to the minor version base branch (release-1.2.x).

      • It is safe to do a squash merge in this instance, but not required
    5. You may delete your release branch (release-1.2.3) after merging to the base branch.

    10. Create a git tag for the release

    Check out the minor version base branch (release-1.2.x) now that your changes are merged in and do a git pull.

    Ensure you are on the latest commit.

    Create and push a new git tag:

    git tag -a vMAJOR.MINOR.PATCH -m "Release MAJOR.MINOR.PATCH"
    git push origin vMAJOR.MINOR.PATCH

    e.g.:

    git tag -a v9.1.0 -m "Release 9.1.0"
    git push origin v9.1.0

    11. Publishing the release

    Publishing of new releases is handled by a custom Bamboo branch plan and is manually triggered.

    The reasons for using a separate branch plan to handle releases instead of the branch plan for the minor version (e.g. release-1.2.x) are:

    • The Bamboo build for the minor version release branch is triggered automatically on any commits to that branch, whereas we want to manually control when the release is published.
    • We want to verify that integration tests have passed on the Bamboo build for the minor version release branch before we manually trigger the release, so that we can be sure that our code is safe to release.

    If this is a new minor version branch, then you will need to create a new Bamboo branch plan for publishing the release following the instructions below:

    Creating a Bamboo branch plan for the release

    • In the Cumulus Core project (https://ci.earthdata.nasa.gov/browse/CUM-CBA), click Actions -> Configure Plan in the top right.

    • Next to Plan branch click the rightmost button that displays Create Plan Branch upon hover.

    • Click Create plan branch manually.

    • Add the values in that list. Choose a display name that makes it very clear this is a deployment branch plan. Release (minor version branch name) seems to work well (e.g. Release (1.2.x)).

      • Make sure you enter the correct branch name (e.g. release-1.2.x).
    • Important Deselect Enable Branch - if you do not do this, it will immediately fire off a build.

    • Immediately, on the Branch Details page, enable Change trigger and set the Trigger type to manual; this will prevent commits to the branch from triggering the build plan. You should have been redirected to the Branch Details tab after creating the plan. If not, navigate to the branch from the list where you clicked Create Plan Branch in the previous step.

    • Go to the Variables tab. Ensure that you are on your branch plan and not the master plan: you should not see a large list of configured variables, but instead a dropdown allowing you to select variables to override, and the tab title will be Branch Variables. Then set the branch variables as follows:

      • DEPLOYMENT: cumulus-from-npm-tf (except in special cases such as incompatible backport branches)
        • If this variable is not set, it will default to the deployment name for the last committer on the branch
      • USE_CACHED_BOOTSTRAP: false
      • USE_TERRAFORM_ZIPS: true (IMPORTANT: MUST be set in order to run integration tests against the .zip files published during the build so that we are actually testing our released files)
      • GIT_PR: true
      • SKIP_AUDIT: true
      • PUBLISH_FLAG: true
    • Enable the branch from the Branch Details page.

    • Run the branch using the Run button in the top right.

    Bamboo will build and run lint and unit tests against that tagged release, publish the new packages to NPM, and then run the integration tests using those newly released packages.

    12. Create a new Cumulus release on github

    The CI release scripts will automatically create a GitHub release based on the release version tag, as well as upload artifacts to the Github release for the Terraform modules provided by Cumulus. The Terraform release artifacts include:

    • A multi-module Terraform .zip artifact containing filtered copies of the tf-modules, packages, and tasks directories for use as Terraform module sources.
    • An S3 replicator module
    • A workflow module
    • A distribution API module
    • An ECS service module

    Just make sure to verify the appropriate .zip files are present on Github after the release process is complete.

    13. Merge base branch back to master

    Finally, you need to reproduce the version update changes back to master.

    If this is the latest version, you can simply create a PR to merge the minor version base branch back to master.

    Do not merge master back into the release branch since we want the release branch to just have the code from the release. Instead, create a new branch off of the release branch and merge that to master. You can freely merge master into this branch and delete it when it is merged to master.

    If this is a backport, you will need to create a PR that ports the changelog updates back to master. It is important in this changelog note to call it out as a backport. For example, fixes in backport version 1.14.5 may not be available in 1.15.0 because the fix was introduced in 1.15.3.

    Troubleshooting

    Delete and regenerate the tag

    To delete a published tag to re-tag, follow these steps:

    git tag -d vMAJOR.MINOR.PATCH
    git push -d origin vMAJOR.MINOR.PATCH

    e.g.:

    git tag -d v9.1.0
    git push -d origin v9.1.0
    - + \ No newline at end of file diff --git a/docs/v13.0.0/docs-how-to/index.html b/docs/v13.0.0/docs-how-to/index.html index fb9e1b90ac2..e617c57d87e 100644 --- a/docs/v13.0.0/docs-how-to/index.html +++ b/docs/v13.0.0/docs-how-to/index.html @@ -5,13 +5,13 @@ Cumulus Documentation: How To's | Cumulus Documentation - +
    Version: v13.0.0

    Cumulus Documentation: How To's

    Cumulus Docs Installation

    Run a Local Server

    Environment variables DOCSEARCH_API_KEY and DOCSEARCH_INDEX_NAME must be set for search to work. At the moment, search is only truly functional on prod because that is the only website we have registered to be indexed with DocSearch (see below on search).

    git clone git@github.com:nasa/cumulus
    cd cumulus
    npm run docs-install
    npm run docs-serve

    Note: docs-build will build the documents into website/build.

    Cumulus Documentation

    Our project documentation is hosted on GitHub Pages. The resources published to this website are housed in the docs/ directory at the top of the Cumulus repository. Those resources primarily consist of markdown files and images.

    We use the open-source static website generator Docusaurus to build html files from our markdown documentation, add some organization and navigation, and provide some other niceties in the final website (search, easy templating, etc.).

    Add a New Page and Sidebars

    Adding a new page should be as simple as writing some documentation in markdown, placing it under the correct directory in the docs/ folder and adding some configuration values wrapped by --- at the top of the file. There are many files that already have this header which can be used as reference.

    ---
    id: doc-unique-id # unique id for this document. This must be unique across ALL documentation under docs/
    title: Title Of Doc # Whatever title you feel like adding. This will show up as the index to this page on the sidebar.
    hide_title: false
    ---

    Note: To have the new page show up in a sidebar the designated id must be added to a sidebar in the website/sidebars.js file. Docusaurus has an in depth explanation of sidebars here.

    Versioning Docs

    We lean heavily on Docusaurus for versioning. Their suggestions and walk-through can be found here. It is worth noting that we would like the Documentation versions to match up directly with release versions. Cumulus versioning is explained in the Versioning Docs.

    Search

    Search on our documentation site is taken care of by DocSearch. We have been provided with an apiKey and an indexName by DocSearch that we include in our website/siteConfig.js file. The rest, indexing and actual searching, we leave to DocSearch. Our builds expect environment variables for both these values to exist - DOCSEARCH_API_KEY and DOCSEARCH_INDEX_NAME.

    Add a new task

    The tasks list in docs/tasks.md is generated from the list of task packages in the tasks folder. Do not edit the docs/tasks.md file directly.

    Read more about adding a new task.

    Editing the tasks.md header or template

    Look at the bin/build-tasks-doc.js and bin/tasks-header.md files to edit the output of the tasks build script.

    Editing diagrams

    For some diagrams included in the documentation, the raw source is included in the docs/assets/raw directory to allow for easy updating in the future:

    • assets/interfaces.svg -> assets/raw/interfaces.drawio (generated using draw.io)

    Deployment

    The master branch is automatically built and deployed to gh-pages branch. The gh-pages branch is served by Github Pages. Do not make edits to the gh-pages branch.

    - + \ No newline at end of file diff --git a/docs/v13.0.0/external-contributions/index.html b/docs/v13.0.0/external-contributions/index.html index b2718f43ea7..3b6884cd6d7 100644 --- a/docs/v13.0.0/external-contributions/index.html +++ b/docs/v13.0.0/external-contributions/index.html @@ -5,13 +5,13 @@ External Contributions | Cumulus Documentation - +
    Version: v13.0.0

    External Contributions

    Contributions to Cumulus may be made in the form of PRs to the repositories directly or through externally developed tasks and components. Cumulus is designed as an ecosystem that leverages Terraform deployments and AWS Step Functions to easily integrate external components.

    This list may not be exhaustive and represents components that are open source, owned externally, and that have been tested with the Cumulus system. For more information and contributing guidelines, visit the respective GitHub repositories.

    Distribution

    The ASF Thin Egress App is used by Cumulus for distribution. TEA can be deployed with Cumulus or as part of other applications to distribute data.

    Operational Cloud Recovery Archive (ORCA)

    ORCA can be deployed with Cumulus to provide a customizable baseline for creating and managing operational backups.

    Workflow Tasks

    CNM

    PO.DAAC provides two workflow tasks to be used with the Cloud Notification Mechanism (CNM) Schema: CNM to Granule and CNM Response.

    See the CNM workflow data cookbook for an example of how these can be used in a Cumulus ingest workflow.

    DMR++ Generation

    GHRC has provided a DMR++ Generation workflow task. This task is meant to be used in conjunction with Cumulus' Hyrax Metadata Updates workflow task.

    - + \ No newline at end of file diff --git a/docs/v13.0.0/faqs/index.html b/docs/v13.0.0/faqs/index.html index 0bfd8a4da49..f0da360a0c7 100644 --- a/docs/v13.0.0/faqs/index.html +++ b/docs/v13.0.0/faqs/index.html @@ -5,13 +5,13 @@ Frequently Asked Questions | Cumulus Documentation - +
    Version: v13.0.0

    Frequently Asked Questions

    Below are some commonly asked questions that you may encounter that can assist you along the way when working with Cumulus.

    General

    How do I deploy a new instance in Cumulus?

    Answer: For steps on the Cumulus deployment process go to How to Deploy Cumulus.

    What prerequisites are needed to setup Cumulus?

    Answer: You will need access to the AWS console and an Earthdata login before you can deploy Cumulus.

    What is the preferred web browser for the Cumulus environment?

    Answer: Our preferred web browser is the latest version of Google Chrome.

    How do I quickly troubleshoot an issue in Cumulus?

    Answer: To troubleshoot and fix issues in Cumulus reference our recommended solutions in Troubleshooting Cumulus.

    Where can I get support help?

    Answer: The following options are available for assistance:

    • Cumulus: Outside NASA users should file a GitHub issue and inside NASA users should file a JIRA issue.
    • AWS: You can create a case in the AWS Support Center, accessible via your AWS Console.

    Integrators & Developers

    What is a Cumulus integrator?

    Answer: Those who are working within Cumulus and AWS for deployments and to manage workflows. They may perform the following functions:

    • Configure and deploy Cumulus to the AWS environment
    • Configure Cumulus workflows
    • Write custom workflow tasks
    What are the steps if I run into an issue during deployment?

    Answer: If you encounter an issue with your deployment go to the Troubleshooting Deployment guide.

    Is Cumulus customizable and flexible?

    Answer: Yes. Cumulus is a modular architecture that allows you to decide which components you want/need to deploy. These components are maintained as Terraform modules.

    What are Terraform modules?

    Answer: They are modules that are composed to create a Cumulus deployment, which gives integrators the flexibility to choose the components of Cumulus that they want/need. To view Cumulus maintained modules or steps on how to create a module go to Terraform modules.

    Where do I find Terraform module variables?

    Answer: Go here for a list of Cumulus maintained variables.

    What is a Cumulus workflow?

    Answer: A workflow is a provider-configured set of steps that describe the process to ingest data. Workflows are defined using AWS Step Functions. For more details, we suggest visiting here.

    How do I set up a Cumulus workflow?

    Answer: You will need to create a provider, have an associated collection (add a new one), and generate a new rule first. Then you can set up a Cumulus workflow by following these steps here.

    What are the common use cases that a Cumulus integrator encounters?

    Answer: The following are some examples of possible use cases you may see:


    Operators

    What is a Cumulus operator?

    Answer: Those who ingest, archive, and troubleshoot datasets (called collections in Cumulus). Your daily activities might include, but are not limited to, the following:

    • Ingesting datasets
    • Maintaining historical data ingest
    • Starting and stopping data handlers
    • Managing collections
    • Managing provider definitions
    • Creating, enabling, and disabling rules
    • Investigating errors for granules and deleting or re-ingesting granules
    • Investigating errors in executions and isolating failed workflow step(s)
    What are the common use cases that a Cumulus operator encounters?

    Answer: The following are some examples of possible use cases you may see:

    Can you re-run a workflow execution in AWS?

    Answer: Yes. For steps on how to re-run a workflow execution go to Re-running workflow executions in the Cumulus Operator Docs.

    - + \ No newline at end of file diff --git a/docs/v13.0.0/features/ancillary_metadata/index.html b/docs/v13.0.0/features/ancillary_metadata/index.html index 64fb4184aa6..ebdbe708e35 100644 --- a/docs/v13.0.0/features/ancillary_metadata/index.html +++ b/docs/v13.0.0/features/ancillary_metadata/index.html @@ -5,7 +5,7 @@ Ancillary Metadata Export | Cumulus Documentation - + @@ -13,7 +13,7 @@
    Version: v13.0.0

    Ancillary Metadata Export

    This feature utilizes the type key on a files object in a Cumulus granule. It uses the key to provide a mechanism where granule discovery, processing and other tasks can set and use this value to facilitate metadata export to CMR.

    Tasks setting type

    Discover Granules

    Uses the Collection type key to set the value for files on discovered granules in its output.

    Parse PDR

    Uses a task-specific mapping to map PDR 'FILE_TYPE' to a CNM type to set type on granules from the PDR.

    CNMToCMALambdaFunction

    Natively supports types that are included in incoming messages to a CNM Workflow.

    Tasks using type

    Move Granules

    Uses the granule file type key to update UMM/ECHO 10 CMR files passed in as candidates to the task. This task adds the external facing URLs to the CMR metadata file based on the type. See the file tracking data cookbook for a detailed mapping. If a non-CNM type is specified, the task assumes it is a 'data' file.

    - + \ No newline at end of file diff --git a/docs/v13.0.0/features/backup_and_restore/index.html b/docs/v13.0.0/features/backup_and_restore/index.html index e37e1a1922f..ae218fada46 100644 --- a/docs/v13.0.0/features/backup_and_restore/index.html +++ b/docs/v13.0.0/features/backup_and_restore/index.html @@ -5,7 +5,7 @@ Cumulus Backup and Restore | Cumulus Documentation - + @@ -52,7 +52,7 @@ writing to the old cluster.

  • Set the snapshot_identifier variable to the snapshot you wish to create, and configure the module like a new deployment, with a unique cluster_identifier

  • Deploy the module using terraform apply

  • Once deployed, verify the cluster has the expected data

  • Redeploy the data persistence and Cumulus deployments - You should not need to reconfigure either, as the secret ARN and the security group should not change, however double-check the configured values are as expected

  • - + \ No newline at end of file diff --git a/docs/v13.0.0/features/dead_letter_archive/index.html b/docs/v13.0.0/features/dead_letter_archive/index.html index 24604f0fea3..2e1cfb44770 100644 --- a/docs/v13.0.0/features/dead_letter_archive/index.html +++ b/docs/v13.0.0/features/dead_letter_archive/index.html @@ -5,13 +5,13 @@ Cumulus Dead Letter Archive | Cumulus Documentation - +
    Version: v13.0.0

    Cumulus Dead Letter Archive

    This documentation explains the Cumulus dead letter archive and associated functionality.

    DB Records DLQ Archive

    The Cumulus system contains a number of dead letter queues. Perhaps the most important system lambda function supported by a DLQ is the sfEventSqsToDbRecords lambda function which parses Cumulus messages from workflow executions to generate and write database records to the Cumulus database.

    As of Cumulus v9+, the dead letter queue for this lambda (named sfEventSqsToDbRecordsDeadLetterQueue) has been updated with a consumer lambda that will automatically write any incoming records to the S3 system bucket, under the path <stackName>/dead-letter-archive/sqs/. This will allow integrators and operators engaged in debugging missing records to inspect any Cumulus messages which failed to process and did not result in the successful creation of database records.
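
    For example, you can list the archived messages with the AWS CLI, substituting your system bucket and stack name:

    aws s3 ls s3://<system-bucket>/<stackName>/dead-letter-archive/sqs/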

    Dead Letter Archive recovery

    In addition to the above, as of Cumulus v9+, the Cumulus API also contains a new endpoint at /deadLetterArchive/recoverCumulusMessages.

    Sending a POST request to this endpoint will trigger a Cumulus AsyncOperation that will attempt to reprocess (and if successful delete) all Cumulus messages in the dead letter archive, using the same underlying logic as the existing sfEventSqsToDbRecords.
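
    For example, such a request might look like the following (assuming you have already obtained a Cumulus API access token and substituting your archive API URL):

    curl -X POST https://<archive-api-url>/deadLetterArchive/recoverCumulusMessages \
        --header "Authorization: Bearer <access-token>"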

    This endpoint may prove particularly useful when recovering from an extended or unexpected database outage, where messages failed to process due to an external outage and there is no essential malformation of each Cumulus message.
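
    A minimal sketch of invoking this endpoint with Python's requests library is shown below; the API URL and token are placeholders for your deployment's archive API and a valid bearer token:

    import requests

    API_URL = "https://example.com"                      # placeholder Cumulus API URL
    HEADERS = {"Authorization": "Bearer ReplaceWithTheToken"}

    # Trigger the AsyncOperation that reprocesses the dead letter archive
    resp = requests.post(f"{API_URL}/deadLetterArchive/recoverCumulusMessages", headers=HEADERS)
    resp.raise_for_status()

    # The response should identify the started AsyncOperation, which can then be
    # checked via the /asyncOperations endpoint
    print(resp.json())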

    - + \ No newline at end of file diff --git a/docs/v13.0.0/features/dead_letter_queues/index.html b/docs/v13.0.0/features/dead_letter_queues/index.html index e73022d48f7..1e92c4838ea 100644 --- a/docs/v13.0.0/features/dead_letter_queues/index.html +++ b/docs/v13.0.0/features/dead_letter_queues/index.html @@ -5,13 +5,13 @@ Dead Letter Queues | Cumulus Documentation - +
    Version: v13.0.0

    Dead Letter Queues

    startSF SQS queue

    The workflow-trigger for the startSF queue has a Redrive Policy set up that directs any failed attempts to pull from the workflow start queue to an SQS Dead Letter Queue.

    This queue can then be monitored for failures to initiate a workflow. Please note that workflow failures will not show up in this queue, only repeated failures to trigger a workflow.

    Named Lambda Dead Letter Queues

    Cumulus provides configured Dead Letter Queues (DLQ) for non-workflow Lambdas (such as ScheduleSF) to capture Lambda failures for further processing.

    These DLQs are set up with the following configuration:

    receive_wait_time_seconds  = 20
    message_retention_seconds  = 1209600
    visibility_timeout_seconds = 60

    Default Lambda Configuration

    The following built-in Cumulus Lambdas are set up with DLQs to allow handling of process failures:

    • dbIndexer (Updates Elasticsearch)
    • JobsLambda (writes logs outputs to Elasticsearch)
    • ScheduleSF (the SF Scheduler Lambda that places messages on the queue that is used to start workflows, see Workflow Triggers)
    • publishReports (Lambda that publishes messages to the SNS topics for execution, granule and PDR reporting)
    • reportGranules, reportExecutions, reportPdrs (Lambdas responsible for updating records based on messages in the queues published by publishReports)

    Troubleshooting/Utilizing messages in a Dead Letter Queue

    Ideally an automated process should be configured to poll the queue and process messages off a dead letter queue.

    For aid in manually troubleshooting, you can use the SQS Management console to view messages available in the queues set up for a particular stack. The dead letter queues will have a Message Body containing the Lambda payload, as well as Message Attributes that reference both the error returned and a RequestID, which can be cross-referenced with the associated Lambda's CloudWatch logs for more information:

    Screenshot of the AWS SQS console showing how to view SQS message attributes
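
    A minimal sketch of the same inspection done programmatically (assuming boto3 is configured; QUEUE_URL is a placeholder for the dead letter queue you are troubleshooting):

    import boto3

    sqs = boto3.client("sqs")
    QUEUE_URL = "https://sqs.us-east-1.amazonaws.com/123456789012/my-dlq"  # placeholder

    response = sqs.receive_message(
        QueueUrl=QUEUE_URL,
        MaxNumberOfMessages=5,
        AttributeNames=["All"],
        MessageAttributeNames=["All"],   # includes the error and RequestID attributes
        VisibilityTimeout=0,             # peek without hiding messages for long
    )

    for message in response.get("Messages", []):
        print("Body:", message["Body"])
        print("Attributes:", message.get("MessageAttributes", {}))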

    - + \ No newline at end of file diff --git a/docs/v13.0.0/features/distribution-metrics/index.html b/docs/v13.0.0/features/distribution-metrics/index.html index 665f7767dc9..c49d6db41a9 100644 --- a/docs/v13.0.0/features/distribution-metrics/index.html +++ b/docs/v13.0.0/features/distribution-metrics/index.html @@ -5,13 +5,13 @@ Cumulus Distribution Metrics | Cumulus Documentation - +
    Version: v13.0.0

    Cumulus Distribution Metrics

    It is possible to configure Cumulus and the Cumulus Dashboard to display information about the successes and failures of requests for data. This requires the Cumulus instance to deliver Cloudwatch Logs and S3 Server Access logs to an ELK stack.

    ESDIS Metrics in NGAP

    Work with the ESDIS metrics team to set up permissions and access to forward Cloudwatch Logs to a shared AWS:Logs:Destination, as well as to transfer your S3 Server Access logs to a metrics team bucket.

    The metrics team has taken care of setting up logstash to ingest the files that get delivered to their bucket into their Elasticsearch instance.

    Once Cumulus has been configured to deliver Cloudwatch logs to the ESDIS Metrics team, you can use the Elasticsearch indexes to create the necessary target patterns on the dashboard. These are often <daac>-cloudwatch-cumulus-<env>-* and <daac>-distribution-<env>-*, but they will depend on your specific Elasticsearch setup.

    Cumulus / ESDIS Metrics distribution system

    Architecture diagram showing how logs are replicated from a Cumulus instance to the ESDIS Metrics account and accessed by the Cumulus dashboard

    - + \ No newline at end of file diff --git a/docs/v13.0.0/features/execution_payload_retention/index.html b/docs/v13.0.0/features/execution_payload_retention/index.html index e3efeba4e73..31863aa8d47 100644 --- a/docs/v13.0.0/features/execution_payload_retention/index.html +++ b/docs/v13.0.0/features/execution_payload_retention/index.html @@ -5,13 +5,13 @@ Execution Payload Retention | Cumulus Documentation - +
    Version: v13.0.0

    Execution Payload Retention

    In addition to CloudWatch logs and AWS Step Function API records, Cumulus automatically stores the initial and 'final' (the last update to the execution record) payload values as part of the Execution record in your RDS database and Elasticsearch.

    This allows access via the API (or optionally direct DB/Elasticsearch querying) for debugging/reporting purposes. The data is stored in the "originalPayload" and "finalPayload" fields.

    Payload record cleanup

    To reduce storage requirements, a CloudWatch rule ({stack-name}-dailyExecutionPayloadCleanupRule) has been added that triggers a daily run of the provided cleanExecutions lambda. This lambda will remove all 'completed' and 'non-completed' payload records in the database that are older than the configured thresholds.

    Configuration

    The following configuration flags have been made available in the cumulus module. They may be overridden in your deployment's instance of the cumulus module by adding the following configuration options:

    daily_execution_payload_cleanup_schedule_expression (string)

    This configuration option sets the execution times for this Lambda to run, using a Cloudwatch cron expression.

    Default value is "cron(0 4 * * ? *)".

    complete_execution_payload_timeout_disable (bool)

    This configuration option, when set to true, will disable all cleanup of completed execution payloads.

    Default value is false.

    complete_execution_payload_timeout (number)

    This flag defines the cleanup threshold for executions with a 'completed' status in days. Records with updatedAt values older than this with payload information will have that information removed.

    Default value is 10.

    non_complete_execution_payload_timeout_disable (bool)

    This configuration option, when set to true, will disable all cleanup of "non-complete" (any status other than completed) execution payloads.

    Default value is false.

    non_complete_execution_payload_timeout (number)

    This flag defines the cleanup threshold for executions with a status other than 'complete' in days. Records with updateTime values older than this with payload information will have that information removed.

    Default value is 30 days.

    • complete_execution_payload_disable/non_complete_execution_payload_disable

    These flags (true/false) determine if the cleanup script's logic for 'complete' and 'non-complete' executions will run. Default value is false for both.

    - + \ No newline at end of file diff --git a/docs/v13.0.0/features/logging-esdis-metrics/index.html b/docs/v13.0.0/features/logging-esdis-metrics/index.html index c525d7f7139..c7e17219215 100644 --- a/docs/v13.0.0/features/logging-esdis-metrics/index.html +++ b/docs/v13.0.0/features/logging-esdis-metrics/index.html @@ -5,13 +5,13 @@ Writing logs for ESDIS Metrics | Cumulus Documentation - +
    Version: v13.0.0

    Writing logs for ESDIS Metrics

    Note: This feature is only available for Cumulus deployments in NGAP environments.

    Prerequisite: You must configure your Cumulus deployment to deliver your logs to the correct shared logs destination for ESDIS metrics.

    Log messages delivered to the ESDIS metrics logs destination conforming to an expected format will be automatically ingested and parsed to enable helpful searching/filtering of your logs via the ESDIS metrics Kibana dashboard.

    Expected log format

    The ESDIS metrics pipeline expects a log message to be a JSON string representation of an object (dict in Python or map in Java). An example log message might look like:

    {
    "level": "info",
    "executions": "arn:aws:states:us-east-1:000000000000:execution:MySfn:abcd1234",
    "granules": "[\"granule-1\",\"granule-2\"]",
    "message": "hello world",
    "sender": "greetingFunction",
    "stackName": "myCumulus",
    "timestamp": "2018-10-19T19:12:47.501Z"
    }

    A log message can contain the following properties:

    • executions: The AWS Step Function execution name in which this task is executing, if any
    • granules: A JSON string of the array of granule IDs being processed by this code, if any
    • level: A string identifier for the type of message being logged. Possible values:
      • debug
      • error
      • fatal
      • info
      • warn
      • trace
    • message: String containing your actual log message
    • parentArn: The parent AWS Step Function execution ARN that triggered the current execution, if any
    • sender: The name of the resource generating the log message (e.g. a library name, a Lambda function name, an ECS activity name)
    • stackName: The unique prefix for your Cumulus deployment
    • timestamp: An ISO-8601 formatted timestamp
    • version: The version of the resource generating the log message, if any

    None of these properties are explicitly required for ESDIS metrics to parse your log correctly. However, a log without a message has no informational content. And having level, sender, and timestamp properties is very useful for filtering your logs. Including a stackName in your logs is helpful as it allows you to distinguish between logs generated by different deployments.

    Using Cumulus Message Adapter libraries

    If you are writing a custom task that is integrated with the Cumulus Message Adapter, then some of the language-specific client libraries can be used to write logs compatible with ESDIS metrics.

    The usage of each library differs slightly, but in general a logger is initialized with a Cumulus workflow message to determine the contextual information for the task (e.g. granules, executions). Then, after the logger is initialized, writing logs only requires specifying a message, but the logged output will include the contextual information as well.

    Writing logs using custom code

    Any code that produces logs matching the expected log format can be processed by ESDIS metrics.
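
    For example, a minimal sketch of a hypothetical Python helper that prints a log line in the expected format (the sender and stackName values are placeholders):

    import json
    from datetime import datetime, timezone

    def log_info(message, **extra):
        record = {
            "level": "info",
            "message": message,
            "sender": "myCustomTask",   # placeholder resource name
            "stackName": "myCumulus",   # placeholder deployment prefix
            "timestamp": datetime.now(timezone.utc).isoformat(),
            **extra,
        }
        # Lambda/ECS stdout is captured by CloudWatch, so a printed JSON string is
        # all that is needed once log forwarding is configured
        print(json.dumps(record))

    log_info("hello world", granules=json.dumps(["granule-1", "granule-2"]))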

    Node.js

    Cumulus core provides a @cumulus/logger library that writes logs in the expected format for ESDIS metrics.

    - + \ No newline at end of file diff --git a/docs/v13.0.0/features/replay-archived-sqs-messages/index.html b/docs/v13.0.0/features/replay-archived-sqs-messages/index.html index 5e0f199b962..c094a1f4638 100644 --- a/docs/v13.0.0/features/replay-archived-sqs-messages/index.html +++ b/docs/v13.0.0/features/replay-archived-sqs-messages/index.html @@ -5,14 +5,14 @@ How to replay SQS messages archived in S3 | Cumulus Documentation - +
    Version: v13.0.0

    How to replay SQS messages archived in S3

    Context

    Cumulus archives all incoming SQS messages to S3 and removes messages once they have been processed. Unprocessed messages are archived at the path: ${stackName}/archived-incoming-messages/${queueName}/${messageId}

    Replay SQS messages endpoint

    The Cumulus API has added a new endpoint, /replays/sqs. This endpoint allows you to start a replay operation that requeues all archived SQS messages for a given queueName and returns an AsyncOperationId for operation status tracking.

    Start replaying archived SQS messages

    In order to start a replay, you must perform a POST request to the replays/sqs endpoint.

    The required and optional fields that should be part of the body of this request are documented below.

    Field     | Type   | Description
    queueName | string | Any valid SQS queue name (not ARN)

    Status tracking

    A successful response from the /replays/sqs endpoint will contain an asyncOperationId field. Use this ID with the /asyncOperations endpoint to track the status.
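
    A minimal sketch of both steps using Python's requests library; the API URL, token, and queue name are placeholders:

    import requests

    API_URL = "https://example.com"                      # placeholder Cumulus API URL
    HEADERS = {"Authorization": "Bearer ReplaceWithTheToken"}

    # Start a replay for the archived messages of one queue
    start = requests.post(
        f"{API_URL}/replays/sqs",
        headers=HEADERS,
        json={"queueName": "my-queue-name"},
    )
    start.raise_for_status()
    operation_id = start.json()["asyncOperationId"]

    # Track the status of the async operation
    status = requests.get(f"{API_URL}/asyncOperations/{operation_id}", headers=HEADERS)
    print(status.json())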

    - + \ No newline at end of file diff --git a/docs/v13.0.0/features/replay-kinesis-messages/index.html b/docs/v13.0.0/features/replay-kinesis-messages/index.html index d3bc14e1ed6..3596f465df4 100644 --- a/docs/v13.0.0/features/replay-kinesis-messages/index.html +++ b/docs/v13.0.0/features/replay-kinesis-messages/index.html @@ -5,7 +5,7 @@ How to replay Kinesis messages after an outage | Cumulus Documentation - + @@ -13,7 +13,7 @@
    Version: v13.0.0

    How to replay Kinesis messages after an outage

    After a period of outage, it may be necessary for a Cumulus operator to reprocess or 'replay' messages that arrived on an AWS Kinesis Data Stream but did not trigger an ingest. This document serves as an outline on how to start a replay operation, and how to perform status tracking. Cumulus supports replay of all Kinesis messages on a stream (subject to the normal RetentionPeriod constraints), or all messages within a given time slice delimited by start and end timestamps.

    As Kinesis has no comparable field to e.g. the SQS ReceiveCount on its records, Cumulus cannot tell which messages within a given time slice have never been processed, and cannot guarantee only missed messages will be processed. Users will have to rely on duplicate handling or some other method of identifying messages that should not be processed within the time slice.

    NOTE: This operation flow effectively changes only the trigger mechanism for Kinesis ingest notifications. The existence of valid Kinesis-type rules and all other normal requirements for the triggering of ingest via Kinesis still apply.

    Replays endpoint

    Cumulus has added a new endpoint to its API, /replays. This endpoint will allow you to start replay operations and returns an AsyncOperationId for operation status tracking.

    Start a replay

    In order to start a replay, you must perform a POST request to the replays endpoint.

    The required and optional fields that should be part of the body of this request are documented below.

    NOTE: As the endTimestamp relies on a comparison with the Kinesis server-side ApproximateArrivalTimestamp, and given that there is no documented level of accuracy for the approximation, it is recommended that the endTimestamp include some amount of buffer to allow for slight discrepancies. If tolerable, the same is recommended for the startTimestamp, although it is used differently and is less vulnerable to discrepancies, since a server-side arrival timestamp should never be earlier than the client-side request timestamp.

    Field                          | Type   | Required         | Description
    type                           | string | required         | Currently only accepts kinesis.
    kinesisStream                  | string | for type kinesis | Any valid kinesis stream name (not ARN)
    kinesisStreamCreationTimestamp | *      | optional         | Any input valid for a JS Date constructor. For reasons to use this field see AWS documentation on StreamCreationTimestamp.
    endTimestamp                   | *      | optional         | Any input valid for a JS Date constructor. Messages newer than this timestamp will be skipped.
    startTimestamp                 | *      | optional         | Any input valid for a JS Date constructor. Messages will be fetched from the Kinesis stream starting at this timestamp. Ignored if it is further in the past than the stream's retention period.

    Status tracking

    A successful response from the /replays endpoint will contain an asyncOperationId field. Use this ID with the /asyncOperations endpoint to track the status.
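
    A minimal sketch of a time-sliced Kinesis replay request using Python's requests library; the stream name, timestamps, API URL, and token are placeholders, with the timestamps padded per the note above:

    import requests

    payload = {
        "type": "kinesis",
        "kinesisStream": "my-ingest-stream",      # placeholder stream name
        # Pad the window slightly to allow for ApproximateArrivalTimestamp discrepancies
        "startTimestamp": "2024-01-01T00:00:00Z",
        "endTimestamp": "2024-01-02T00:30:00Z",
    }

    resp = requests.post(
        "https://example.com/replays",            # placeholder Cumulus API URL
        headers={"Authorization": "Bearer ReplaceWithTheToken"},
        json=payload,
    )
    resp.raise_for_status()
    print(resp.json()["asyncOperationId"])        # use with /asyncOperations to track status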

    - + \ No newline at end of file diff --git a/docs/v13.0.0/features/reports/index.html b/docs/v13.0.0/features/reports/index.html index 5290aa702ef..6fdfa2f6a0a 100644 --- a/docs/v13.0.0/features/reports/index.html +++ b/docs/v13.0.0/features/reports/index.html @@ -5,7 +5,7 @@ Reconciliation Reports | Cumulus Documentation - + @@ -19,7 +19,7 @@ report generation. The data buckets will include any buckets in your Cumulus buckets configuration that have type public, protected or private.
    - + \ No newline at end of file diff --git a/docs/v13.0.0/getting-started/index.html b/docs/v13.0.0/getting-started/index.html index 0982ce7216c..e92918ea382 100644 --- a/docs/v13.0.0/getting-started/index.html +++ b/docs/v13.0.0/getting-started/index.html @@ -5,13 +5,13 @@ Getting Started | Cumulus Documentation - +
    Version: v13.0.0

    Getting Started

    Overview | Quick Tutorials | Helpful Tips

    Overview

    This serves as a guide for new Cumulus users to deploy and learn how to use Cumulus. Here you will learn what you need in order to complete any prerequisites, what Cumulus is and how it works, and how to successfully navigate and deploy a Cumulus environment.

    What is Cumulus

    Cumulus is an open source set of components for creating cloud-based data ingest, archive, distribution, and management systems, designed for NASA's future Earth Science data streams.

    Who uses Cumulus

    Data integrators/developers and operators across projects not limited to NASA use Cumulus for their daily work functions.

    Cumulus Roles

    Integrator/Developer

    Cumulus integrators/developers are those who work within Cumulus and AWS for deployments and to manage workflows.

    Operator

    Cumulus operators are those who work within Cumulus to ingest/archive data and manage collections.

    Role Guides

    As a developer, integrator, or operator, you will need to set up your environments to work in Cumulus. The following docs can get you started in your role specific activities.

    What is a Cumulus Data Type

    In Cumulus, we have the following types of data that you can create and manage:

    • Collections
    • Granules
    • Providers
    • Rules
    • Workflows
    • Executions
    • Reports

    For details on how to create or manage data types go to Data Management Types.


    Quick Tutorials

    Deployment & Configuration

    Cumulus is deployed to an AWS account, so you must have access to deploy resources to an AWS account to get started.

    1. Deploy Cumulus and Cumulus Dashboard to AWS

    Follow the deployment instructions to deploy Cumulus to your AWS account.

    2. Configure and Run the HelloWorld Workflow

    If you have deployed using the cumulus-template-deploy repository, you have a HelloWorld workflow deployed to your Cumulus backend.

    You can see your deployed workflows on the Workflows page of your Cumulus dashboard.

    Configure a collection and provider using the setup guidance on the Cumulus dashboard.

    Then create a rule to trigger your HelloWorld workflow. You can select a rule type of one time.

    Navigate to the Executions page of the dashboard to check the status of your workflow execution.

    3. Configure a Custom Workflow

    See Developing a custom workflow documentation for adding a new workflow to your deployment.

    There are plenty of workflow examples using Cumulus tasks here. The Data Cookbooks provide a more in-depth look at some of these more advanced workflows and their configurations.

    There is a list of Cumulus tasks already included in your deployment here.

    After configuring your workflow and redeploying, you can configure and run your workflow using the same steps as in step 2.


    Helpful Tips

    Here are some useful tips to keep in mind when deploying or working in Cumulus.

    Integrator/Developer

    • Versioning and Releases: This documentation gives information on our global versioning approach. We suggest upgrading to the supported version for Cumulus, Cumulus dashboard, and Thin Egress App (TEA).
    • Cumulus Developer Documentation: We suggest that you read through and reference this resource for development best practices in Cumulus.
    • Cumulus Deployment: We will guide you on how to manually deploy a new instance of Cumulus. In this reference, you will learn how to install Terraform, create an AWS S3 bucket, configure a compatible database, and create a Lambda layer.
    • Terraform Best Practices: This will help guide you through your Terraform configuration and Cumulus deployment. For an introduction about Terraform go here.
    • Integrator Common Use Cases: Scenarios to help integrators along in the Cumulus environment.

    Operator

    Troubleshooting

    Troubleshooting: Some suggestions to help you troubleshoot and solve issues you may encounter.

    Resources

    - + \ No newline at end of file diff --git a/docs/v13.0.0/glossary/index.html b/docs/v13.0.0/glossary/index.html index 0440bb1afb3..40d8fa8b7d0 100644 --- a/docs/v13.0.0/glossary/index.html +++ b/docs/v13.0.0/glossary/index.html @@ -5,13 +5,13 @@ Glossary | Cumulus Documentation - +
    Version: v13.0.0

    Glossary

    AWS Glossary

    For terms/items from Amazon/AWS not mentioned in this glossary, please refer to the AWS Glossary.

    Cumulus Glossary of Terms

    API Gateway

    Refers to AWS's API Gateway. Used by the Cumulus API.

    ARN

    Refers to an AWS "Amazon Resource Name".

    For more info, see the AWS documentation.

    AWS

    See: aws.amazon.com

    AWS Lambda/Lambda Function

    AWS's 'serverless' option. Allows the running of code without provisioning a service or managing server/ECS instances/etc.

    For more information, see the AWS Lambda documentation.

    AWS Access Keys

    Access credentials that give you access to AWS to act as an IAM user programmatically or from the command line.

    For more information, see the AWS IAM Documentation.

    Bucket

    An Amazon S3 cloud storage resource.

    For more information, see the AWS Bucket Documentation.

    CloudFormation

    An AWS service that allows you to define and manage cloud resources as a preconfigured block.

    For more information, see the AWS CloudFormation User Guide.

    Cloudformation Template

    A template that defines an AWS CloudFormation stack.

    For more information, see the AWS intro page.

    Cloudwatch

    AWS service that allows logging and metrics collections on various cloud resources you have in AWS.

    For more information, see the AWS User Guide.

    Cloud Notification Mechanism (CNM)

    An interface mechanism to support cloud-based ingest messaging. For more information, see PO.DAAC's CNM Schema.

    Common Metadata Repository (CMR)

    "A high-performance, high-quality, continuously evolving metadata system that catalogs Earth Science data and associated service metadata records". For more information, see NASA's CMR page.

    Collection (Cumulus)

    Cumulus Collections are logical sets of data objects of the same data type and version.

    For more information, see cookbook reference page.

    Cumulus Message Adapter (CMA)

    A library designed to help task developers integrate step function tasks into a Cumulus workflow by adapting task input/output into the Cumulus Message format.

    For more information, see CMA workflow reference page.

    Distributed Active Archive Center (DAAC)

    Refers to a specific organization that's part of NASA's distributed system of archive centers. For more information see EOSDIS's DAAC page

    Dead Letter Queue (DLQ)

    This refers to Amazon SQS Dead-Letter Queues - these SQS queues are specifically configured to capture failed messages from other services/SQS queues/etc to allow for processing of failed messages.

    For more on DLQs, see the Amazon Documentation and the Cumulus DLQ feature page.

    Developer

    Those who setup deployment and workflow management for Cumulus. Sometimes referred to as an integrator. See integrator.

    ECS

    Amazon's Elastic Container Service. Used in Cumulus by workflow steps that require more flexibility than Lambda can provide.

    For more information, see AWS's developer guide.

    ECS Activity

    An ECS instance run via a Step Function.

    Execution (Cumulus)

    A Cumulus execution refers to a single execution of a (Cumulus) Workflow.

    GIBS

    Global Imagery Browse Services

    Granule

    A granule is the smallest aggregation of data that can be independently managed (described, inventoried, and retrieved). Granules are always associated with a collection, which is a grouping of granules. A granule is a grouping of data files.

    IAM

    AWS Identity and Access Management.

    For more information, see AWS IAMs.

    Integrator/Developer

    Those who work within Cumulus and AWS for deployments and to manage workflows.

    Kinesis

    Amazon's platform for streaming data on AWS.

    See AWS Kinesis for more information.

    Lambda

    AWS's cloud service that lets you run code without provisioning or managing servers.

    For more information, see AWS's lambda page.

    Module (Terraform)

    Refers to a terraform module.

    Node

    See node.js.

    Npm

    Node package manager.

    For more information, see npmjs.com.

    Operator

    Those who work within Cumulus to ingest/archive data and manage collections.

    PDR

    "Polling Delivery Mechanism" used in "DAAC Ingest" workflows.

    For more information, see nasa.gov.

    Packages (NPM)

    NPM hosted node.js packages. Cumulus packages can be found on NPM's site here

    Provider

    Data source that generates and/or distributes data for Cumulus workflows to act upon.

    For more information, see the Cumulus documentation.

    Rule

    Rules are configurable scheduled events that trigger workflows based on various criteria.

    For more information, see the Cumulus Rules documentation.

    S3

    Amazon's Simple Storage Service provides data object storage in the cloud. Used in Cumulus to store configuration, data and more.

    For more information, see AWS's s3 page.

    SIPS

    Science Investigator-led Processing Systems. In the context of DAAC ingest, this refers to data producers/providers.

    For more information, see nasa.gov.

    SNS

    Amazon's Simple Notification Service provides a messaging service that allows publication of and subscription to events. Used in Cumulus to trigger workflow events, track event failures, and others.

    For more information, see AWS's SNS page.

    SQS

    Amazon's Simple Queue Service.

    For more information, see AWS's SQS page.

    Stack

    A collection of AWS resources you can manage as a single unit.

    In the context of Cumulus, this refers to a deployment of the cumulus and data-persistence modules that is managed by Terraform.

    Step Function

    AWS's web service that allows you to compose complex workflows as a state machine comprised of tasks (Lambdas, activities hosted on EC2/ECS, some AWS service APIs, etc). See AWS's Step Function Documentation for more information. In the context of Cumulus these are the underlying AWS service used to create Workflows.

    Terraform

    Terraform is the tool that you will use for deployment and configuration of your Cumulus environment.

    Workflows

    Workflows are comprised of one or more AWS Lambda Functions and ECS Activities to discover, ingest, process, manage and archive data.

    - + \ No newline at end of file diff --git a/docs/v13.0.0/index.html b/docs/v13.0.0/index.html index 4a1cd1719d9..d4113506cbd 100644 --- a/docs/v13.0.0/index.html +++ b/docs/v13.0.0/index.html @@ -5,13 +5,13 @@ Introduction | Cumulus Documentation - +
    Version: v13.0.0

    Introduction

    This Cumulus project seeks to address the existing need for a “native” cloud-based data ingest, archive, distribution, and management system that can be used for all future Earth Observing System Data and Information System (EOSDIS) data streams via the development and implementation of Cumulus. The term “native” implies that the system will leverage all components of a cloud infrastructure provided by the vendor for efficiency (in terms of both processing time and cost). Additionally, Cumulus will operate on future data streams involving satellite missions, aircraft missions, and field campaigns.

    This documentation includes guidelines, examples, and source code docs. It is accessible at https://nasa.github.io/cumulus.


    Get To Know Cumulus

    • Getting Started - here - If you are new to Cumulus we suggest that you begin with this section to help you understand and work in the environment.
    • General Cumulus Documentation - here <- you're here

    Cumulus Reference Docs

    • Cumulus API Documentation - here
    • Cumulus Developer Documentation - here - READMEs throughout the main repository.
    • Data Cookbooks - here

    Auxiliary Guides

    • Integrator Guide - here
    • Operator Docs - here

    Contributing

    Please refer to: https://github.com/nasa/cumulus/blob/master/CONTRIBUTING.md for information. We thank you in advance.

    - + \ No newline at end of file diff --git a/docs/v13.0.0/integrator-guide/about-int-guide/index.html b/docs/v13.0.0/integrator-guide/about-int-guide/index.html index 9e6b9ffbc29..cd4644398a7 100644 --- a/docs/v13.0.0/integrator-guide/about-int-guide/index.html +++ b/docs/v13.0.0/integrator-guide/about-int-guide/index.html @@ -5,13 +5,13 @@ About Integrator Guide | Cumulus Documentation - +
    Version: v13.0.0

    About Integrator Guide

    Purpose

    The Integrator Guide is to help supplement the Cumulus documentation and Data Cookbooks. This content is for Cumulus integrators who are either new to the project or need a step-by-step resource to help them along.

    What Is A Cumulus Integrator

    Cumulus integrators are those who work within Cumulus and AWS for deployments and to manage workflows. They may perform the following functions:

    • Configure and deploy Cumulus to the AWS environment
    • Configure Cumulus workflows
    • Write custom workflow tasks
    - + \ No newline at end of file diff --git a/docs/v13.0.0/integrator-guide/int-common-use-cases/index.html b/docs/v13.0.0/integrator-guide/int-common-use-cases/index.html index e93cb21bb7a..c23c596393c 100644 --- a/docs/v13.0.0/integrator-guide/int-common-use-cases/index.html +++ b/docs/v13.0.0/integrator-guide/int-common-use-cases/index.html @@ -5,13 +5,13 @@ Integrator Common Use Cases | Cumulus Documentation - + - + \ No newline at end of file diff --git a/docs/v13.0.0/integrator-guide/workflow-add-new-lambda/index.html b/docs/v13.0.0/integrator-guide/workflow-add-new-lambda/index.html index 114cfca1d47..3bdc0305d65 100644 --- a/docs/v13.0.0/integrator-guide/workflow-add-new-lambda/index.html +++ b/docs/v13.0.0/integrator-guide/workflow-add-new-lambda/index.html @@ -5,13 +5,13 @@ Workflow - Add New Lambda | Cumulus Documentation - +
    Version: v13.0.0

    Workflow - Add New Lambda

    You can develop a workflow task in AWS Lambda or Elastic Container Service (ECS). AWS ECS requires Docker. For a list of tasks to use go to our Cumulus Tasks page.

    The following steps will help you write a new Lambda that integrates with a Cumulus workflow, and will aid your understanding of the Cumulus Message Adapter (CMA) process.

    Steps

    1. Define New Lambda in Terraform

    2. Add Task in JSON Object

      For details on how to set up a workflow via CMA go to the CMA Tasks: Message Flow.

      You will need to assign input and output for the new task and follow the CMA contract here. This contract defines how libraries should call the cumulus-message-adapter to integrate a task into an existing Cumulus Workflow.

    3. Verify New Task

      Check the updated workflow in AWS and in Cumulus.

    - + \ No newline at end of file diff --git a/docs/v13.0.0/integrator-guide/workflow-ts-failed-step/index.html b/docs/v13.0.0/integrator-guide/workflow-ts-failed-step/index.html index 9356d5beef8..5f6c155b85a 100644 --- a/docs/v13.0.0/integrator-guide/workflow-ts-failed-step/index.html +++ b/docs/v13.0.0/integrator-guide/workflow-ts-failed-step/index.html @@ -5,13 +5,13 @@ Workflow - Troubleshoot Failed Step(s) | Cumulus Documentation - +
    Version: v13.0.0

    Workflow - Troubleshoot Failed Step(s)

    Steps

    1. Locate Step
    • Go to Cumulus dashboard
    • Find the granule
    • Go to Executions to determine the failed step
    2. Investigate in Cloudwatch
    • Go to Cloudwatch
    • Locate lambda
    • Search Cloudwatch logs
    3. Recreate Error

      In your sandbox environment, try to recreate the error.

    4. Resolution

    - + \ No newline at end of file diff --git a/docs/v13.0.0/interfaces/index.html b/docs/v13.0.0/interfaces/index.html index 72ecb2d4bfb..68dbf0441e9 100644 --- a/docs/v13.0.0/interfaces/index.html +++ b/docs/v13.0.0/interfaces/index.html @@ -5,13 +5,13 @@ Interfaces | Cumulus Documentation - +
    Version: v13.0.0

    Interfaces

    Cumulus has multiple interfaces that allow interaction with discrete components of the system, such as starting workflows via SNS/Kinesis/SQS, manually queueing workflow start messages, submitting SNS notifications for completed workflows, and the many operations allowed by the Cumulus API.

    The diagram below illustrates the workflow process in detail and the various interfaces that allow starting of workflows, reporting of workflow information, and database create operations that occur when a workflow reporting message is processed. For interfaces with expected input or output schemas, details are provided below.

    Architecture diagram showing the interfaces for triggering and reporting of Cumulus workflow executions

    Workflow triggers and queuing

    Kinesis stream

    As a Kinesis stream is consumed by the messageConsumer Lambda to queue workflow executions, the incoming event is validated against this consumer schema by the ajv package.

    SQS queue for executions

    The messages put into the SQS queue for executions should conform to the Cumulus message format.

    Workflow executions

    See the documentation on Cumulus workflows.

    Workflow reporting

    SNS reporting topics

    For granule and PDR reporting, the topics will only receive data if the Cumulus workflow execution message meets the following criteria:

    • Granules - workflow message contains granule data in payload.granules
    • PDRs - workflow message contains PDR data in payload.pdr

    The messages published to the SNS reporting topics for executions and PDRs and the record property in the messages published to the granules SNS topic should conform to the model schema for each data type.

    Further detail on workflow reporting and how to interact with these interfaces can be found in the workflow notifications data cookbook.

    Cumulus API

    See the Cumulus API documentation.

    - + \ No newline at end of file diff --git a/docs/v13.0.0/operator-docs/about-operator-docs/index.html b/docs/v13.0.0/operator-docs/about-operator-docs/index.html index 1a65ba96d91..d08947851bc 100644 --- a/docs/v13.0.0/operator-docs/about-operator-docs/index.html +++ b/docs/v13.0.0/operator-docs/about-operator-docs/index.html @@ -5,13 +5,13 @@ About Operator Docs | Cumulus Documentation - +
    Version: v13.0.0

    About Operator Docs

    Purpose

    Operator Docs are an augmentation to Cumulus documentation and Data Cookbooks. These documents will walk step-by-step through common Cumulus activities (that aren't necessarily as use-case directed as what you'd see in Data Cookbooks).

    What Is A Cumulus Operator

    Cumulus operators are those who work within Cumulus to ingest/archive data and manage collections. They may perform the following functions via the operator dashboard or API:

    • Configure providers and collections
    • Configure rules and monitor workflow executions
    • Monitor granule ingestion
    • Monitor system metrics
    - + \ No newline at end of file diff --git a/docs/v13.0.0/operator-docs/bulk-operations/index.html b/docs/v13.0.0/operator-docs/bulk-operations/index.html index 8218bcb4e7a..1e0a0af515b 100644 --- a/docs/v13.0.0/operator-docs/bulk-operations/index.html +++ b/docs/v13.0.0/operator-docs/bulk-operations/index.html @@ -5,14 +5,14 @@ Bulk Operations | Cumulus Documentation - +
    Version: v13.0.0

    Bulk Operations

    Cumulus implements bulk operations through the use of AsyncOperations, which are long-running processes executed on an AWS ECS cluster.

    Submitting a bulk API request

    Bulk operations are generally submitted via the endpoint for the relevant data type, e.g. granules. For a list of supported API requests, refer to the Cumulus API documentation. Bulk operations are denoted with the keyword 'bulk'.
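
    As a sketch of what a direct API submission might look like (assuming Python's requests library; the API URL, token, Elasticsearch index, workflow name, and query are placeholders, and the exact payload shape should be confirmed against the Cumulus API documentation):

    import requests

    payload = {
        "index": "my-granule-index",     # placeholder Elasticsearch index
        "workflowName": "MyWorkflow",    # placeholder workflow to run against matches
        "query": {
            # Elasticsearch query (e.g. copied from Kibana) nested under "query"
            "query": {"match": {"collectionId": "MOD09GQ___006"}},
        },
    }

    resp = requests.post(
        "https://example.com/granules/bulk",      # placeholder Cumulus API URL
        headers={"Authorization": "Bearer ReplaceWithTheToken"},
        json=payload,
    )
    print(resp.json())   # expected to include the AsyncOperationId for status tracking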

    Starting bulk operations from the Cumulus dashboard

    Using a Kibana query

    Note: You must have configured your dashboard build with a KIBANAROOT environment variable in order for the Kibana link to render in the bulk granules modal

    1. From the Granules dashboard page, click on the "Run Bulk Granules" button, then select what type of action you would like to perform

      • Note: the rest of the process is the same regardless of what type of bulk action you perform
    2. From the bulk granules modal, click the "Open Kibana" link:

      Screenshot of Cumulus dashboard showing modal window for triggering bulk granule operations

    3. Once you have accessed Kibana, navigate to the "Discover" page. If this is your first time using Kibana, you may see a message like this at the top of the page:

      In order to visualize and explore data in Kibana, you'll need to create an index pattern to retrieve data from Elasticsearch.

      In that case, see the docs for creating an index pattern for Kibana

      Screenshot of Kibana user interface showing the "Discover" page for running queries

    4. Enter a query that returns the granule records that you want to use for bulk operations:

      Screenshot of Kibana user interface showing an example Kibana query and results

    5. Once the Kibana query is returning the results you want, click the "Inspect" link near the top of the page. A slide out tab with request details will appear on the right side of the page:

      Screenshot of Kibana user interface showing details of an example request

    6. In the slide out tab that appears on the right side of the page, click the "Request" link near the top and scroll down until you see the query property:

      Screenshot of Kibana user interface showing the Elasticsearch data request made for a given Kibana query

    7. Highlight and copy the query contents from Kibana. Go back to the Cumulus dashboard and paste the query contents from Kibana inside of the query property in the bulk granules request payload. It is expected that you should have a property of query nested inside of the existing query property:

      Screenshot of Cumulus dashboard showing modal window for triggering bulk granule operations with query information populated

    8. Add values for the index and workflowName to the bulk granules request payload. The value for index will vary based on your Elasticsearch setup, but it is good to target an index specifically for granule data if possible:

      Screenshot of Cumulus dashboard showing modal window for triggering bulk granule operations with query, index, and workflow information populated

    9. Click the "Run Bulk Operations" button. You should see a confirmation message, including an ID for the async operation that was started to handle your bulk action. You can track the status of this async operation on the Operations dashboard page, which can be visited by clicking the "Go To Operations" button:

      Screenshot of Cumulus dashboard showing confirmation message with async operation ID for bulk granules request

    Creating an index pattern for Kibana

    1. Define the index pattern for the indices that your Kibana queries should use. A wildcard character, *, will match across multiple indices. Once you are satisfied with your index pattern, click the "Next step" button:

      Screenshot of Kibana user interface for defining an index pattern

    2. Choose whether to use a Time Filter for your data, which is not required. Then click the "Create index pattern" button:

      Screenshot of Kibana user interface for configuring the settings of an index pattern

    Status Tracking

    All bulk operations return an AsyncOperationId which can be submitted to the /asyncOperations endpoint.

    The /asyncOperations endpoint allows listing of AsyncOperation records as well as record retrieval for individual records, which will contain the status. The Cumulus API documentation shows sample requests for these actions.

    The Cumulus Dashboard also includes an Operations monitoring page, where operations and their status are visible:

    Screenshot of Cumulus Dashboard Operations Page showing 5 operations and their status, ID, description, type and creation timestamp

    - + \ No newline at end of file diff --git a/docs/v13.0.0/operator-docs/cmr-operations/index.html b/docs/v13.0.0/operator-docs/cmr-operations/index.html index 83b1d53613a..bd37286d941 100644 --- a/docs/v13.0.0/operator-docs/cmr-operations/index.html +++ b/docs/v13.0.0/operator-docs/cmr-operations/index.html @@ -5,7 +5,7 @@ CMR Operations | Cumulus Documentation - + @@ -16,7 +16,7 @@ UpdateCmrAccessConstraints will update CMR metadata file contents on S3, and PostToCmr will push the updates to CMR. The rest of this section will assume you have created this workflow under the name UpdateCmrAccessConstraints.

    Once created and deployed, the workflow is available in the Cumulus dashboard's Execute workflow selector. However, note that additional configuration is required for this request, to supply an access constraint integer value and optional description to the UpdateCmrAccessConstraints workflow, by clicking the Add Custom Workflow Meta option in the Execute popup, as shown below:

    Screenshot showing granule execute popup with 'updateCmrAccessConstraints' selected and configuration values shown in a collapsible JSON field

    An example invocation of the API to perform this action is:

    $ curl --request PUT https://example.com/granules/MOD11A1.A2017137.h19v16.006.2017138085750 \
    --header 'Authorization: Bearer ReplaceWithTheToken' \
    --header 'Content-Type: application/json' \
    --data '{
      "action": "applyWorkflow",
      "workflow": "updateCmrAccessConstraints",
      "meta": {
        "accessConstraints": {
          "value": 5,
          "description": "sample access constraint"
        }
      }
    }'

    Supported CMR metadata formats for the above operation are Echo10XML and UMMG-JSON, which will populate the RestrictionFlag and RestrictionComment fields in Echo10XML, or the AccessConstraints values in UMMG-JSON.

    Additional Operations

    At this time Cumulus does not, out of the box, support additional operations on CMR metadata. However, given the examples shown above, we recommend working with your integrators to develop additional workflows that perform any required operations.

    Bulk CMR operations

    In order to perform the above operations in bulk, Cumulus supports the use of ApplyWorkflow in an AsyncOperation. These are accessed via the Bulk Operation button on the dashboard, or the /granules/bulk endpoint on the Cumulus API.

    More information on bulk operations is available in the bulk operations operator doc.

    - + \ No newline at end of file diff --git a/docs/v13.0.0/operator-docs/create-rule-in-cumulus/index.html b/docs/v13.0.0/operator-docs/create-rule-in-cumulus/index.html index 95b269dc97a..3db1dc0b890 100644 --- a/docs/v13.0.0/operator-docs/create-rule-in-cumulus/index.html +++ b/docs/v13.0.0/operator-docs/create-rule-in-cumulus/index.html @@ -5,13 +5,13 @@ Create Rule In Cumulus | Cumulus Documentation - +
    Version: v13.0.0

    Create Rule In Cumulus

    Once the above files are in place and the entries created in CMR and Cumulus, we are ready to begin ingesting data. Depending on the type of ingestion (FTP/Kinesis, etc) the values below will change, but for the most part they are all similar. Rules tell Cumulus how to associate providers and collections, and when/how to start processing a workflow.

    Steps

    1. Go To Rules Page
    • Go to the Cumulus dashboard, click on Rules in the navigation.
    • Click Add Rule.

    Screenshot of Rules page

    2. Complete Form
    • Fill out the template form.

    Screenshot of a Rules template for adding a new rule

    For more details regarding the field definitions and required information go to Data Cookbooks.

    Note: If the state field is left blank, it defaults to false.

    Examples

    • A rule form with completed required fields:

    Screenshot of a completed rule form

    • A successfully added Rule:

    Screenshot of created rule

    - + \ No newline at end of file diff --git a/docs/v13.0.0/operator-docs/discovery-filtering/index.html b/docs/v13.0.0/operator-docs/discovery-filtering/index.html index c6fda0d75bd..be52c8e915b 100644 --- a/docs/v13.0.0/operator-docs/discovery-filtering/index.html +++ b/docs/v13.0.0/operator-docs/discovery-filtering/index.html @@ -5,7 +5,7 @@ Discovery Filtering | Cumulus Documentation - + @@ -24,7 +24,7 @@ directly list the provider_path. If the path contains regular expression components, this may fail.

    It is recommended that operators diagnose any failures by checking error logs and ensuring that permissions on the remote file system allow reading of the default directory and any subdirectories that match the filter.

    Supported protocols

    Currently support for this feature is limited to the following protocols:

    • ftp
    • sftp
    - + \ No newline at end of file diff --git a/docs/v13.0.0/operator-docs/granule-workflows/index.html b/docs/v13.0.0/operator-docs/granule-workflows/index.html index debd110e069..0e149c3cbab 100644 --- a/docs/v13.0.0/operator-docs/granule-workflows/index.html +++ b/docs/v13.0.0/operator-docs/granule-workflows/index.html @@ -5,13 +5,13 @@ Granule Workflows | Cumulus Documentation - +
    Version: v13.0.0

    Granule Workflows

    Failed Granule

    Delete and Ingest

    1. Delete Granule

    Note: Granules published to CMR will need to be removed from CMR via the dashboard prior to deletion

    2. Ingest Granule via Ingest Rule
    • Re-triggering a one-time, Kinesis, SQS, or SNS rule, or a scheduled rule, will re-discover and re-ingest the deleted granule.

    Reingest

    1. Select Failed Granule
    • In the Cumulus dashboard, go to the Collections page.
    • Use search field to find the granule.
    2. Re-ingest Granule
    • Go to the Collections page.
    • Click on Reingest and a modal will pop up for your confirmation.

    Screenshot of the Reingest modal workflow

    Delete and Ingest

    1. Bulk Delete Granules
    • Go to the Granules page.
    • Use the Bulk Delete button to bulk delete selected granules or select via a Kibana query

    Note: You can optionally force deletion from CMR

    2. Ingest Granules via Ingest Rule
    • Re-triggering one-time, Kinesis, SQS, or SNS rules, or scheduled rules, will re-discover and re-ingest the deleted granules.

    Multiple Failed Granules

    1. Select Failed Granules
    • In the Cumulus dashboard, go to the Collections page.
    • Click on Failed Granules.
    • Select multiple granules.

    Screenshot of selected multiple granules

    2. Bulk Re-ingest Granules
    • Click on Reingest and a modal will pop up for your confirmation.

    Screenshot of Bulk Reingest modal workflow

    - + \ No newline at end of file diff --git a/docs/v13.0.0/operator-docs/kinesis-stream-for-ingest/index.html b/docs/v13.0.0/operator-docs/kinesis-stream-for-ingest/index.html index 1f5fcc3f831..15940d3cebe 100644 --- a/docs/v13.0.0/operator-docs/kinesis-stream-for-ingest/index.html +++ b/docs/v13.0.0/operator-docs/kinesis-stream-for-ingest/index.html @@ -5,13 +5,13 @@ Setup Kinesis Stream & CNM Message | Cumulus Documentation - +
    Version: v13.0.0

    Setup Kinesis Stream & CNM Message

    Note: Keep in mind that you should only have to set this up once per ingest stream. Kinesis pricing is based on the shard value and not on the amount of Kinesis usage.

    1. Create a Kinesis Stream

      • In your AWS console, go to the Kinesis service and click Create Data Stream.
      • Assign a name to the stream.
      • Apply a shard value of 1.
      • Click on Create Kinesis Stream.
      • A status page with stream details displays. Once the status is active, the stream is ready to use. Be sure to record the streamName and StreamARN for later use.

      Screenshot of AWS console page for creating a Kinesis stream

    2. Create a Rule

    3. Send a message

      • Send a message that conforms to your schema, using Python or the command line (see the sketch below this list).
      • The streamName and Collection must match the kinesisArn+collection defined in the rule that you have created in Step 2.
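
    A minimal sketch of sending a CNM-style message with boto3 (the stream name and message fields are placeholders; consult the CNM schema and your rule for the values that apply to your setup):

    import json

    import boto3

    kinesis = boto3.client("kinesis")

    cnm_message = {
        "version": "1.0",
        "provider": "MY_PROVIDER",            # placeholder
        "collection": "MY_COLLECTION",        # must match the collection in your rule
        "identifier": "test-message-1",
        "submissionTime": "2024-01-01T00:00:00Z",
        "product": {"name": "GRANULE.A2017025", "files": []},
    }

    kinesis.put_record(
        StreamName="my-ingest-stream",        # must match the stream tied to your rule
        Data=json.dumps(cnm_message).encode(),
        PartitionKey=cnm_message["identifier"],
    )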
    - + \ No newline at end of file diff --git a/docs/v13.0.0/operator-docs/locating-access-logs/index.html b/docs/v13.0.0/operator-docs/locating-access-logs/index.html index 65ae2974aeb..20f9c6ca182 100644 --- a/docs/v13.0.0/operator-docs/locating-access-logs/index.html +++ b/docs/v13.0.0/operator-docs/locating-access-logs/index.html @@ -5,13 +5,13 @@ Locating S3 Access Logs | Cumulus Documentation - +
    Version: v13.0.0

    Locating S3 Access Logs

    When enabling S3 Access Logs for EMS Reporting, you configured a TargetBucket and TargetPrefix. Inside the TargetBucket, at the TargetPrefix, is where you will find the raw S3 access logs.

    In a standard deployment, this will be your stack's <internal bucket name> and a key prefix of <stack>/ems-distribution/s3-server-access-logs/
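
    A minimal sketch (assuming boto3 is configured; the bucket and stack names are placeholders for your deployment's internal bucket and stack) of listing the raw access logs under that prefix:

    import boto3

    s3 = boto3.client("s3")

    BUCKET = "my-internal-bucket"                                  # placeholder internal bucket
    PREFIX = "my-stack/ems-distribution/s3-server-access-logs/"    # placeholder stack name

    paginator = s3.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=BUCKET, Prefix=PREFIX):
        for obj in page.get("Contents", []):
            print(obj["Key"], obj["Size"], obj["LastModified"])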

    - + \ No newline at end of file diff --git a/docs/v13.0.0/operator-docs/naming-executions/index.html b/docs/v13.0.0/operator-docs/naming-executions/index.html index bac510d830c..156ab8099ca 100644 --- a/docs/v13.0.0/operator-docs/naming-executions/index.html +++ b/docs/v13.0.0/operator-docs/naming-executions/index.html @@ -5,7 +5,7 @@ Naming Executions | Cumulus Documentation - + @@ -21,7 +21,7 @@ QueuePdrs step.

    In the following excerpt, the QueueGranules config.executionNamePrefix property is set using the value configured in the workflow's meta.executionNamePrefix.

    Please note: This meta.executionNamePrefix property should not be confused with the optional rule executionNamePrefix property from the previous section. Setting executionNamePrefix as a root property of the rule will set a prefix for the names of any workflows triggered by the rule. Setting meta.executionNamePrefix on the rule will set meta.executionNamePrefix in the workflow messages generated for this rule, allowing workflow steps like QueueGranules to read from the message meta.executionNamePrefix for their config. Then, workflows scheduled by QueueGranules would use the configured execution name prefix.

    Setting executionNamePrefix config for QueueGranules using rule.meta

    If you wanted to use a prefix of "my-prefix", you would create a rule with a meta property similar to the following Rule snippet:

    {
    ...other rule keys here...
    "meta":
    {
    "executionNamePrefix": "my-prefix"
    }
    }

    The value of meta.executionNamePrefix from the rule will be set as meta.executionNamePrefix in the workflow message.

    Then, the workflow could contain a "QueueGranules" step with the following state, which uses meta.executionNamePrefix from the message as the value for the executionNamePrefix config to the "QueueGranules" step:

    {
    "QueueGranules": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "queueUrl": "${start_sf_queue_url}",
    "provider": "{$.meta.provider}",
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "granuleIngestWorkflow": "${ingest_granule_workflow_name}",
    "executionNamePrefix": "{$.meta.executionNamePrefix}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${queue_granules_task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "End": true
    },
    }
    - + \ No newline at end of file diff --git a/docs/v13.0.0/operator-docs/ops-common-use-cases/index.html b/docs/v13.0.0/operator-docs/ops-common-use-cases/index.html index 08ff35f3315..02b21a4dbdb 100644 --- a/docs/v13.0.0/operator-docs/ops-common-use-cases/index.html +++ b/docs/v13.0.0/operator-docs/ops-common-use-cases/index.html @@ -5,13 +5,13 @@ Operator Common Use Cases | Cumulus Documentation - + - + \ No newline at end of file diff --git a/docs/v13.0.0/operator-docs/trigger-workflow/index.html b/docs/v13.0.0/operator-docs/trigger-workflow/index.html index 7f6f9cffb61..eddebe4f138 100644 --- a/docs/v13.0.0/operator-docs/trigger-workflow/index.html +++ b/docs/v13.0.0/operator-docs/trigger-workflow/index.html @@ -5,13 +5,13 @@ Trigger a Workflow Execution | Cumulus Documentation - +
    Version: v13.0.0

    Trigger a Workflow Execution

    To trigger a workflow, you need to create a rule. To trigger an ingest workflow, one that requires discovering and ingesting data, you will also need to configure the collection and provider and associate those to a rule.

    Trigger a HelloWorld Workflow

    To trigger a HelloWorld workflow that does not need to discover or archive data, you just need to create a rule.

    You can leave the provider and collection blank and do not need any additional metadata. If you create a onetime rule, the workflow execution will start momentarily and you can view its status on the Executions page.

    Trigger an Ingest Workflow

    To ingest data, you will need a provider and collection configured to tell your workflow where to discover data and where to archive the data respectively.

    Follow the instructions to create a provider and create a collection and configure their fields for your data ingest.

    In the rule's additional metadata, you can specify a provider_path that tells the workflow where to find the data on the provider.

    Example: Ingest data from S3

    Setup

    Assume there are 2 files to be ingested in an S3 bucket called discovery-bucket, located in the test-data folder:

    • GRANULE.A2017025.jpg
    • GRANULE.A2017025.hdf

    Archive buckets should already be created and mapped to public / private / protected in the Cumulus deployment.

    For example:

    buckets = {
    private = {
    name = "discovery-bucket"
    type = "private"
    },
    protected = {
    name = "archive-protected"
    type = "protected"
    }
    public = {
    name = "archive-public"
    type = "public"
    }
    }

    Create a provider

    Create a new provider. Set protocol to S3 and Host to discovery-bucket.

    Screenshot of adding a sample S3 provider

    Create a collection

    Create a new collection. Configure the collection to extract the granule id from the filenames and configure where to store the granule files.

    The configuration below will store hdf files in the protected bucket and jpg files in the public bucket. The bucket types are mapped to the actual bucket names in the deployment's buckets configuration shown above.

    {
    "name": "test-collection",
    "version": "001",
    "granuleId": "^GRANULE\\.A[\\d]{7}$",
    "granuleIdExtraction": "(GRANULE\\..*)(\\.hdf|\\.jpg)",
    "reportToEms": false,
    "sampleFileName": "GRANULE.A2017025.hdf",
    "files": [
    {
    "bucket": "protected",
    "regex": "^GRANULE\\.A[\\d]{7}\\.hdf$",
    "sampleFileName": "GRANULE.A2017025.hdf"
    },
    {
    "bucket": "public",
    "regex": "^GRANULE\\.A[\\d]{7}\\.jpg$",
    "sampleFileName": "GRANULE.A2017025.jpg"
    }
    ]
    }

    Create a rule

    Create a rule to trigger the workflow to discover your granule data and ingest your granule.

    Select the previously created provider and collection. See the Cumulus Discover Granules workflow for a workflow example of using Cumulus tasks to discover and queue data for ingest.

    In the rule meta, set the provider_path to test-data, so the test-data folder will be used to discover new granules.

    Screenshot of adding a Discover Granules rule

    A onetime rule will run your workflow on-demand and you can view it on the dashboard Executions page. The Cumulus Discover Granules workflow will trigger an ingest workflow and your ingested granules will be visible on the dashboard Granules page.

    - + \ No newline at end of file diff --git a/docs/v13.0.0/tasks/index.html b/docs/v13.0.0/tasks/index.html index 63280c59a16..aab35f7105f 100644 --- a/docs/v13.0.0/tasks/index.html +++ b/docs/v13.0.0/tasks/index.html @@ -5,13 +5,13 @@ Cumulus Tasks | Cumulus Documentation - +
    Version: v13.0.0

    Cumulus Tasks

    A list of reusable Cumulus tasks. Add your own.

    NOTE: For a detailed description of each task, visit the task's README.md. Information on the input or output of a task is specified in the task's schemas directory.

    Tasks

    @cumulus/add-missing-file-checksums

    Add checksums to files in S3 which don't have one


    @cumulus/discover-granules

    Discover Granules in FTP/HTTP/HTTPS/SFTP/S3 endpoints


    @cumulus/discover-pdrs

    Discover PDRs in FTP and HTTP endpoints


    @cumulus/files-to-granules

    Converts array-of-files input into a granules object by extracting granuleId from filename


    @cumulus/hello-world

    Example task


    @cumulus/hyrax-metadata-updates

    Update granule metadata with hooks to OPeNDAP URL


    @cumulus/lzards-backup

    Run LZARDS backup


    @cumulus/move-granules

    Move granule files from staging to final location


    @cumulus/parse-pdr

    Download and Parse a given PDR


    @cumulus/pdr-status-check

    Checks execution status of granules in a PDR


    @cumulus/post-to-cmr

    Post a given granule to CMR


    @cumulus/queue-granules

    Add discovered granules to the queue


    @cumulus/queue-pdrs

    Add discovered PDRs to a queue


    @cumulus/queue-workflow

    Add workflow to the queue


    @cumulus/sf-sqs-report

    Sends an incoming Cumulus message to SQS


    @cumulus/sync-granule

    Download a given granule


    @cumulus/test-processing

    Fake processing task used for integration tests


    @cumulus/update-cmr-access-constraints

    Updates CMR metadata to set access constraints


@cumulus/update-granules-cmr-metadata-file-links

Update CMR metadata files with correct online access URLs and etags and transfer etag info to granules' CMR files

    - + \ No newline at end of file diff --git a/docs/v13.0.0/team/index.html b/docs/v13.0.0/team/index.html index a0a34855177..3a7f6f3689c 100644 --- a/docs/v13.0.0/team/index.html +++ b/docs/v13.0.0/team/index.html @@ -5,13 +5,13 @@ Cumulus Team | Cumulus Documentation - +
    Version: v13.0.0

    Cumulus Team

    Cumulus Core Team

    Cumulus Emeritus Team

    - + \ No newline at end of file diff --git a/docs/v13.0.0/troubleshooting/index.html b/docs/v13.0.0/troubleshooting/index.html index ae27051cb01..c2e0942393b 100644 --- a/docs/v13.0.0/troubleshooting/index.html +++ b/docs/v13.0.0/troubleshooting/index.html @@ -5,14 +5,14 @@ How to Troubleshoot and Fix Issues | Cumulus Documentation - +
    Version: v13.0.0

    How to Troubleshoot and Fix Issues

    While Cumulus is a complex system, there is a focus on maintaining the integrity and availability of the system and data. Should you encounter errors or issues while using this system, this section will help troubleshoot and solve those issues.

    Backup and Restore

    Cumulus has backup and restore functionality built-in to protect Cumulus data and allow recovery of a Cumulus stack. This is currently limited to Cumulus data and not full S3 archive data. Backup and restore is not enabled by default and must be enabled and configured to take advantage of this feature.

    For more information, read the Backup and Restore documentation.

    Elasticsearch reindexing

    If you run into issues with your Elasticsearch index, a reindex operation is available via the Cumulus API. See the Reindexing Guide.

    Information on how to reindex Elasticsearch is in the Cumulus API documentation.

    Troubleshooting Workflows

Workflows are state machines composed of tasks and services, and each component logs to CloudWatch. The CloudWatch logs for all steps in an execution are displayed in the Cumulus dashboard, or you can find them by going to CloudWatch and navigating to the log group for that particular task.
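For example, assuming the AWS CLI v2 is installed and your task Lambda logs to a log group named like /aws/lambda/<prefix>-DiscoverGranules (the exact name depends on your deployment prefix and the task), you could tail recent logs from the command line:

aws logs tail /aws/lambda/<prefix>-DiscoverGranules --since 1h --follow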

    Workflow Errors

    Visual representations of executed workflows can be found in the Cumulus dashboard or the AWS Step Functions console for that particular execution.

    If a workflow errors, the error will be handled according to the error handling configuration. The task that fails will have the exception field populated in the output, giving information about the error. Further information can be found in the CloudWatch logs for the task.

    Graph of AWS Step Function execution showing a failing workflow

    Workflow Did Not Start

    Generally, first check your rule configuration. If that is satisfactory, the answer will likely be in the CloudWatch logs for the schedule SF or SF starter lambda functions. See the workflow triggers page for more information on how workflows start.

For Kinesis and SNS rules specifically, if an error occurs during the message consumer process, the fallback consumer lambda will be called, and if the message continues to error, a message will be placed on the dead letter queue. Check the dead letter queue for a failure message. Errors can be traced back to the CloudWatch logs for the message consumer and the fallback consumer. Additionally, check that the collection name and version match those configured in your rule, as rules are filtered by the notification's collection name and version before executions are scheduled.
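A sketch of checking a dead letter queue with the AWS CLI; the queue name below is a placeholder, so substitute the actual DLQ name from your deployment:

# resolve the queue URL, check its depth, then peek at a few messages
DLQ_URL=$(aws sqs get-queue-url --queue-name <prefix>-kinesisFailure --query QueueUrl --output text)
aws sqs get-queue-attributes --queue-url "$DLQ_URL" --attribute-names ApproximateNumberOfMessages
aws sqs receive-message --queue-url "$DLQ_URL" --max-number-of-messages 5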

More information on Kinesis error handling is here.

    Operator API Errors

    All operator API calls are funneled through the ApiEndpoints lambda. Each API call is logged to the ApiEndpoints CloudWatch log for your deployment.

    Lambda Errors

    KMS Exception: AccessDeniedException

    KMS Exception: AccessDeniedExceptionKMS Message: The ciphertext refers to a customer master key that does not exist, does not exist in this region, or you are not allowed to access.

The above error was being thrown by a Cumulus Lambda function invocation. The KMS key is the encryption key used to encrypt Lambda environment variables. The root cause of this error is unknown, but it is speculated to be caused by deleting and recreating, with the same name, the IAM role the Lambda uses.

    This error can be resolved by switching the lambda's execution role to a different one and then back through the Lambda management console. Unfortunately, this approach doesn't scale well.

    The other resolution (that scales but takes some time) that was found is as follows:

    1. Comment out all lambda definitions (and dependent resources) in your Terraform configuration.
    2. terraform apply to delete the lambdas.
    3. Un-comment the definitions.
    4. terraform apply to recreate the lambdas.

If this problem occurs with Core lambdas and you are using the terraform-aws-cumulus.zip file source distributed in our release, we recommend using the non-scaling approach, as the number of lambdas we distribute is in the low teens and they are likely to be easier and faster to reconfigure one-by-one than by editing our configs.
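If you prefer to do that one-by-one reconfiguration from the command line rather than the Lambda console, the same role swap can be sketched with the AWS CLI; the function name and role ARNs below are placeholders:

# temporarily point the function at a different execution role...
aws lambda update-function-configuration --function-name <prefix>-SomeLambda --role <other-role-arn>
# ...then switch it back to the original role
aws lambda update-function-configuration --function-name <prefix>-SomeLambda --role <original-role-arn>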

    Error: Unable to import module 'index': Error

    This error is shown in the CloudWatch logs for a Lambda function.

    One possible cause is that the Lambda definition in the .tf file defining the lambda is not pointing to the correct packaged lambda source file. In order to resolve this issue, update the lambda definition to point directly to the packaged (e.g. .zip) lambda source file.

    resource "aws_lambda_function" "discover_granules_task" {
    function_name = "${var.prefix}-DiscoverGranules"
    filename = "${path.module}/../../tasks/discover-granules/dist/lambda.zip"
    handler = "index.handler"
    }

    If you are seeing this error when using the Lambda as a step in a Cumulus workflow, then inspect the output for this Lambda step in the AWS Step Function console. If you see the error Cannot find module 'node_modules/@cumulus/cumulus-message-adapter-js', then you need to ensure the lambda's packaged dependencies include cumulus-message-adapter-js.
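A rough sketch of making sure the dependency is declared and ends up in the packaged artifact, assuming a Node.js task built from its own directory into a zip file (the directory and paths are hypothetical):

cd my-task                                            # hypothetical task source directory
npm install --save @cumulus/cumulus-message-adapter-js
npm install --production                              # install runtime dependencies only
zip -r dist/lambda.zip index.js node_modules          # package the handler and its node_modules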

    - + \ No newline at end of file diff --git a/docs/v13.0.0/troubleshooting/reindex-elasticsearch/index.html b/docs/v13.0.0/troubleshooting/reindex-elasticsearch/index.html index b0ad411268f..0af104a9d93 100644 --- a/docs/v13.0.0/troubleshooting/reindex-elasticsearch/index.html +++ b/docs/v13.0.0/troubleshooting/reindex-elasticsearch/index.html @@ -5,7 +5,7 @@ Reindexing Elasticsearch Guide | Cumulus Documentation - + @@ -14,7 +14,7 @@ current index, or the mappings for an index have been updated (they do not update automatically). Any reindexing that will be required when upgrading Cumulus will be in the Migration Steps section of the changelog.

    Switch to a new index and Reindex

    There are two operations needed: reindex and change-index to switch over to the new index. A Change Index/Reindex can be done in either order, but both have their trade-offs.

If you decide to point Cumulus to a new (empty) index first (with a change index operation), and then reindex the data to the new index, data ingested while reindexing will automatically be sent to the new index. As reindexing operations can take a while, not all the data will show up on the Cumulus Dashboard right away. The advantage is that you do not have to turn off any ingest operations. This approach is recommended.

If you decide to reindex data to a new index first, and then point Cumulus to that new index, it is not guaranteed that data sent to the old index while reindexing will show up in the new index. If you prefer this order, it is recommended to turn off any ingest operations. This order will keep the data shown on your dashboard from any interruption.

    Change Index

    This will point Cumulus to the index in Elasticsearch that will be used when retrieving data. Performing a change index operation to an index that does not exist yet will create the index for you. The change index operation can be found here.
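A sketch of the change index call, assuming you already have a Cumulus API access token; the URL is a placeholder, and the /elasticsearch/change-index path and body shape should be confirmed against the Cumulus API docs for your version:

curl -X POST https://<api-gateway-url>/elasticsearch/change-index \
  -H "Authorization: Bearer $TOKEN" \
  -H "Content-Type: application/json" \
  -d '{ "currentIndex": "cumulus-2020-11-3", "newIndex": "cumulus-2021-3-4" }'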

    Reindex from the old index to the new index

The reindex operation will take the data from one index and copy it into another index. The reindex operation can be found here.

    Reindex status

    Reindexing is a long-running operation. The reindex-status endpoint can be used to monitor the progress of the operation.

    Index from database

    If you want to just grab the data straight from the database you can perform an Index from Database Operation. After the data is indexed from the database, a Change Index operation will need to be performed to ensure Cumulus is pointing to the right index. It is strongly recommended to turn off workflow rules when performing this operation so any data ingested to the database is not lost.
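Under the same assumptions as the change index sketch above (placeholder URL and token, endpoint path and body shape to be confirmed against the Cumulus API docs), an index from database request might look like:

curl -X POST https://<api-gateway-url>/elasticsearch/index-from-database \
  -H "Authorization: Bearer $TOKEN" \
  -H "Content-Type: application/json" \
  -d '{ "indexName": "cumulus-2021-3-4" }'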

    Validate reindex

    To validate the reindex, use the reindex-status endpoint. The doc count can be used to verify that the reindex was successful. In the below example the reindex from cumulus-2020-11-3 to cumulus-2021-3-4 was not fully successful as they show different doc counts.

    "indices": {
    "cumulus-2020-11-3": {
    "primaries": {
    "docs": {
    "count": 21096512,
    "deleted": 176895
    }
    },
    "total": {
    "docs": {
    "count": 21096512,
    "deleted": 176895
    }
    }
    },
    "cumulus-2021-3-4": {
    "primaries": {
    "docs": {
    "count": 715949,
    "deleted": 140191
    }
    },
    "total": {
    "docs": {
    "count": 715949,
    "deleted": 140191
    }
    }
    }
    }

    To further drill down into what is missing, log in to the Kibana instance (found in the Elasticsearch section of the AWS console) and run the following command replacing <index> with your index name.

GET <index>/_search
{
  "aggs": {
    "count_by_type": {
      "terms": {
        "field": "_type"
      }
    }
  },
  "size": 0
}

    which will produce a result like

    "aggregations": {
    "count_by_type": {
    "doc_count_error_upper_bound": 0,
    "sum_other_doc_count": 0,
    "buckets": [
    {
    "key": "logs",
    "doc_count": 483955
    },
    {
    "key": "execution",
    "doc_count": 4966
    },
    {
    "key": "deletedgranule",
    "doc_count": 4715
    },
    {
    "key": "pdr",
    "doc_count": 1822
    },
    {
    "key": "granule",
    "doc_count": 740
    },
    {
    "key": "asyncOperation",
    "doc_count": 616
    },
    {
    "key": "provider",
    "doc_count": 108
    },
    {
    "key": "collection",
    "doc_count": 87
    },
    {
    "key": "reconciliationReport",
    "doc_count": 48
    },
    {
    "key": "rule",
    "doc_count": 7
    }
    ]
    }
    }

    Resuming a reindex

    If a reindex operation did not fully complete it can be resumed using the following command run from the Kibana instance.

POST _reindex?wait_for_completion=false
{
  "conflicts": "proceed",
  "source": {
    "index": "cumulus-2020-11-3"
  },
  "dest": {
    "index": "cumulus-2021-3-4",
    "op_type": "create"
  }
}

    The Cumulus API reindex-status endpoint can be used to monitor completion of this operation.

    - + \ No newline at end of file diff --git a/docs/v13.0.0/troubleshooting/rerunning-workflow-executions/index.html b/docs/v13.0.0/troubleshooting/rerunning-workflow-executions/index.html index bb737008f4d..bf2ca20f692 100644 --- a/docs/v13.0.0/troubleshooting/rerunning-workflow-executions/index.html +++ b/docs/v13.0.0/troubleshooting/rerunning-workflow-executions/index.html @@ -5,13 +5,13 @@ Re-running workflow executions | Cumulus Documentation - +
    Version: v13.0.0

    Re-running workflow executions

To re-run a Cumulus workflow execution from the AWS console (a command line sketch follows these steps):

    1. Visit the page for an individual workflow execution

    2. Click the "New execution" button at the top right of the screen

      Screenshot of the AWS console for a Step Function execution highlighting the &quot;New execution&quot; button at the top right of the screen

    3. In the "New execution" modal that appears, replace the cumulus_meta.execution_name value in the default input with the value of the new execution ID as seen in the screenshot below

      Screenshot of the AWS console showing the modal window for entering input when running a new Step Function execution

    4. Click the "Start execution" button

    - + \ No newline at end of file diff --git a/docs/v13.0.0/troubleshooting/troubleshooting-deployment/index.html b/docs/v13.0.0/troubleshooting/troubleshooting-deployment/index.html index c11eb518738..e0b67b62ea7 100644 --- a/docs/v13.0.0/troubleshooting/troubleshooting-deployment/index.html +++ b/docs/v13.0.0/troubleshooting/troubleshooting-deployment/index.html @@ -5,7 +5,7 @@ Troubleshooting Deployment | Cumulus Documentation - + @@ -16,7 +16,7 @@ data-persistence modules, but your config is only creating one Elasticsearch instance. To fix the issue, update the elasticsearch_config variable for your data-persistence module to increase the number of instances:

{
  domain_name    = "es"
  instance_count = 2
  instance_type  = "t2.small.elasticsearch"
  version        = "5.3"
  volume_size    = 10
}

    Install dashboard

    Dashboard configuration

    Issues:

• Problem clearing the cache: EACCES: permission denied, rmdir '/tmp/gulp-cache/default'. This probably means the files at that location, and/or the folder, are owned by someone else (or some other factor prevents you from writing there).

It's possible to work around this by editing the file cumulus-dashboard/node_modules/gulp-cache/index.js and altering the value of the line var fileCache = new Cache({cacheDirName: 'gulp-cache'}); to something like var fileCache = new Cache({cacheDirName: '<prefix>-cache'});. gulp-cache will then be able to write to /tmp/<prefix>-cache/default, and the error should resolve.

    Dashboard deployment

    Issues:

• If the dashboard sends you to an Earthdata Login page that has an error reading "Invalid request, please verify the client status or redirect_uri before resubmitting", this means you have either forgotten to update one or more of your EARTHDATA_CLIENT_ID and EARTHDATA_CLIENT_PASSWORD environment variables (from your app/.env file) and re-deploy Cumulus, haven't placed the correct values in them, or forgotten to add both the "redirect" and "token" URLs to the Earthdata Application.
    • There is odd caching behavior associated with the dashboard and Earthdata Login at this point in time that can cause the above error to reappear on the Earthdata Login page loaded by the dashboard even after fixing the cause of the error. If you experience this, attempt to access the dashboard in a new browser window, and it should work.
    - + \ No newline at end of file diff --git a/docs/v13.0.0/upgrade-notes/cumulus_distribution_migration/index.html b/docs/v13.0.0/upgrade-notes/cumulus_distribution_migration/index.html index b10678df46c..3b88f4db6df 100644 --- a/docs/v13.0.0/upgrade-notes/cumulus_distribution_migration/index.html +++ b/docs/v13.0.0/upgrade-notes/cumulus_distribution_migration/index.html @@ -5,14 +5,14 @@ Migrate from TEA deployment to Cumulus Distribution | Cumulus Documentation - +
    Version: v13.0.0

    Migrate from TEA deployment to Cumulus Distribution

    Background

    The Cumulus Distribution API is configured to use the AWS Cognito OAuth client. This API can be used instead of the Thin Egress App, which is the default distribution API if using the Deployment Template.

    Configuring a Cumulus Distribution deployment

    See these instructions for deploying the Cumulus Distribution API.

    Important note if migrating from TEA to Cumulus Distribution

    If you already have a deployment using the TEA distribution and want to switch to Cumulus Distribution, there will be an API Gateway change. This means that there will be downtime while you update your CloudFront endpoint to use the new API gateway.

    - + \ No newline at end of file diff --git a/docs/v13.0.0/upgrade-notes/migrate_tea_standalone/index.html b/docs/v13.0.0/upgrade-notes/migrate_tea_standalone/index.html index 1c374a8801e..21bb30ec7d0 100644 --- a/docs/v13.0.0/upgrade-notes/migrate_tea_standalone/index.html +++ b/docs/v13.0.0/upgrade-notes/migrate_tea_standalone/index.html @@ -5,13 +5,13 @@ Migrate TEA deployment to standalone module | Cumulus Documentation - +
    Version: v13.0.0

    Migrate TEA deployment to standalone module

    Background

    This document is only relevant for upgrades of Cumulus from versions < 3.x.x to versions > 3.x.x

Previous versions of Cumulus included deployment of the Thin Egress App (TEA) by default in the distribution module. As a result, Cumulus users who wanted to deploy a new version of TEA had to wait on a new release of Cumulus that incorporated that release.

In order to give Cumulus users the flexibility to deploy newer versions of TEA whenever they want, deployment of TEA has been removed from the distribution module and Cumulus users must now add the TEA module to their deployment. Guidance on integrating the TEA module into your deployment is provided, or you can refer to the Cumulus core example deployment code for the thin_egress_app module.

    By default, when upgrading Cumulus and moving from TEA deployed via the distribution module to deployed as a separate module, your API gateway for TEA would be destroyed and re-created, which could cause outages for any Cloudfront endpoints pointing at that API gateway.

    These instructions outline how to modify your state to preserve your existing Thin Egress App (TEA) API gateway when upgrading Cumulus and moving deployment of TEA to a standalone module. If you do not care about preserving your API gateway for TEA when upgrading your Cumulus deployment, you can skip these instructions.

    Prerequisites

    Notes about state management

    These instructions will involve manipulating your Terraform state via terraform state mv commands. These operations are extremely dangerous, since a mistake in editing your Terraform state can leave your stack in a corrupted state where deployment may be impossible or may result in unanticipated resource deletion.

    Since bucket versioning preserves a separate version of your state file each time it is written, and the Terraform state modification commands overwrite the state file, we can mitigate the risk of these operations by downloading the most recent state file before starting the upgrade process. Then, if anything goes wrong during the upgrade, we can restore that previous state version. Guidance on how to perform both operations is provided below.

    Download your most recent state version

    Run this command to download the most recent cumulus deployment state file, replacing BUCKET and KEY with the correct values from cumulus-tf/terraform.tf:

     aws s3 cp s3://BUCKET/KEY /path/to/terraform.tfstate
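Because the state bucket is versioned, you can also list and fetch older versions of the state file with the AWS CLI if you later need one; BUCKET, KEY, and VERSION_ID below are placeholders:

# list available versions of the state file
aws s3api list-object-versions --bucket BUCKET --prefix KEY \
  --query 'Versions[].{VersionId: VersionId, LastModified: LastModified}'

# download a specific older version
aws s3api get-object --bucket BUCKET --key KEY --version-id VERSION_ID /path/to/terraform.tfstate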

    Restore a previous state version

    Upload the state file that was previously downloaded to the bucket/key for your state file, replacing BUCKET and KEY with the correct values from cumulus-tf/terraform.tf:

     aws s3 cp /path/to/terraform.tfstate s3://BUCKET/KEY

    Then run terraform plan, which will give an error because we manually overwrote the state file and it is now out of sync with the lock table Terraform uses to track your state file:

    Error: Error loading state: state data in S3 does not have the expected content.

    This may be caused by unusually long delays in S3 processing a previous state
    update. Please wait for a minute or two and try again. If this problem
    persists, and neither S3 nor DynamoDB are experiencing an outage, you may need
    to manually verify the remote state and update the Digest value stored in the
    DynamoDB table to the following value: <some-digest-value>

    To resolve this error, run this command and replace DYNAMO_LOCK_TABLE, BUCKET and KEY with the correct values from cumulus-tf/terraform.tf, and use the digest value from the previous error output:

 aws dynamodb put-item \
  --table-name DYNAMO_LOCK_TABLE \
  --item '{
    "LockID": {"S": "BUCKET/KEY-md5"},
    "Digest": {"S": "some-digest-value"}
  }'

    Now, if you re-run terraform plan, it should work as expected.

    Migration instructions

Please note: These instructions assume that you are deploying the thin_egress_app module as shown in the Cumulus core example deployment code.

    1. Ensure that you have downloaded the latest version of your state file for your cumulus deployment

    2. Find the URL for your <prefix>-thin-egress-app-EgressGateway API gateway. Confirm that you can access it in the browser and that it is functional.

    3. Run terraform plan. You should see output like (edited for readability):

      # module.thin_egress_app.aws_cloudformation_stack.thin_egress_app will be created
      + resource "aws_cloudformation_stack" "thin_egress_app" {

      # module.thin_egress_app.aws_s3_bucket.lambda_source will be created
      + resource "aws_s3_bucket" "lambda_source" {

      # module.thin_egress_app.aws_s3_bucket_object.cloudformation_template will be created
      + resource "aws_s3_bucket_object" "cloudformation_template" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive will be created
      + resource "aws_s3_bucket_object" "lambda_code_dependency_archive" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_source will be created
      + resource "aws_s3_bucket_object" "lambda_source" {

      # module.thin_egress_app.aws_security_group.egress_lambda[0] will be created
      + resource "aws_security_group" "egress_lambda" {

      ...

      # module.cumulus.module.distribution.module.thin_egress_app.aws_cloudformation_stack.thin_egress_app will be destroyed
      - resource "aws_cloudformation_stack" "thin_egress_app" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket.lambda_source will be destroyed
      - resource "aws_s3_bucket" "lambda_source" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.cloudformation_template will be destroyed
      - resource "aws_s3_bucket_object" "cloudformation_template" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive will be destroyed
      - resource "aws_s3_bucket_object" "lambda_code_dependency_archive" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_source will be destroyed
      - resource "aws_s3_bucket_object" "lambda_source" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_security_group.egress_lambda[0] will be destroyed
      - resource "aws_security_group" "egress_lambda" {
    4. Run the state modification commands. The commands must be run in exactly this order:

       # Move security group
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_security_group.egress_lambda module.thin_egress_app.aws_security_group.egress_lambda

      # Move TEA storage bucket
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket.lambda_source module.thin_egress_app.aws_s3_bucket.lambda_source

      # Move TEA lambda source code
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_source module.thin_egress_app.aws_s3_bucket_object.lambda_source

      # Move TEA lambda dependency code
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive

      # Move TEA Cloudformation template
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.cloudformation_template module.thin_egress_app.aws_s3_bucket_object.cloudformation_template

      # Move URS creds secret version
      terraform state mv module.cumulus.module.distribution.aws_secretsmanager_secret_version.thin_egress_urs_creds aws_secretsmanager_secret_version.thin_egress_urs_creds

      # Move URS creds secret
      terraform state mv module.cumulus.module.distribution.aws_secretsmanager_secret.thin_egress_urs_creds aws_secretsmanager_secret.thin_egress_urs_creds

      # Move TEA Cloudformation stack
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_cloudformation_stack.thin_egress_app module.thin_egress_app.aws_cloudformation_stack.thin_egress_app

      Depending on how you were supplying a bucket map to TEA, there may be an additional step. If you were specifying the bucket_map_key variable to the cumulus module to use a custom bucket map, then you can ignore this step and just ensure that the bucket_map_file variable to the TEA module uses that same S3 key. Otherwise, if you were letting Cumulus generate a bucket map for you, then you need to take this step to migrate that bucket map:

      # Move bucket map
      terraform state mv module.cumulus.module.distribution.aws_s3_bucket_object.bucket_map_yaml[0] aws_s3_bucket_object.bucket_map_yaml
    5. Run terraform plan again. You may still see a few additions/modifications pending like below, but you should not see any deletion of Thin Egress App resources pending:

      # module.thin_egress_app.aws_cloudformation_stack.thin_egress_app will be updated in-place
      ~ resource "aws_cloudformation_stack" "thin_egress_app" {

      # module.thin_egress_app.aws_s3_bucket_object.cloudformation_template will be updated in-place
      ~ resource "aws_s3_bucket_object" "cloudformation_template" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive will be updated in-place
      ~ resource "aws_s3_bucket_object" "lambda_code_dependency_archive" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_source will be updated in-place
      ~ resource "aws_s3_bucket_object" "lambda_source" {

      If you still see deletion of module.cumulus.module.distribution.module.thin_egress_app.aws_cloudformation_stack.thin_egress_app pending, then something went wrong and you should restore the previously downloaded state file version and start over from step 1. Otherwise, proceed to step 6.

    6. Once you have confirmed that everything looks as expected, run terraform apply.

    7. Visit the same API gateway from step 1 and confirm that it still works.

    Your TEA deployment has now been migrated to a standalone module, which gives you the ability to upgrade the deployed version of TEA independently of Cumulus releases.

    - + \ No newline at end of file diff --git a/docs/v13.0.0/upgrade-notes/update-cma-2.0.2/index.html b/docs/v13.0.0/upgrade-notes/update-cma-2.0.2/index.html index c463b3867cd..9b24540f291 100644 --- a/docs/v13.0.0/upgrade-notes/update-cma-2.0.2/index.html +++ b/docs/v13.0.0/upgrade-notes/update-cma-2.0.2/index.html @@ -5,13 +5,13 @@ Upgrade to CMA 2.0.2 | Cumulus Documentation - +
    Version: v13.0.0

    Upgrade to CMA 2.0.2

    Updating a Cumulus Deployment to CMA 2.0.2

    Background

The Cumulus Message Adapter has been updated in release 2.0.2 to no longer utilize the AWS Step Functions API to look up the defined name of a step function task for population in meta.workflow_tasks, but instead use an incrementing integer field.

Additionally, a bugfix was released in the form of v2.0.1/v2.0.2 following the initial 2.0.0 release, so all users should update to release 2.0.2.

The update is not tied to a particular version of Core; however, the update should be done across all task components in order to ensure consistent execution records.

    Changes

    Execution Record Update

This update functionally means that Cumulus tasks/activities using the CMA will now write a record that looks like the following in meta.workflow_tasks, and more importantly in the tasks column for an execution record:

    Original

          "DiscoverGranules": {
    "name": "jk-tf-DiscoverGranules",
    "version": "$LATEST",
    "arn": "arn:aws:lambda:us-east-1:xxxxx:function:jk-tf-DiscoverGranules"
    },
    "QueueGranules": {
    "name": "jk-tf-QueueGranules",
    "version": "$LATEST",
    "arn": "arn:aws:lambda:us-east-1:xxxx:function:jk-tf-QueueGranules"
    }

    New

          "0": {
    "name": "jk-tf-DiscoverGranules",
    "version": "$LATEST",
    "arn": "arn:aws:lambda:us-east-1:xxxxx:function:jk-tf-DiscoverGranules"
    },
    "1": {
    "name": "jk-tf-QueueGranules",
    "version": "$LATEST",
    "arn": "arn:aws:lambda:us-east-1:xxxx:function:jk-tf-QueueGranules"
    }

    Actions Required

    The following should be done as part of a Cumulus stack update to utilize cumulus message adapter > 2.0.2:

    • Python tasks that utilize cumulus-message-adapter-python should be updated to use > 2.0.0, their lambdas rebuilt and Cumulus workflows reconfigured to use the updated version.

    • Python activities that utilize cumulus-process-py should be rebuilt using > 1.0.0 with updated dependencies, and have their images deployed/Cumulus configured to use the new version.

• The cumulus-message-adapter v2.0.2 lambda layer should be made available in the deployment account, and the Cumulus deployment should be reconfigured to use it (via the cumulus_message_adapter_lambda_layer_version_arn variable in the cumulus module). This should address all Core Node.js tasks that utilize the CMA, and many contributed Node.js/Java components.

    Once the above have been done, redeploy Cumulus to apply the configuration and the updates should be live.

    - + \ No newline at end of file diff --git a/docs/v13.0.0/upgrade-notes/update-task-file-schemas/index.html b/docs/v13.0.0/upgrade-notes/update-task-file-schemas/index.html index 1c7d32dfa91..1bb7d170f19 100644 --- a/docs/v13.0.0/upgrade-notes/update-task-file-schemas/index.html +++ b/docs/v13.0.0/upgrade-notes/update-task-file-schemas/index.html @@ -5,13 +5,13 @@ Updates to task granule file schemas | Cumulus Documentation - +
    Version: v13.0.0

    Updates to task granule file schemas

    Background

    Most Cumulus workflow tasks expect as input a payload of granule(s) which contain the files for each granule. Most tasks also return this same granule structure as output.

    However, up to this point, there was inconsistency in the schemas for the granule files objects expected by each task. Furthermore, there was no guarantee of consistency between granule files objects as stored in the database and the expectations of any given workflow task.

    Thus, when performing bulk granule operations which pass granules from the database into a Cumulus workflow, it was possible for there to be schema validation failures depending on which task was used to start the workflow and its particular schema.

    In order to rectify this situation, CUMULUS-2388 was filed and addressed to create a common granule files schema between nearly all of the Cumulus tasks (exceptions discussed below) and the Cumulus database. The following documentation explains the manual changes you need to make to your deployment in order to be compatible with the updated files schema.

    Updated files schema

    The updated granule files schema can be found here.

    These former properties were deprecated (with notes about how to derive the same information from the updated schema, if possible):

    • filename - concatenate the bucket and key values with a directory separator (/)
    • name - use fileName property
    • etag - ETags are no longer provided as an individual file property. Instead, a separate etags object mapping S3 URIs to ETag values is provided as output from the following workflow tasks (guidance on how to integrate this output with your workflows is provided in the Upgrading your workflows section below):
      • update-granules-cmr-metadata-file-links
      • hyrax-metadata-updates
    • fileStagingDir - no longer supported
    • url_path - no longer supported
    • duplicate_found - This property is no longer supported, however sync-granule and move-granules now produce a separate granuleDuplicates object as part of their output. The granuleDuplicates object is a map of granules by granule ID which includes the files that encountered duplicates during processing. Guidance on how to integrate granuleDuplicates information into your workflow configuration is provided below.

    Exceptions

    These workflow tasks did not have their schema for granule files updated:

    • discover-granules - no updates
    • queue-granules - no updates
    • parse-pdr - no updates
    • sync-granule - input schema not updated, output schema was updated

    The reason that these task schemas were not updated is that all of these tasks start before the files have been ingested to S3, thus much of the information that is required in the updated files schema like bucket, key, or checksum is not yet known.

    Bulk granule operations

    Since the input schema for the above tasks was not updated, that means you cannot run bulk granule operations against workflows if they start with any of those tasks. Bulk granule operations work by loading the specified granules from the database and sending them as input to a specified workflow, so if the specified workflow begins with a task whose input schema does not conform to what is coming out of the database, there will be schema errors.

    Upgrading your deployment

    Upgrading your workflows

    For any workflows using the update-granules-cmr-metadata-file-links task before the hyrax-metadata-updates and/or post-to-cmr tasks, update the step definition for update-granules-cmr-metadata-file-links as follows:

        "UpdateGranulesCmrMetadataFileLinksStep": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "buckets": "{$.meta.buckets}",
    "distribution_endpoint": "{$.meta.distribution_endpoint}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.etags}",
    "destination": "{$.meta.file_etags}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    ...more configuration...

    hyrax-metadata-updates

    For any workflows using the hyrax-metadata-updates task before a post-to-cmr task, update the definition of the hyrax-metadata-updates step as follows:

        "HyraxMetadataUpdatesTask": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "bucket": "{$.meta.buckets.internal.name}",
    "stack": "{$.meta.stack}",
    "cmr": "{$.meta.cmr}",
    "launchpad": "{$.meta.launchpad}",
    "etags": "{$.meta.file_etags}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.etags}",
    "destination": "{$.meta.file_etags}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    ...more configuration...

    post-to-cmr

    For any workflows using post-to-cmr task after the update-granules-cmr-metadata-file-links or hyrax-metadata-updates tasks, update the post-to-cmr step definition as follows:

        "CmrStep": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "bucket": "{$.meta.buckets.internal.name}",
    "stack": "{$.meta.stack}",
    "cmr": "{$.meta.cmr}",
    "launchpad": "{$.meta.launchpad}",
    "etags": "{$.meta.file_etags}"
    }
    }
    },
    ...more configuration...

    Example workflow

    For an example workflow integrating all of these changes, please see our example ingest and publish workflow.

    Optional - Integrate granuleDuplicates information

    Please note that the granuleDuplicates output is purely informational and does not have any bearing on the separate configuration for how duplicates should be handled.

    You can include granuleDuplicates output from the sync-granule or move-granules tasks in your workflow messages like so:

        "SyncGranule": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    ...other config...
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.granuleDuplicates}",
    "destination": "{$.meta.sync_granule.granule_duplicates}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    }
    ...more configuration...

The result of this configuration is that the granuleDuplicates output from sync-granule would be placed in meta.sync_granule.granule_duplicates on the workflow message and remain there throughout the rest of the workflow. The same configuration could be replicated for the move-granules task, but be sure to use a different destination in the workflow message for the granuleDuplicates output.

    Updating collection URL path templates

    Collections can specify url_path templates to dynamically generate the final location of files. As part of url_path templates, file object properties can be interpolated to generate the file path. Thus, these url_path templates need to be updated to ensure that they are compatible with the updated files schema and the properties that will actually be available on file objects.

    See the notes on the updated files schema to know which properties are available and which previously existing properties were deprecated.

    As an example, you will want to update any url_path properties in your collections to remove references to file.name and replace them with references to file.fileName like so:

    - "url_path": "{cmrMetadata.CollectionReference.ShortName}___{cmrMetadata.CollectionReference.Version}/{substring(file.name, 0, 3)}",
    + "url_path": "{cmrMetadata.CollectionReference.ShortName}___{cmrMetadata.CollectionReference.Version}/{substring(file.fileName, 0, 3)}",
    - + \ No newline at end of file diff --git a/docs/v13.0.0/upgrade-notes/upgrade-rds/index.html b/docs/v13.0.0/upgrade-notes/upgrade-rds/index.html index 524c85117c6..7c226d5648e 100644 --- a/docs/v13.0.0/upgrade-notes/upgrade-rds/index.html +++ b/docs/v13.0.0/upgrade-notes/upgrade-rds/index.html @@ -5,7 +5,7 @@ Upgrade to RDS release | Cumulus Documentation - + @@ -21,7 +21,7 @@ | cutoffSeconds | number | Number of seconds prior to this execution to 'cutoff' reconciliation queries. This allows in-progress/other in-flight operations time to complete and propagate to Elasticsearch/Dynamo/postgres. | 3600 | | dbConcurrency | number | Sets max number of parallel collections reports the script will run at a time. | 20 | | dbMaxPool | number | Sets the maximum number of connections the database pool has available. Modifying this may result in unexpected failures. | 20 |

    - + \ No newline at end of file diff --git a/docs/v13.0.0/upgrade-notes/upgrade_tf_version_0.13.6/index.html b/docs/v13.0.0/upgrade-notes/upgrade_tf_version_0.13.6/index.html index 3950caf5a97..edd412f648d 100644 --- a/docs/v13.0.0/upgrade-notes/upgrade_tf_version_0.13.6/index.html +++ b/docs/v13.0.0/upgrade-notes/upgrade_tf_version_0.13.6/index.html @@ -5,13 +5,13 @@ Upgrade to TF version 0.13.6 | Cumulus Documentation - +
    Version: v13.0.0

    Upgrade to TF version 0.13.6

    Background

Cumulus pins its support to a specific version of Terraform (see the deployment documentation). The reason for only supporting one specific Terraform version at a time is to avoid deployment errors that can be caused by deploying to the same target with different Terraform versions.

    Cumulus is upgrading its supported version of Terraform from 0.12.12 to 0.13.6. This document contains instructions on how to perform the upgrade for your deployments.

    Prerequisites

    • Follow the Terraform guidance for what to do before upgrading, notably ensuring that you have no pending changes to your Cumulus deployments before proceeding.
      • You should do a terraform plan to see if you have any pending changes for your deployment (for both the data-persistence-tf and cumulus-tf modules), and if so, run a terraform apply before doing the upgrade to Terraform 0.13.6
    • Review the Terraform v0.13 release notes to prepare for any breaking changes that may affect your custom deployment code. Cumulus' deployment code has already been updated for compatibility with version 0.13.
• Install Terraform version 0.13.6. We recommend using the Terraform Version Manager tfenv to manage your installed versions of Terraform, but this is not required (an example tfenv invocation is shown below).
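For example, with tfenv installed, switching to the supported version looks like this:

tfenv install 0.13.6
tfenv use 0.13.6
terraform --version   # should report Terraform v0.13.6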

    Upgrade your deployment code

    Terraform 0.13 does not support some of the syntax from previous Terraform versions, so you need to upgrade your deployment code for compatibility.

    Terraform provides a 0.13upgrade command as part of version 0.13 to handle automatically upgrading your code. Make sure to check out the documentation on batch usage of 0.13upgrade, which will allow you to upgrade all of your Terraform code with one command.

    Run the 0.13upgrade command until you have no more necessary updates to your deployment code.
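One possible way to batch the upgrade over every directory containing .tf files is sketched below; adjust the find expression to your repository layout:

find . -name '*.tf' -not -path '*/.terraform/*' \
  | xargs -n1 dirname \
  | sort -u \
  | xargs -n1 terraform 0.13upgrade -yes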

    Upgrade your deployment

    1. Ensure that you are running Terraform 0.13.6 by running terraform --version. If you are using tfenv, you can switch versions by running tfenv use 0.13.6.

    2. For the data-persistence-tf and cumulus-tf directories, take the following steps:

      1. Run terraform init --reconfigure. The --reconfigure flag is required, otherwise you might see an error like:

        Error: Failed to decode current backend config

        The backend configuration created by the most recent run of "terraform init"
        could not be decoded: unsupported attribute "lock_table". The configuration
        may have been initialized by an earlier version that used an incompatible
        configuration structure. Run "terraform init -reconfigure" to force
        re-initialization of the backend.
      2. Run terraform apply to perform a deployment.

        WARNING: Even if Terraform says that no resource changes are pending, running the apply using Terraform version 0.13.6 will modify your backend state from version 0.12.12 to version 0.13.6 without requiring approval. Updating the backend state is a necessary part of the version 0.13.6 upgrade, but it is not completely transparent.

    - + \ No newline at end of file diff --git a/docs/v13.0.0/workflow_tasks/discover_granules/index.html b/docs/v13.0.0/workflow_tasks/discover_granules/index.html index c363cb45089..6c6313452ef 100644 --- a/docs/v13.0.0/workflow_tasks/discover_granules/index.html +++ b/docs/v13.0.0/workflow_tasks/discover_granules/index.html @@ -5,7 +5,7 @@ Discover Granules | Cumulus Documentation - + @@ -21,7 +21,7 @@ included in a granule's file list. That is, no such filtering based on filename occurs as described above.

    When set on the task configuration, the value applies to all collections during discovery. Otherwise, this property may be set on individual collections.

    Concurrency

    A number property that determines the level of concurrency with which granule duplicate checks are performed when duplicateGranuleHandling is skip or error.

    Limiting concurrency helps to avoid throttling by the AWS Lambda API and helps to avoid encountering account Lambda concurrency limitations.

    We do not recommend increasing this value unless you are seeing Lambda.Timeout errors when discover-granules discovers a large number of granules with skip or error duplicate handling. However, as increasing the concurrency may lead to Lambda API or Lambda concurrency throttling errors, you may wish to consider converting the discover-granules task to an ECS activity, which does not face similar runtime constraints.

    The default value is 3.

    Task Outputs

    This task outputs an assembled array of Cumulus granule objects as the payload for the next task, and returns only the expected payload for the next task.

    - + \ No newline at end of file diff --git a/docs/v13.0.0/workflow_tasks/files_to_granules/index.html b/docs/v13.0.0/workflow_tasks/files_to_granules/index.html index 9f7c3890c1c..b381652859e 100644 --- a/docs/v13.0.0/workflow_tasks/files_to_granules/index.html +++ b/docs/v13.0.0/workflow_tasks/files_to_granules/index.html @@ -5,13 +5,13 @@ Files To Granules | Cumulus Documentation - +
    Version: v13.0.0

    Files To Granules

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions and more can be found on the auto-generated Cumulus Tasks page.

    Summary

This task utilizes the incoming config.inputGranules and the task input list of S3 URIs, along with the rest of the configuration objects, to take the list of incoming files and sort them into a list of granule objects.

Please note: files passed in without metadata previously defined in config.inputGranules will have the following keys added:

    • size
    • bucket
    • key
    • fileName

    It is primarily intended to support compatibility with the standard output of a processing task, and convert that output into a granule object accepted as input by the majority of other Cumulus tasks.

    Task Inputs

    Input

    This task expects an incoming input that contains an array of 'staged' S3 URIs to move to their final archive location.

    For the specifics, see the Cumulus Tasks page entry for the schema.

    Configuration

    This task does expect values to be set in the workflow_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Below are expanded descriptions of selected config keys:

    inputGranules

    An array of Cumulus granule objects.

    This object will be used to define metadata values for the move granules task, and is the basis for the updated object that will be added to the output.

    Task Outputs

    This task outputs an assembled array of Cumulus granule objects as the payload for the next task, and returns only the expected payload for the next task.

    - + \ No newline at end of file diff --git a/docs/v13.0.0/workflow_tasks/lzards_backup/index.html b/docs/v13.0.0/workflow_tasks/lzards_backup/index.html index c6f209a5072..5b489236845 100644 --- a/docs/v13.0.0/workflow_tasks/lzards_backup/index.html +++ b/docs/v13.0.0/workflow_tasks/lzards_backup/index.html @@ -5,13 +5,13 @@ LZARDS Backup | Cumulus Documentation - +
    Version: v13.0.0

    LZARDS Backup

    The LZARDS backup task takes an array of granules and initiates backup requests to the LZARDS API, which will be handled asynchronously by LZARDS.

    Deployment

    The LZARDS backup task is not automatically deployed with Cumulus. To deploy the task through the Cumulus module, first you must specify a lzards_launchpad_passphrase in your terraform variables (e.g. variables.tf) like so:

    variable "lzards_launchpad_passphrase" {
    type = string
    default = ""
    }

    Then you can specify a value for your lzards_launchpad_passphrase in terraform.tfvars like so:

lzards_launchpad_passphrase = "your-passphrase"

    Lastly, you need to make sure that the lzards_launchpad_passphrase is passed into the Cumulus module (in main.tf) like so:

    lzards_launchpad_passphrase  = var.lzards_launchpad_passphrase

    In short, deploying the LZARDS task requires configuring a passphrase variable and ensuring that your TF configuration passes that variable into the Cumulus module.

Additional Terraform configuration for the LZARDS task can be found in the cumulus module's variables.tf file, where the relevant variables are prefixed with lzards_. You can add these variables to your deployment using the same process outlined above for lzards_launchpad_passphrase.

    Task Inputs

    Input

    This task expects an array of granules as input.

    For the specifics, see the Cumulus Tasks page entry for the schema.

    Configuration

    This task does expect values to be set in the workflow_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Task Outputs

    Output

    The LZARDS task outputs a composite object containing:

    • the input granules array, and
    • a backupResults object that describes the results of LZARDS backup attempts.

    For the specifics, see the Cumulus Tasks page entry for the schema.

    - + \ No newline at end of file diff --git a/docs/v13.0.0/workflow_tasks/move_granules/index.html b/docs/v13.0.0/workflow_tasks/move_granules/index.html index 4e64e5607de..361a70954e5 100644 --- a/docs/v13.0.0/workflow_tasks/move_granules/index.html +++ b/docs/v13.0.0/workflow_tasks/move_granules/index.html @@ -5,13 +5,13 @@ Move Granules | Cumulus Documentation - +
    Version: v13.0.0

    Move Granules

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions and more can be found on the auto-generated Cumulus Tasks page.

    Summary

    This task utilizes the incoming event.input array of Cumulus granule objects to do the following:

    • Move granules from their 'staging' location to the final location (as configured in the Sync Granules task)

    • Update the event.input object with the new file locations.

• If the granule has an ECHO10/UMM CMR file (.cmr.xml or .cmr.json) included in the event.input:

      • Update that file's access locations

      • Add it to the appropriate access URL category for the CMR filetype as defined by granule CNM filetype.

      • Set the CMR file to 'metadata' in the output granules object and add it to the granule files if it's not already present.

Please note: Granules without a valid CNM type set in the granule file type field in event.input will be treated as "data" in the updated CMR metadata file.

    • Task then outputs an updated list of granule objects.

    Task Inputs

    Input

    This task expects an incoming input that contains a list of 'staged' S3 URIs to move to their final archive location. If CMR metadata is to be updated for a granule, it must also be included in the input.

    For the specifics, see the Cumulus Tasks page entry for the schema.

    Configuration

    This task does expect values to be set in the workflow_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Input

    This task expects event.input to provide an array of Cumulus granule objects. The files listed for each granule represent the files to be acted upon as described in summary.

    Task Outputs

    This task outputs an assembled array of Cumulus granule objects with post-move file locations as the payload for the next task, and returns only the expected payload for the next task. If a CMR file has been specified for a granule object, the CMR resources related to the granule files will be updated according to the updated granule file metadata.

    Examples

See the SIPS workflow cookbook for an example of this task in a workflow.

    - + \ No newline at end of file diff --git a/docs/v13.0.0/workflow_tasks/parse_pdr/index.html b/docs/v13.0.0/workflow_tasks/parse_pdr/index.html index 3b6c9ec395e..bfbd52c9d85 100644 --- a/docs/v13.0.0/workflow_tasks/parse_pdr/index.html +++ b/docs/v13.0.0/workflow_tasks/parse_pdr/index.html @@ -5,13 +5,13 @@ Parse PDR | Cumulus Documentation - +
    Version: v13.0.0

    Parse PDR

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions and more can be found on the auto-generated Cumulus Tasks page.

    Summary

    The purpose of this task is to do the following with the incoming PDR object:

    • Stage it to an internal S3 bucket

    • Parse the PDR

    • Archive the PDR and remove the staged file if successful

• Outputs a payload object containing metadata about the parsed PDR (e.g. total size of all files, file counts, etc.) and a granules object

The constructed granules object is created using PDR metadata to determine values like data type and version, and collection definitions to determine a file storage location based on the extracted data type and version number.

    Granule file types are converted from the PDR spec types to CNM types according to the following translation table:

HDF: 'data',
HDF-EOS: 'data',
SCIENCE: 'data',
BROWSE: 'browse',
METADATA: 'metadata',
BROWSE_METADATA: 'metadata',
QA_METADATA: 'metadata',
PRODHIST: 'qa',
QA: 'metadata',
TGZ: 'data',
LINKAGE: 'data'

Files missing file types will have none assigned; files with invalid types will result in a PDR parse failure.

    Task Inputs

    Input

    This task expects an incoming input that contains name and path information about the PDR to be parsed. For the specifics, see the Cumulus Tasks page entry for the schema.

    Configuration

    This task does expect values to be set in the workflow_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Below are expanded descriptions of selected config keys:

    Provider

    A Cumulus provider object. Used to define connection information for retrieving the PDR.

    Bucket

    Defines the bucket where the 'pdrs' folder for parsed PDRs will be stored.

    Collection

    A Cumulus collection object. Used to define granule file groupings and granule metadata for discovered files.

    Task Outputs

This task outputs a single payload output object containing metadata about the parsed PDR (e.g. filesCount, totalSize, etc.), a pdr object with information for later steps, and the generated array of granule objects.

    Examples

See the SIPS workflow cookbook for an example of this task in a workflow.

    - + \ No newline at end of file diff --git a/docs/v13.0.0/workflow_tasks/queue_granules/index.html b/docs/v13.0.0/workflow_tasks/queue_granules/index.html index 4e319a455a1..5bb251d5946 100644 --- a/docs/v13.0.0/workflow_tasks/queue_granules/index.html +++ b/docs/v13.0.0/workflow_tasks/queue_granules/index.html @@ -5,14 +5,14 @@ Queue Granules | Cumulus Documentation - +
    Version: v13.0.0

    Queue Granules

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions, and more can be found on the auto-generated Cumulus Tasks page.

    Summary

    The purpose of this task is to schedule ingest of granules that were discovered on a remote host, whether via the DiscoverGranules task or the ParsePDR task.

    The task utilizes a defined collection in concert with a defined provider, either set on each granule or passed in via config, to queue up ingest executions for each granule or for batches of granules.

    The constructed granules object is defined by the collection passed in the configuration, and has impacts on other provided core Cumulus Tasks.

    Users of this task in a workflow are encouraged to carefully consider their configuration in context of downstream tasks and workflows.

    Task Inputs

    Each of the following sections is a high-level discussion of the intent of the various input/output/config values.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Input

    This task expects an incoming input that contains granules and information about them and their files. For the specifics, see the Cumulus Tasks page entry for the schema.

    This input is most commonly the output from a preceding DiscoverGranules or ParsePDR task.

    Cumulus Configuration

    This task does expect values to be set in the task_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Below are expanded descriptions of selected config keys:

    provider

    A Cumulus provider object for the originating provider. Will be passed along to the ingest workflow. This will be overruled by more specific provider information that may exist on a granule.

    internalBucket

    The Cumulus internal system bucket.

    granuleIngestWorkflow

    A string property that denotes the name of the ingest workflow into which granules should be queued.

    queueUrl

    A string property that denotes the URL of the queue to which scheduled execution messages are sent.

    preferredQueueBatchSize

    A number property that sets an upper bound on the size of each batch of granules queued into the payload of an ingest execution. Setting this property to a value higher than 1 allows queueing of multiple granules per ingest workflow.

    As ingest executions typically expect granules in the payload to have a common collection and common provider, this property only sets an upper bound within which batches will be created based on common collection and provider information.

    This means batches may be smaller than the preferred size if collection or provider information diverge, but never larger.

    The default value if none is specified is 1, which will queue one ingest execution per granule.

    concurrency

    A number property that determines the level of concurrency with which ingest executions are scheduled. Granules or batches of granules will be queued up into executions at this level of concurrency.

    This property is also used to limit concurrency when updating granule status to queued.

    Limiting concurrency helps to avoid throttling by the AWS Lambda API and helps to avoid encountering account Lambda concurrency limitations.

    We do not recommend increasing this value unless you are seeing Lambda.Timeout errors when queue-granules receives a large number of granules as input. However, as increasing the concurrency may lead to Lambda API or Lambda concurrency throttling errors, you may wish to consider converting the queue-granules task to an ECS activity, which does not face similar runtime constraints.

    The default value is 3.

    executionNamePrefix

    A string property that will prefix the names of scheduled executions.

    childWorkflowMeta

    An object property that will be merged into the scheduled execution input's meta field.
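
    Putting the keys above together, a QueueGranules step's CMA parameters might look roughly like the following sketch. The template paths, workflow name, and queue reference are placeholders drawn from typical deployments, not required values; consult the config.json schema for the authoritative definitions.

    {
      "QueueGranules": {
        "Parameters": {
          "cma": {
            "event.$": "$",
            "task_config": {
              "provider": "{$.meta.provider}",
              "internalBucket": "{$.meta.buckets.internal.name}",
              "granuleIngestWorkflow": "IngestGranule",
              "queueUrl": "{$.meta.queues.startSF}",
              "preferredQueueBatchSize": 1,
              "concurrency": 3,
              "executionNamePrefix": "my-prefix",
              "childWorkflowMeta": { "staticKey": "staticValue" }
            }
          }
        }
      }
    }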

    Task Outputs

    This task outputs an assembled array of workflow execution ARNs for all scheduled workflow executions within the payload's running object.
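
    Illustratively (the ARN below is a placeholder), the output payload would include something like:

    {
      "running": [
        "arn:aws:states:us-east-1:111122223333:execution:my-prefix-IngestGranule:some-execution-name"
      ]
    }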

diff --git a/docs/v13.0.0/workflows/cumulus-task-message-flow/index.html b/docs/v13.0.0/workflows/cumulus-task-message-flow/index.html
    Version: v13.0.0

    Cumulus Tasks: Message Flow

    Cumulus Tasks compose Cumulus Workflows and are either AWS Lambda tasks or AWS Elastic Container Service (ECS) activities. Cumulus Tasks permit a payload as input to the main task application code. The task payload is additionally wrapped by the Cumulus Message Adapter. The Cumulus Message Adapter supplies additional information supporting message templating and metadata management of these workflows.

    Diagram showing how incoming and outgoing Cumulus messages for workflow steps are handled by the Cumulus Message Adapter

    The steps in this flow are detailed in sections below.

    Cumulus Message Format

    A full Cumulus Message has the following keys:

    • cumulus_meta: System runtime information that should generally not be touched outside of Cumulus library code or the Cumulus Message Adapter. Stores meta information about the workflow such as the state machine name and the current workflow execution's name. This information is used to look up the current active task. The name of the current active task is used to look up the corresponding task's config in task_config.
    • meta: Runtime information captured by the workflow operators. Stores execution-agnostic variables.
    • payload: Payload is runtime information for the tasks.

    In addition to the above keys, it may contain the following keys:

    • replace: A key generated in conjunction with the Cumulus Message adapter. It contains the location on S3 for a message payload and a Target JSON path in the message to extract it to.
    • exception: A key used to track workflow exceptions, should not be modified outside of Cumulus library code.

    Here's a simple example of a Cumulus Message:

    {
    "task_config": {
    "inlinestr": "prefix{meta.foo}suffix",
    "array": "{[$.meta.foo]}",
    "object": "{$.meta}"
    },
    "cumulus_meta": {
    "message_source": "sfn",
    "state_machine": "arn:aws:states:us-east-1:1234:stateMachine:MySfn",
    "execution_name": "MyExecution__id-1234",
    "id": "id-1234"
    },
    "meta": {
    "foo": "bar"
    },
    "payload": {
    "anykey": "anyvalue"
    }
    }

    A message utilizing the Cumulus Remote message functionality must have at least the keys replace and cumulus_meta. Depending on configuration, other portions of the message may be present; however, the cumulus_meta, meta, and payload keys must be present once extraction is complete.

    {
    "replace": {
    "Bucket": "cumulus-bucket",
    "Key": "my-large-event.json",
    "TargetPath": "$"
    },
    "cumulus_meta": {}
    }

    Cumulus Message Preparation

    The event coming into a Cumulus Task is assumed to be a Cumulus Message and should first be handled by the functions described below before being passed to the task application code.

    Preparation Step 1: Fetch remote event

    Fetch remote event will fetch the full event from S3 if the cumulus message includes a replace key.

    Once "my-large-event.json" is fetched from S3, it's returned from the fetch remote event function. If no "replace" key is present, the event passed to the fetch remote event function is assumed to be a complete Cumulus Message and returned as-is.

    Preparation Step 2: Parse step function config from CMA configuration parameters

    This step determines which task is currently being executed. Note this is different from which lambda or activity is being executed, because the same lambda or activity can be used for different tasks. The current task name is used to load the appropriate configuration from the Cumulus Message's 'task_config' configuration parameter.

    Preparation Step 3: Load nested event

    Using the config returned from the previous step, load nested event resolves templates for the final config and input to send to the task's application code.

    Task Application Code

    After message prep, the message passed to the task application code is of the form:

    {
    "input": {},
    "config": {}
    }

    Create Next Message functions

    Whatever comes out of the task application code is used to construct an outgoing Cumulus Message.

    Create Next Message Step 1: Assign outputs

    The config loaded from the Fetch step function config step may have a cumulus_message key. This can be used to "dispatch" fields from the task's application output to a destination in the final event output (via URL templating). Here's an example where the value of input.anykey would be dispatched as the value of payload.out in the final cumulus message:

    {
    "task_config": {
    "bar": "baz",
    "cumulus_message": {
    "input": "{$.payload.input}",
    "outputs": [
    {
    "source": "{$.input.anykey}",
    "destination": "{$.payload.out}"
    }
    ]
    }
    },
    "cumulus_meta": {
    "task": "Example",
    "message_source": "local",
    "id": "id-1234"
    },
    "meta": {
    "foo": "bar"
    },
    "payload": {
    "input": {
    "anykey": "anyvalue"
    }
    }
    }

    Create Next Message Step 2: Store remote event

    If the ReplaceConfiguration parameter is set, the configured key's value will be stored in S3 and the final output of the task will include a replace key that contains configuration for a future step to extract the payload on S3 back into the Cumulus Message. The replace key identifies where the large event node has been stored in S3.

diff --git a/docs/v13.0.0/workflows/developing-a-cumulus-workflow/index.html b/docs/v13.0.0/workflows/developing-a-cumulus-workflow/index.html
    Version: v13.0.0

    Creating a Cumulus Workflow

    The Cumulus workflow module

    To facilitate adding workflows to your deployment, Cumulus provides a workflow module.

    In combination with the Cumulus message, the workflow module provides a way to easily turn a Step Function definition into a Cumulus workflow, complete with:

    Using the module also ensures that your workflows will continue to be compatible with future versions of Cumulus.

    For more on the full set of current available options for the module, please consult the module README.

    Adding a new Cumulus workflow to your deployment

    To add a new Cumulus workflow to your deployment that is using the cumulus module, add a new workflow resource to your deployment directory, either in a new .tf file, or to an existing file.

    The workflow should follow a syntax similar to:

    module "my_workflow" {
    source = "https://github.com/nasa/cumulus/releases/download/vx.x.x/terraform-aws-cumulus-workflow.zip"

    prefix = "my-prefix"
    name = "MyWorkflowName"
    system_bucket = "my-internal-bucket"

    workflow_config = module.cumulus.workflow_config

    tags = { Deployment = var.prefix }

    state_machine_definition = <<JSON
    {}
    JSON
    }

    In the above example, you would add your state_machine_definition using the Amazon States Language, using tasks you've developed and Cumulus core tasks that are made available as part of the cumulus terraform module.
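
    For illustration, a bare-bones state_machine_definition with a single Lambda task might look like the sketch below. The state name and Lambda ARN are placeholders, and a real workflow step would typically also set the CMA Parameters described in the Cumulus Tasks: Message Flow and Workflow Inputs & Outputs pages.

    {
      "Comment": "A minimal single-step workflow",
      "StartAt": "HelloWorld",
      "States": {
        "HelloWorld": {
          "Type": "Task",
          "Resource": "arn:aws:lambda:us-east-1:111122223333:function:my-prefix-HelloWorld",
          "End": true
        }
      }
    }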

    Please note: Cumulus follows the convention of tagging resources with the prefix variable { Deployment = var.prefix } that you pass to the cumulus module. For resources defined outside of Core, it's recommended that you adopt this convention as it makes resources and/or deployment recovery scenarios much easier to manage.

    Examples

    For a functional example of a basic workflow, please take a look at the hello_world_workflow.

    For more complete/advanced examples, please read the following cookbook entries/topics:

diff --git a/docs/v13.0.0/workflows/developing-workflow-tasks/index.html b/docs/v13.0.0/workflows/developing-workflow-tasks/index.html
    Version: v13.0.0

    Developing Workflow Tasks

    Workflow tasks can be either AWS Lambda Functions or ECS Activities.

    Lambda functions

    The full set of available core Lambda functions can be found in the deployed cumulus module zipfile at /tasks, as well as reference documentation here. These Lambdas can be referenced in workflows via the outputs from that module (see the cumulus-template-deploy repo for an example).

    The tasks source is located in the Cumulus repository at cumulus/tasks.

    You can also develop your own Lambda function. See the Lambda Functions page to learn more.

    ECS Activities

    ECS activities are supported via the cumulus_ecs_module available from the Cumulus release page.

    Please read the module README for configuration details.

    For assistance in creating a task definition within the module read the AWS Task Definition Docs.

    For a step-by-step example of using the cumulus_ecs_module, please see the related cookbook entry.

    Cumulus Docker Image

    ECS activities require a docker image. Cumulus provides a docker image (source for node 12.x+ lambdas on dockerhub: cumuluss/cumulus-ecs-task).

    Alternate Docker Images

    Custom docker images/runtimes are supported as are private registries. For details on configuring a private registry/image see the AWS documentation on Private Registry Authentication for Tasks.

diff --git a/docs/v13.0.0/workflows/docker/index.html b/docs/v13.0.0/workflows/docker/index.html

    Dockerizing Data Processing

    2) validate the output (in this case just check for existence)
    3) use 'ncatted' to update the resulting file to be CF-compliant
    4) write out metadata generated for this file

    Process Testing

    It is important to have tests for data processing; however, in many cases data files can be large, so it is not practical to store the test data in the repository. Instead, test data is currently stored on AWS S3, and can be retrieved using the AWS CLI.

    aws s3 sync s3://cumulus-ghrc-logs/sample-data/collection-name data

    Where collection-name is the name of the data collection, such as 'avaps', or 'cpl'. For example, an abridged version of the data for CPL includes:

    ├── cpl
    │   ├── input
    │   │   ├── HS3_CPL_ATB_12203a_20120906.hdf5
    │   │   ├── HS3_CPL_OP_12203a_20120906.hdf5
    │   └── output
    │       ├── HS3_CPL_ATB_12203a_20120906.nc
    │       ├── HS3_CPL_ATB_12203a_20120906.nc.meta.xml
    │       ├── HS3_CPL_OP_12203a_20120906.nc
    │       ├── HS3_CPL_OP_12203a_20120906.nc.meta.xml

    Contained in the input directory are all possible sets of data files, while the output directory is the expected result of processing. In this case the hdf5 files are converted to NetCDF files and XML metadata files are generated.

    The docker image for a process can be used on the retrieved test data. First create a test-output directory in the newly created data directory.

    mkdir data/test-output

    Then run the docker image using docker-compose.

    docker-compose run test

    This will process the data in the data/input directory and put the output into data/test-output. Repositories also include Python based tests which will validate this newly created output against the contents of data/output. Use Python's Nose tool to run the included tests.

    nosetests

    If the data/test-output directory validates against the contents of data/output the tests will be successful; otherwise an error will be reported.

diff --git a/docs/v13.0.0/workflows/index.html b/docs/v13.0.0/workflows/index.html
    Version: v13.0.0

    Workflows

    Workflows are comprised of one or more AWS Lambda Functions and ECS Activities to discover, ingest, process, manage and archive data.

    Provider data ingest and GIBS have a set of common needs in getting data from a source system and into the cloud where they can be distributed to end users. These common needs are:

    • Data Discovery - Crawling, polling, or detecting changes from a variety of sources.
    • Data Transformation - Taking data files in their original format and extracting and transforming them into another desired format such as visible browse images.
    • Archival - Storage of the files in a location that's accessible to end users.

    The high level view of the architecture and many of the individual steps are the same, but the details of ingesting each type of collection differ. Different collection types and different providers have different needs. Not only are the individual boxes of a workflow different; the branching, error handling, and multiplicity of the arrows connecting the boxes are also different. Some need visible images rendered from component data files from multiple collections. Some need to contact the CMR with updated metadata. Some will have different retry strategies to handle availability issues with source data systems.

    AWS and other cloud vendors provide an ideal solution for parts of these problems but there needs to be a higher level solution to allow the composition of AWS components into a full featured solution. The Ingest Workflow Architecture is designed to meet the needs for Earth Science data ingest and transformation.

    Goals

    Flexibility and Composability

    The steps to ingest and process data are different for each collection within a provider. Ingest should be as flexible as possible in the rearranging of steps and configuration.

    We want to use lego-like individual steps that can be composed by an operator.

    Individual steps should ...

    • Be as ignorant as possible of the overall flow. They should not be aware of previous steps.
    • Be runnable on their own.
    • Define their input and output in simple data structures.
    • Be domain agnostic.
    • Not make assumptions about the specifics of what goes into a granule, for example.

    Scalable

    The ingest architecture needs to be scalable both to handle ingesting hundreds of millions of granules and to interpret dozens of different workflows.

    Data Provenance

    • We should have traceability for how data was produced and where it comes from.
    • Use immutable representations of data. Data once received is not overwritten. Data can be removed for cleanup.
    • All software is versioned. We can trace transformation of data by tracking the immutable source data and the versioned software applied to it.

    Operator Visibility and Control

    • Operators should be able to see and understand everything that is happening in the system.
    • It should be obvious why things are happening and straightforward to diagnose problems.
    • We generally assume that the operators know best in terms of the limits on a provider's infrastructure, how often things need to be done, and details of a collection. The architecture should defer to their decisions and knowledge while providing safety nets to prevent problems.

    A Reconfigurable Workflow Architecture

    The Ingest Workflow Architecture is defined by two entity types, Workflows and Tasks. A Workflow is a set of composed Tasks to complete an objective such as ingesting a granule. Tasks are the individual steps of a Workflow that perform one job. The workflow is responsible for executing the right task based on the current state and response from the last task executed. Tasks are completely decoupled in that they don't call each other or even need to know about the presence of other tasks.

    Workflows and tasks are configured as Terraform resources, which are triggered via configured rules within Cumulus.

    Diagram showing the Step Function execution path through workflow tasks for a collection ingest

    See the Example GIBS Ingest Architecture showing how workflows and tasks are used to define the GIBS Ingest Architecture.

    Workflows

    A workflow is a provider-configured set of steps that describe the process to ingest data. Workflows are defined using AWS Step Functions.

    Benefits of AWS Step Functions

    AWS Step Functions are described in detail in the AWS documentation, but they provide several benefits which are applicable to Cumulus.

    • Prebuilt solution
    • Operations Visibility
      • Visual diagram
      • Every execution is recorded with both inputs and output for every step.
    • Composability
      • Allow composing AWS Lambdas and code running in other steps. Code can be run in EC2 to interface with it or even on premise if desired.
      • Step functions allow specifying when steps run in parallel or choices between steps based on data from the previous step.
    • Flexibility
      • Step functions are designed to make it easy to build new applications and reconfigure them. We're exposing that flexibility directly to the provider.
    • Reliability and Error Handling
      • Step functions allow configuration of retries and adding handling of error conditions.
    • Described via data
      • This makes it easy to save the step function in configuration management solutions.
      • We can build simple interfaces on top of the flexibility provided.

    Workflow Scheduler

    The scheduler is responsible for initiating a step function and passing in the relevant data for a collection. This is currently configured as an interval for each collection. The scheduler service creates the initial event by combining the collection configuration with the AWS execution context defined via the cumulus terraform module.

    Tasks

    A workflow is composed of tasks. Each task is responsible for performing a discrete step of the ingest process. These can be activities like:

    • Crawling a provider website for new data.
    • Uploading data from a provider to S3.
    • Executing a process to transform data.

    AWS Step Functions permit tasks to be code running anywhere, even on premise. We expect most tasks will be written as Lambda functions in order to take advantage of the easy deployment, scalability, and cost benefits provided by AWS Lambda.

    • Leverages Existing Work
      • The design leverages the existing work of Amazon by defining workflows using the AWS Step Function State Language. This is the language that was created for describing the state machines used in AWS Step Functions.
    • Open for Extension
      • Both meta and task_config, which are used for configuring at the collection and task levels, do not dictate the fields and structure of the configuration. Additional task-specific JSON schemas can be used for extending the validation of individual steps.
    • Data-centric Configuration
      • The use of a single JSON configuration file allows this to be added to a workflow. We build additional support on top of the configuration file for simpler domain specific configuration or interactive GUIs.

    For more details on Task Messages and Configuration, visit Cumulus configuration and message protocol documentation.

    Ingest Deploy

    To view deployment documentation, please see the Cumulus deployment documentation.

    Tradeoffs, and Benefits

    This section documents various tradeoffs and benefits of the Ingest Workflow Architecture.

    Tradeoffs

    Workflow execution is handled completely by AWS

    This means we can't add our own code into the orchestration of the workflow. We can't add new features not supported by Step Functions. We can't do things like enforce that the responses from tasks always conform to a schema or extract the configuration for a task ahead of its execution.

    If we implemented our own orchestration we'd be able to add all of these. We save significant amounts of development effort and gain all the features of Step Functions for this trade off. One workaround is providing a library of common task capabilities. These would optionally be available to tasks that can be implemented with Node.js and are able to include the library.

    Workflow Configuration is specified in AWS Step Function States Language

    The current design combines the states language defined by AWS with Ingest specific configuration. This means our representation has a tight coupling with their standard. If they make backwards incompatible changes in the future we will have to deal with existing projects written against that.

    We avoid having to develop our own standard and code to process it. The design can support new features in AWS Step Functions without needing changes to the Ingest library code. It is unlikely they will make a backwards incompatible change at this point. One mitigation for this is writing data transformations to a new format if that were to happen.

    Collection Configuration Flexibility vs Complexity

    The Collections Configuration File is very flexible but requires more knowledge of AWS Step Functions to configure. A person modifying this file directly would need to be comfortable editing a JSON file and configuring AWS Step Functions state transitions which address AWS resources.

    The configuration file itself is not necessarily meant to be edited by a human directly. Since we are developing a reconfigurable, composable architecture that is specified entirely in data, additional tools can be developed on top of it. The existing recipes.json files can be mapped to this format. Operational tools like a GUI can be built that provide a usable interface for customizing workflows, but it will take time to develop these tools.

    Benefits

    This section describes benefits of the Ingest Workflow Architecture.

    Simplicity

    The concepts of Workflows and Tasks are simple ones that should make sense to providers. Additionally, the implementation will only consist of a few components because the design leverages existing services and capabilities of AWS. The Ingest implementation will only consist of some reusable task code to make task implementation easier, Ingest deployment, and the Workflow Scheduler.

    Composability

    The design aims to satisfy the needs for ingest integrating different workflows for providers. It's flexible in terms of the ability to arrange tasks to meet the needs of a collection. Providers have developed and incorporated open source tools over the years. All of these are easily integrable into the workflows as tasks.

    There is low coupling between task steps. Failures of one component don't bring the whole system down. Individual tasks can be deployed separately.

    Scalability

    AWS Step Functions scale up as needed and aren't limited by a set number of servers. They also easily allow you to leverage the inherent scalability of serverless functions.

    Monitoring and Auditing

    • Every execution is captured.
    • Every task run has captured input and outputs.
    • CloudWatch Metrics can be used for monitoring many of the events within Step Functions. It can also generate alarms for the whole process.
    • Visual report of the entire configuration.
      • Errors and success states are highlighted visually in the flow.

    Data Provenance

    • Monitoring and auditing ensures we know the data that was given to a task.
    • Workflows are versioned and the state machines stored in AWS Step Functions are immutable. Once created they cannot change.
    • Versioning of data in S3 or using immutable records in S3 will mean we always know what data was created as the result of a step or fed into a step.

    Appendix

    Example GIBS Ingest Architecture

    This shows the GIBS Ingest Architecture as an example of the use of the Ingest Workflow Architecture.

    • The GIBS Ingest Architecture consists of two workflows per collection type. There is one for discovery and one for ingest. The final stage of discovery triggers multiple ingest workflows for each MRF granule that needs to be generated.
    • It demonstrates both lambdas as tasks and a container used for MRF generation.

    GIBS Ingest Workflows

    Diagram showing the AWS Step Function execution path for a GIBS ingest workflow

    GIBS Ingest Granules Workflow

    This shows a visualization of an execution of the ingest granules workflow in step functions. The steps highlighted in green are the ones that executed and completed successfully.

    Diagram showing the AWS Step Function execution path for a GIBS ingest granules workflow

diff --git a/docs/v13.0.0/workflows/input_output/index.html b/docs/v13.0.0/workflows/input_output/index.html
    Version: v13.0.0

    Workflow Inputs & Outputs

    General Structure

    Cumulus uses a common format for all inputs and outputs to workflows. The same format is used for input and output from workflow steps. The common format consists of a JSON object which holds all necessary information about the task execution and AWS environment. Tasks return objects identical in format to their input with the exception of a task-specific payload field. Tasks may also augment their execution metadata.

    Cumulus Message Adapter

    The Cumulus Message Adapter and Cumulus Message Adapter libraries help task developers integrate their tasks into a Cumulus workflow. These libraries adapt input and outputs from tasks into the Cumulus Message format. The Scheduler service creates the initial event message by combining the collection configuration, external resource configuration, workflow configuration, and deployment environment settings. The subsequent workflow messages between tasks must conform to the message schema. By using the Cumulus Message Adapter, individual task Lambda functions only receive the input and output specifically configured for the task, and not non-task-related message fields.

    The Cumulus Message Adapter libraries are called by the tasks with a callback function containing the business logic of the task as a parameter. They first adapt the incoming message to a format more easily consumable by Cumulus tasks, then invoke the task, and then adapt the task response back to the Cumulus message protocol to be sent to the next task.

    A task's Lambda function can be configured to include a Cumulus Message Adapter library which constructs input/output messages and resolves task configurations. The CMA can then be included in one of several ways:

    Lambda Layer

    In order to make use of this configuration, a Lambda layer must be uploaded to your account. Due to platform restrictions, Core cannot currently support sharable public layers; however, you can deploy the appropriate version from the release page in two ways:

    Once you've deployed the layer, integrate the CMA layer with your Lambdas:

    • If using the cumulus module, set the cumulus_message_adapter_lambda_layer_version_arn in your .tfvars file to integrate the CMA layer with all core Cumulus lambdas.
    • If including your own Lambda or ECS task Terraform modules, specify the CMA layer ARN in the Terraform resource definitions. Also, make sure to set the CUMULUS_MESSAGE_ADAPTER_DIR environment variable for the task to /opt for the CMA integration to work properly.

    In the future if you wish to update/change the CMA version you will need to update the deployed CMA, and update the layer configuration for the impacted Lambdas as needed.

    Please Note: Updating/removing a layer does not change a deployed Lambda, so to update the CMA you should deploy a new version of the CMA layer, update the associated Lambda configuration to reference the new CMA version, and re-deploy your Lambdas.

    Manual Addition

    You can include the CMA package in the Lambda code in the cumulus-message-adapter sub-directory of your lambda .zip, for any Lambda runtime that includes a Python runtime. Python 2 is included in Lambda runtimes that use Amazon Linux; however, Amazon Linux 2 will not support this directly.

    Please note: It is expected that upcoming Cumulus releases will update the CMA layer to include a python runtime.

    If you are manually adding the message adapter to your source and utilizing the CMA, you should set the Lambda's CUMULUS_MESSAGE_ADAPTER_DIR environment variable to target the installation path for the CMA.

    CMA Input/Output

    Input to the task application code is a JSON object with keys:

    • input: By default, the incoming payload is the payload output from the previous task, or it can be a portion of the payload as configured for the task in the corresponding .tf workflow definition file.
    • config: Task-specific configuration object with URL templates resolved.

    Output from the task application code is returned and placed in the payload key by default, but the task configuration can also be used to return just a portion of the task output.

    CMA configuration

    As of Cumulus > 1.15 and CMA > v1.1.1, configuration of the CMA is expected to be driven by AWS Step Function Parameters.

    Using the CMA package with the Lambda by any of the above mentioned methods (Lambda Layers, manual) requires configuration for its various features via a specific Step Function Parameters configuration format (see sample workflows in the examples cumulus-tf source for more examples):

    {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": "{some config}",
    "task_config": "{some config}"
    }
    }

    The "event.$": "$" parameter is required as it passes the entire incoming message to the CMA client library for parsing, and the CMA itself to convert the incoming message into a Cumulus message for use in the function.

    The following are the CMA's current configuration settings:

    ReplaceConfig (Cumulus Remote Message)

    Because of the potential size of a Cumulus message, mainly the payload field, a task can be set via configuration to store a portion of its output on S3, with a remote message key that defines how to retrieve it and an empty JSON object {} in its place. If the portion of the message targeted exceeds the configured MaxSize (defaults to 0 bytes) it will be written to S3.

    The CMA remote message functionality can be configured using parameters in several ways:

    Partial Message

    Setting the Path/TargetPath in the ReplaceConfig parameter (and optionally a non-default MaxSize)

    {
    "DiscoverGranules": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "MaxSize": 1,
    "Path": "$.payload",
    "TargetPath": "$.payload"
    }
    }
    }
    }
    }

    will result in any payload output larger than the MaxSize (in bytes) being written to S3. The CMA will then mark that the key has been replaced via a replace key on the event. When the CMA picks up the replace key in future steps, it will attempt to retrieve the output from S3 and write it back to payload.

    Note that you can optionally use a different TargetPath than Path; however, as the target is a JSON path, there must be a key to target for replacement in the output of that step. Also note that the JSON path specified must target one node, otherwise the CMA will error, as it does not support multiple replacement targets.

    If TargetPath is omitted, it will default to the value for Path.

    Full Message

    Setting the following parameters for a lambda:

    DiscoverGranules:
      Parameters:
        cma:
          event.$: '$'
          ReplaceConfig:
            FullMessage: true

    will result in the CMA assuming the entire inbound message should be stored to S3 if it exceeds the default max size.

    This is effectively the same as doing:

    {
    "DiscoverGranules": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "MaxSize": 0,
    "Path": "$",
    "TargetPath": "$"
    }
    }
    }
    }
    }

    Cumulus Message example

    {
    "task_config": {
    "inlinestr": "prefix{meta.foo}suffix",
    "array": "{[$.meta.foo]}",
    "object": "{$.meta}"
    },
    "cumulus_meta": {
    "message_source": "sfn",
    "state_machine": "arn:aws:states:us-east-1:1234:stateMachine:MySfn",
    "execution_name": "MyExecution__id-1234",
    "id": "id-1234"
    },
    "meta": {
    "foo": "bar"
    },
    "payload": {
    "anykey": "anyvalue"
    }
    }

    Cumulus Remote Message example

    The message may contain a reference to an S3 Bucket, Key and TargetPath as follows:

    {
    "replace": {
    "Bucket": "cumulus-bucket",
    "Key": "my-large-event.json",
    "TargetPath": "$"
    },
    "cumulus_meta": {}
    }

    task_config

    This configuration key contains the input/output configuration values for definition of inputs/outputs via URL paths. Important: These values are all relative to the JSON object configured for event.$.

    This configuration's behavior is outlined in the CMA step description below.

    The configuration should follow the format:

    {
    "FunctionName": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "other_cma_configuration": "<config object>",
    "task_config": "<task config>"
    }
    }
    }
    }

    Example:

    {
    "StepFunction": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "sfnEnd": true,
    "stack": "{$.meta.stack}",
    "bucket": "{$.meta.buckets.internal.name}",
    "stateMachine": "{$.cumulus_meta.state_machine}",
    "executionName": "{$.cumulus_meta.execution_name}",
    "cumulus_message": {
    "input": "{$}"
    }
    }
    }
    }
    }
    }

    Cumulus Message Adapter Steps

    1. Reformat AWS Step Function message into Cumulus Message

    Due to the way AWS handles Parameterized messages, when Parameters are used the CMA takes an inbound message:

    {
    "resource": "arn:aws:lambda:us-east-1:<lambda arn values>",
    "input": {
    "Other Parameter": {},
    "cma": {
    "ConfigKey": {
    "config values": "some config values"
    },
    "event": {
    "cumulus_meta": {},
    "payload": {},
    "meta": {},
    "exception": {}
    }
    }
    }
    }

    and takes the following actions:

    • Takes the object at input.cma.event and makes it the full input
    • Merges all of the keys except event under input.cma into the parent input object

    This results in the incoming message (presumably a Cumulus message), with any cma configuration parameters merged in, being passed to the CMA. All other parameterized values defined outside of the cma key are ignored.

    2. Resolve Remote Messages

    If the incoming Cumulus message has a replace key value, the CMA will attempt to pull the payload from S3.

    For example, if the incoming message contains the following:

      "meta": {
    "foo": {}
    },
    "replace": {
    "TargetPath": "$.meta.foo",
    "Bucket": "some_bucket",
    "Key": "events/some-event-id"
    }

    The CMA will attempt to pull the file stored at Bucket/Key and replace the value at TargetPath, then remove the replace object entirely and continue.
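
    For instance, if the object stored at events/some-event-id contained {"someKey": "someValue"} (a made-up value for illustration), the message after resolution would contain:

      "meta": {
    "foo": {
    "someKey": "someValue"
    }
    }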

    3. Resolve URL templates in the task configuration

    In the workflow configuration (defined under the task_config key), each task has its own configuration, and it can use URL templates as values to achieve simplicity or for values only available at execution time. The Cumulus Message Adapter resolves the URL templates (relative to the event configuration key) and then passes the message to the next task. For example, given a task which has the following configuration:

    {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "provider": "{$.meta.provider}",
    "inlinestr": "prefix{meta.foo}suffix",
    "array": "{[$.meta.foo]}",
    "object": "{$.meta}"
    }
    }
    }
    }

    and an incoming message that contains:

    {
    "meta": {
    "foo": "bar",
    "provider": {
    "id": "FOO_DAAC",
    "anykey": "anyvalue"
    }
    }
    }

    The corresponding Cumulus Message would contain:

    "meta": {
    "foo": "bar",
    "provider": {
    "id": "FOO_DAAC",
    "anykey": "anyvalue"
    }
    },
    "task_config": {
    "provider": "{$.meta.provider}",
    "inlinestr": "prefix{meta.foo}suffix",
    "array": "{[$.meta.foo]}",
    "object": "{$.meta}"
    }

    The message sent to the task would be:

    "config" : {
    "provider": {
    "id": "FOO_DAAC",
    "anykey": "anyvalue"
    },
    "inlinestr": "prefixbarsuffix",
    "array": ["bar"],
    "object": {
    "foo": "bar",
    "provider": {
    "id": "FOO_DAAC",
    "anykey": "anyvalue"
    }
    }
    },
    "input": "{...}"

    URL template variables replace dotted paths inside curly brackets with their corresponding value. If the Cumulus Message Adapter cannot resolve a value, it will ignore the template, leaving it verbatim in the string. While seemingly complex, this allows significant decoupling of Tasks from one another and the data that drives them. Tasks are able to easily receive runtime configuration produced by previously run tasks and domain data.

    4. Resolve task input

    By default, the incoming payload is the payload from the previous task. The task can also be configured to use a portion of the payload as its input message. For example, given a task specifies cma.task_config.cumulus_message.input:

    ExampleTask:
      Parameters:
        cma:
          event.$: '$'
          task_config:
            cumulus_message:
              input: '{$.payload.foo}'

    The task configuration in the message would be:

        {
    "task_config": {
    "cumulus_message": {
    "input": "{$.payload.foo}"
    }
    },
    "payload": {
    "foo": {
    "anykey": "anyvalue"
    }
    }
    }

    The Cumulus Message Adapter will resolve the task input; instead of sending the whole payload as task input, the task input would be:

        {
    "input" : {
    "anykey": "anyvalue"
    },
    "config": {...}
    }

    5. Resolve task output

    By default, the task's return value is the next payload. However, the workflow task configuration can specify a portion of the return value as the next payload, and can also augment values in other fields. Based on the task configuration under cma.task_config.cumulus_message.outputs, the Message Adapter uses a task's return value to output a message as configured by the task-specific config defined under cma.task_config. The Message Adapter dispatches a "source" to a "destination" as defined by URL templates stored in the task-specific cumulus_message.outputs. The value at the "source" URL in the task's return value is used to create or replace the value at the "destination" URL. For example, given a task specifies cumulus_message.outputs in its workflow configuration as follows:

    {
    "ExampleTask": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$}",
    "destination": "{$.payload}"
    },
    {
    "source": "{$.output.anykey}",
    "destination": "{$.meta.baz}"
    }
    ]
    }
    }
    }
    }
    }
    }

    The corresponding Cumulus Message would be:

        {
    "task_config": {
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$}",
    "destination": "{$.payload}"
    },
    {
    "source": "{$.output.anykey}",
    "destination": "{$.meta.baz}"
    }
    ]
    }
    },
    "meta": {
    "foo": "bar"
    },
    "payload": {
    "anykey": "anyvalue"
    }
    }

    Given the response from the task is:

        {
    "output": {
    "anykey": "boo"
    }
    }

    The Cumulus Message Adapter would output the following Cumulus Message:

        {
    "task_config": {
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$}",
    "destination": "{$.payload}"
    },
    {
    "source": "{$.output.anykey}",
    "destination": "{$.meta.baz}"
    }
    ]
    }
    },
    "meta": {
    "foo": "bar",
    "baz": "boo"
    },
    "payload": {
    "output": {
    "anykey": "boo"
    }
    }
    }

    6. Apply Remote Message Configuration

    If the ReplaceConfig configuration parameter is defined, the CMA will evaluate the configuration options provided, and if required write a portion of the Cumulus Message to S3, and add a replace key to the message for future steps to utilize.

    Please Note: the non-user-modifiable field cumulus_meta will always be retained, regardless of the configuration.

    For example, if the output message (post output configuration) from a Cumulus task looks like:

        {
    "cumulus_meta": {
    "some_key": "some_value"
    },
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$}",
    "destination": "{$.payload}"
    },
    {
    "source": "{$.output.anykey}",
    "destination": "{$.meta.baz}"
    }
    ]
    }
    },
    "meta": {
    "foo": "bar",
    "baz": "boo"
    },
    "payload": {
    "output": {
    "anykey": "boo"
    }
    }
    }

    the resultant output would look like:

    {
    "cumulus_meta": {
    "some_key": "some_value"
    },
    "replace": {
    "TargetPath": "$",
    "Bucket": "some-internal-bucket",
    "Key": "events/some-event-id"
    }
    }

    Additional features

    Validate task input, output and configuration messages against the schemas provided

    The Cumulus Message Adapter has the capability to validate task input, output and configuration messages against their schemas. The default location of the schemas is the schemas folder in the top level of the task and the default filenames are input.json, output.json, and config.json. The task can also configure a different schema location. If no schema can be found, the Cumulus Message Adapter will not validate the messages.
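
    For reference, these schemas are standard JSON Schema documents. A minimal, hypothetical schemas/input.json that requires a granules array might look like:

    {
      "title": "MyTaskInput",
      "type": "object",
      "required": ["granules"],
      "properties": {
        "granules": {
          "type": "array",
          "items": { "type": "object" }
        }
      }
    }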

diff --git a/docs/v13.0.0/workflows/lambda/index.html b/docs/v13.0.0/workflows/lambda/index.html
    Version: v13.0.0

    Develop Lambda Functions

    Develop a new Cumulus Lambda

    AWS provides a great getting started guide for building Lambdas in the developer guide.

    Cumulus currently supports the following environments for Cumulus Message Adapter enabled functions:

    Additionally, you may choose to include any of the other languages AWS supports as a resource with reduced feature support.

    Deploy a Lambda

    Node.js Lambda

    For a new Node.js Lambda, create a new function and add an aws_lambda_function resource to your Cumulus deployment (for examples, see example/lambdas.tf and ingest/lambda-functions.tf in the source), either in a new .tf file or added to an existing .tf file:

    resource "aws_lambda_function" "myfunction" {
    function_name = "${var.prefix}-function"
    filename = "/path/to/zip/lambda.zip"
    source_code_hash = filebase64sha256("/path/to/zip/lambda.zip")
    handler = "index.handler"
    role = module.cumulus.lambda_processing_role_arn
    runtime = "nodejs10.x"

    vpc_config {
    subnet_ids = var.subnet_ids
    security_group_ids = var.security_group_ids
    }
    }

    Please note: This example contains the minimum set of required configuration.

    Make sure to include a vpc_config that matches the information you've provided the cumulus module if intending to integrate the lambda with a Cumulus deployment.

    Java Lambda

    Java Lambdas are created in much the same way as the Node.js example above.

    The source points to a folder with the compiled .class files and dependency libraries in the Lambda Java zip folder structure (details here), not an uber-jar.

    The deploy folder referenced here would contain a folder 'test_task/task/' which contains Task.class and TaskLogic.class as well as a lib folder containing dependency jars.

    Python Lambda

    Python Lambdas are created the same way as the Node.js example above.

    Cumulus Message Adapter

    For Lambdas wishing to utilize the Cumulus Message Adapter (CMA), you should define a layers key on your Lambda resource with the CMA you wish to include. See the input_output docs for more on how to create/use the CMA.

    Other Lambda Options

    Cumulus supports all of the options available to you via the aws_lambda_function Terraform resource. For more information on what's available, check out the Terraform resource docs.

    Cloudwatch log groups

    If you want to enable Cloudwatch logging for your Lambda resource, you'll need to add an aws_cloudwatch_log_group resource to your Lambda definition:

    resource "aws_cloudwatch_log_group" "myfunction_log_group" {
    name = "/aws/lambda/${aws_lambda_function.myfunction.function_name}"
    retention_in_days = 30
    tags = { Deployment = var.prefix }
    }
diff --git a/docs/v13.0.0/workflows/protocol/index.html b/docs/v13.0.0/workflows/protocol/index.html
    Version: v13.0.0

    Workflow Protocol

    Configuration and Message Use Diagram

    A diagram showing at which point in a workflow the Cumulus message is checked for conformity with the message schema and where the configuration is checked for conformity with the configuration schema

    • Configuration - The Cumulus workflow configuration defines everything needed to describe an instance of Cumulus.
    • Scheduler - This starts ingest of a collection on configured intervals.
    • Input to Step Functions - The Scheduler uses the Configuration as source data to construct the input to the Workflow.
    • AWS Step Functions - Run the workflows as kicked off by the scheduler or other processes.
    • Input to Task - The input for each task is a JSON document that conforms to the message schema.
    • Output from Task - The output of each task must conform to the message schemas as well and is used as the input for the subsequent task.
diff --git a/docs/v13.0.0/workflows/workflow-configuration-how-to/index.html b/docs/v13.0.0/workflows/workflow-configuration-how-to/index.html

    Workflow Configuration How To's

    To take a subset of any given metadata, use the option substring.

    "url_path": "{cmrMetadata.Granule.Collection.ShortName}/{substring(file.fileName, 0, 3)}"

    This example will populate to "MOD09GQ/MOD"

    In addition to substring, several datetime-specific functions are available, which can parse a datetime string in the metadata and extract a certain part of it:

    "url_path": "{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}"

    or

     "url_path": "{dateFormat(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime, YYYY-MM-DD[T]HH[:]mm[:]ss)}"

    The following functions are implemented:

    • extractYear - returns the year, formatted as YYYY
    • extractMonth - returns the month, formatted as MM
    • extractDate - returns the day of the month, formatted as DD
    • extractHour - returns the hour in 24-hour format, with no leading zero
    • dateFormat - takes a second argument describing how to format the date, and passes the metadata date string and the format argument to moment().format()

    Note: the move-granules step needs to be in the workflow for this template to be populated and the file moved. This cmrMetadata or CMR granule XML needs to have been generated and stored on S3. From there any field could be retrieved and used for a url_path.

    Adding Metadata dates and times to the URL Path

    There are a number of options to pull dates from the CMR file metadata. With this metadata:

    <Granule>
    <Temporal>
    <RangeDateTime>
    <BeginningDateTime>2003-02-19T00:00:00Z</BeginningDateTime>
    <EndingDateTime>2003-02-19T23:59:59Z</EndingDateTime>
    </RangeDateTime>
    </Temporal>
    </Granule>

    The following examples of url_path could be used.

    {extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the year from the full date: 2003.

    {extractMonth(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the month: 2.

    {extractDate(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the day: 19.

    {extractHour(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the hour: 0.

    Different values can be combined to create the url_path. For example:

    {
    "bucket": "sample-protected-bucket",
    "name": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}/{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)/extractDate(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}"
    }

    The final file location for the above would be s3://sample-protected-bucket/MOD09GQ/2003/19/MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.

diff --git a/docs/v13.0.0/workflows/workflow-triggers/index.html b/docs/v13.0.0/workflows/workflow-triggers/index.html
    Version: v13.0.0

    Workflow Triggers

    For a workflow to run, it needs to be associated with a rule (see rule configuration). The rule configuration determines how and when a workflow execution is triggered. Rules can be triggered one time, on a schedule, or by new data written to a kinesis stream.
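
    For context, a rule is a small JSON record managed via the Cumulus API or dashboard. A scheduled rule might look roughly like the sketch below; the names, collection, and schedule value are placeholders, and the full set of fields is defined by the rules schema in the API documentation.

    {
      "name": "my_scheduled_rule",
      "workflow": "IngestGranule",
      "provider": "MY_PROVIDER",
      "collection": {
        "name": "MOD09GQ",
        "version": "006"
      },
      "rule": {
        "type": "scheduled",
        "value": "rate(1 hour)"
      },
      "state": "ENABLED"
    }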

    There are three lambda functions in the API package responsible for scheduling and starting workflows: SF scheduler, message consumer, and SF starter. Each Cumulus instance comes with a Start SF SQS queue.

    The SF scheduler lambda puts a message onto the Start SF queue. This message is picked up by the Start SF lambda and an execution is started with the body of the message as the input.

    When a one time rule is created, the schedule SF lambda is triggered. Rules that are not one time are associated with a CloudWatch event which will manage the trigger of the lambdas that trigger the workflows.

    For a scheduled rule, the Cloudwatch event is triggered on the given schedule which calls directly to the schedule SF lambda.

    For a kinesis rule, when data is added to the kinesis stream, the Cloudwatch event is triggered, which calls the message consumer lambda. The message consumer lambda parses the kinesis message and finds all of the rules associated with that message. For each rule (which corresponds to one workflow), the schedule SF lambda is triggered to queue a message to start the workflow.

    For an sns rule, when a message is published to the SNS topic, the message consumer receives the SNS message (JSON expected), parses it into an object, starts a new execution of the workflow associated with the rule and passes the object in the payload field of the Cumulus message.

    Diagram showing how workflows are scheduled via rules

    - + \ No newline at end of file diff --git a/docs/v13.4.0/adding-a-task/index.html b/docs/v13.4.0/adding-a-task/index.html index d977d59dfd3..e91787a558e 100644 --- a/docs/v13.4.0/adding-a-task/index.html +++ b/docs/v13.4.0/adding-a-task/index.html @@ -5,13 +5,13 @@ Contributing a Task | Cumulus Documentation - +
    Version: v13.4.0

    Contributing a Task

    We're tracking reusable Cumulus tasks in this list and, if you've got one you'd like to share with others, you can add it!

    Right now we're focused on tasks distributed via npm, but are open to including others. For now the script that pulls all the data for each package only supports npm.

    The tasks.md file is generated in the build process

    The tasks list in docs/tasks.md is generated from the list of task package names from the tasks folder.

    Do not edit the docs/tasks.md file directly.

    - + \ No newline at end of file diff --git a/docs/v13.4.0/api/index.html b/docs/v13.4.0/api/index.html index 396c3f7c88b..abbc62375af 100644 --- a/docs/v13.4.0/api/index.html +++ b/docs/v13.4.0/api/index.html @@ -5,13 +5,13 @@ Cumulus API | Cumulus Documentation - + - + \ No newline at end of file diff --git a/docs/v13.4.0/architecture/index.html b/docs/v13.4.0/architecture/index.html index 6961a2cf749..3429e368522 100644 --- a/docs/v13.4.0/architecture/index.html +++ b/docs/v13.4.0/architecture/index.html @@ -5,14 +5,14 @@ Architecture | Cumulus Documentation - +
    Version: v13.4.0

    Architecture

    Architecture

    Below, find a diagram with the components that comprise an instance of Cumulus.

    Architecture diagram of a Cumulus deployment

    This diagram details all of the major architectural components of a Cumulus deployment.

While the diagram can feel complex, it can be broken down into several major components:

    Data Distribution

End Users can access data via Cumulus's distribution submodule, which includes ASF's thin egress application. This provides authenticated data egress, temporary S3 links, and other statistics features.

    End user exposure of Cumulus's holdings is expected to be provided by an external service.

    For NASA use, this is assumed to be CMR in this diagram.

    Data ingest

    Workflows

The core of the ingest and processing capabilities in Cumulus is built into the deployed AWS Step Function workflows. Cumulus rules trigger workflows via either CloudWatch rules, Kinesis streams, SNS topics, or SQS queues. The workflows then run with a configured Cumulus message, utilizing built-in processes to report status of granules, PDRs, executions, etc. to the Data Persistence components.

    Workflows can optionally report granule metadata to CMR, and workflow steps can report metrics information to a shared SNS topic, which could be subscribed to for near real time granule, execution, and PDR status. This could be used for metrics reporting using an external ELK stack, for example.

    Data persistence

Cumulus entity state data is stored in a set of PostgreSQL compatible databases and is exported to an Elasticsearch instance, which provides non-authoritative querying of state data for the API and other applications that require more complex queries. Currently the entity state data is also replicated in DynamoDB; this will be removed in a future release.

    Data discovery

    Discovering data for ingest is handled via workflow step components using Cumulus provider and collection configurations and various triggers. Data can be ingested from AWS S3, FTP, HTTPS and more.

    Database

Cumulus utilizes a user-provided PostgreSQL database backend. For improved API search query efficiency, Cumulus provides data replication to an Elasticsearch instance. For legacy reasons, Cumulus currently also deploys a DynamoDB datastore, and writes are replicated in parallel with the PostgreSQL database writes. The DynamoDB replicated tables and parallel writes will be removed in future releases.

    PostgreSQL Database Schema Diagram

    ERD of the Cumulus Database

    Maintenance

    System maintenance personnel have access to manage ingest and various portions of Cumulus via an AWS API gateway, as well as the operator dashboard.

    Deployment Structure

    Cumulus is deployed via Terraform and is organized internally into two separate top-level modules, as well as several external modules.

    Cumulus

    The Cumulus module, which contains multiple internal submodules, deploys all of the Cumulus components that are not part of the Data Persistence portion of this diagram.

    Data persistence

    The data persistence module provides the Data Persistence portion of the diagram.

    Other modules

Other modules are provided as artifacts on the release page for users configuring their own deployments; they contain extracted subcomponents of the cumulus module. For more on these components see the components documentation.

For more on the specific structure, examples of use, and how to deploy, please see the deployment docs as well as the cumulus-template-deploy repo.

    - + \ No newline at end of file diff --git a/docs/v13.4.0/configuration/cloudwatch-retention/index.html b/docs/v13.4.0/configuration/cloudwatch-retention/index.html index 2b44c32ccf0..12532cec90e 100644 --- a/docs/v13.4.0/configuration/cloudwatch-retention/index.html +++ b/docs/v13.4.0/configuration/cloudwatch-retention/index.html @@ -5,13 +5,13 @@ Cloudwatch Retention | Cumulus Documentation - +
    Version: v13.4.0

    Cloudwatch Retention

    Our lambdas dump logs to AWS CloudWatch. By default, these logs exist indefinitely. However, there are ways to specify a duration for log retention.

    aws-cli

    In addition to getting your aws-cli set-up, there are two values you'll need to acquire.

1. log-group-name: the name of the log group whose retention policy (retention time) you'd like to change. We'll use /aws/lambda/KinesisInboundLogger in our examples.
2. retention-in-days: the number of days you'd like to retain the logs in the specified log group. There is a list of possible values available in the aws logs documentation.

    For example, if we wanted to set log retention to 30 days on our KinesisInboundLogger lambda, we would write:

    aws logs put-retention-policy --log-group-name "/aws/lambda/KinesisInboundLogger" --retention-in-days 30

    Note: The aws-cli log command that we're using is explained in detail here.
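
To confirm the new retention policy took effect, you can describe the log group (using the same example log group name) and check the retentionInDays field in the response:

aws logs describe-log-groups --log-group-name-prefix "/aws/lambda/KinesisInboundLogger"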

    AWS Management Console

    Changing the log retention policy in the AWS Management Console is a fairly simple process:

    1. Navigate to the CloudWatch service in the AWS Management Console.
    2. Click on the Logs entry on the sidebar.
3. Find the Log Group whose retention policy you're interested in changing.
    4. Click on the value in the Expire Events After column.
    5. Enter/Select the number of days you'd like to retain logs in that log group for.

    Screenshot of AWS console showing how to configure the retention period for Cloudwatch logs

    - + \ No newline at end of file diff --git a/docs/v13.4.0/configuration/collection-storage-best-practices/index.html b/docs/v13.4.0/configuration/collection-storage-best-practices/index.html index 69f7ac86c1a..21231d90563 100644 --- a/docs/v13.4.0/configuration/collection-storage-best-practices/index.html +++ b/docs/v13.4.0/configuration/collection-storage-best-practices/index.html @@ -5,13 +5,13 @@ Collection Cost Tracking and Storage Best Practices | Cumulus Documentation - +
    Version: v13.4.0

    Collection Cost Tracking and Storage Best Practices

    Organizing your data is important for metrics you may want to collect. AWS S3 storage and cost metrics are calculated at the bucket level, so it is easy to get metrics by bucket. You can get storage metrics at the key prefix level, but that is done through the CLI, which can be very slow for large buckets. It is very difficult to estimate costs at the prefix level.

    Calculating Storage By Collection

    By bucket

    Usage by bucket can be obtained in your AWS Billing Dashboard via an S3 Usage Report. You can download your usage report for a period of time and review your storage and requests at the bucket level.

    Bucket metrics can also be found in the AWS CloudWatch Metrics Console (also see Using Amazon CloudWatch Metrics).

    Navigate to Storage Metrics and select the BucketName for all buckets you are interested in. The available metrics are BucketSizeInBytes and NumberOfObjects.

In the Graphed metrics tab, you can select the type of statistic (e.g. average, minimum, maximum) and the period for the stats. At the top, it's useful to select from the dropdown to view the metrics as a number. You can also select the time period for which you want to see stats.

    Alternatively you can query CloudWatch using the CLI.

    This command will return the average number of bytes in the bucket test-bucket for 7/31/2019:

    aws cloudwatch get-metric-statistics --namespace AWS/S3 --start-time 2019-07-31T00:00:00 --end-time 2019-08-01T00:00:00 --period 86400 --statistics Average --region us-east-1 --metric-name BucketSizeBytes --dimensions Name=BucketName,Value=test-bucket Name=StorageType,Value=StandardStorage

    The result looks like:

{
  "Datapoints": [
    {
      "Timestamp": "2019-07-31T00:00:00Z",
      "Average": 150996467959.0,
      "Unit": "Bytes"
    }
  ],
  "Label": "BucketSizeBytes"
}
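
Similarly, the object count for the same bucket and day can be retrieved by querying the NumberOfObjects metric, which uses the AllStorageTypes storage type dimension:

aws cloudwatch get-metric-statistics --namespace AWS/S3 --start-time 2019-07-31T00:00:00 --end-time 2019-08-01T00:00:00 --period 86400 --statistics Average --region us-east-1 --metric-name NumberOfObjects --dimensions Name=BucketName,Value=test-bucket Name=StorageType,Value=AllStorageTypes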

    By key prefix

    AWS does not offer storage and usage statistics at a key prefix level. Via the AWS CLI, you can get the total storage for a bucket or folder. The following command would get the storage for folder example-folder in bucket sample-bucket:

    aws s3 ls --summarize --human-readable --recursive s3://sample-bucket/example-folder | grep 'Total'

    Note that this can be a long-running operation for large buckets.

    Calculating Cost By Collection

    NASA NGAP Environment

    If using an NGAP account, the cost per bucket can be found in your CloudTamer console, in the Financials section of your account information. This is calculated on a monthly basis.

    There is no easy way to get the cost by folder in the buckets. You could calculate an estimate using the storage per prefix vs. the storage of the bucket.

    Outside of NGAP

You can enable S3 Cost Allocation Tags and tag your buckets. From there, you can view the cost breakdown in your AWS Billing Dashboard via the Cost Explorer. Cost Allocation Tagging is available at the bucket level.

    There is no easy way to get the cost by folder in the buckets. You could calculate an estimate using the storage per prefix vs. the storage of the bucket.
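
As a sketch, a bucket could be tagged via the AWS CLI as follows (the tag key and value here are hypothetical); once the tag is activated as a cost allocation tag in the Billing console, Cost Explorer can group costs by it:

aws s3api put-bucket-tagging --bucket sample-bucket --tagging 'TagSet=[{Key=Collection,Value=MOD09GQ-006}]'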

    Storage Configuration

    Cumulus allows for the configuration of many buckets for your files. Buckets are created and added to your deployment as part of the deployment process.

    In your Cumulus collection configuration, you specify where you want the files to be stored post-processing. This is done by matching a regular expression on the file with the configured bucket.

    Note that in the collection configuration, the bucket field is the key to the buckets variable in the deployment's .tfvars file.
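
For reference, a minimal sketch of that buckets variable in terraform.tfvars might look like the following (bucket names are hypothetical; consult the deployment documentation for the authoritative format). The map keys (e.g. protected) are what the collection's bucket field refers to:

buckets = {
  internal = {
    name = "sample-internal-bucket"
    type = "internal"
  }
  protected = {
    name = "sample-protected-bucket"
    type = "protected"
  }
  public = {
    name = "sample-public-bucket"
    type = "public"
  }
}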

    Organizing By Bucket

    You can specify separate groups of buckets for each collection, which could look like the example below.

{
  "name": "MOD09GQ",
  "version": "006",
  "granuleId": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$",
  "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
  "files": [
    {
      "bucket": "MOD09GQ-006-protected",
      "regex": "^.*\\.hdf$",
      "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf"
    },
    {
      "bucket": "MOD09GQ-006-private",
      "regex": "^.*\\.hdf\\.met$",
      "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.met"
    },
    {
      "bucket": "MOD09GQ-006-protected",
      "regex": "^.*\\.cmr\\.xml$",
      "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.cmr.xml"
    },
    {
      "bucket": "MOD09GQ-006-public",
      "regex": "^.*\\.jpg$",
      "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_ndvi.jpg"
    }
  ]
}

    Additional collections would go to different buckets.

    Organizing by Key Prefix

    Different collections can be organized into different folders in the same bucket, using the key prefix, which is specified as the url_path in the collection configuration. In this simplified collection configuration example, the url_path field is set at the top level so that all files go to a path prefixed with the collection name and version.

{
  "name": "MOD09GQ",
  "version": "006",
  "granuleId": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$",
  "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}",
  "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
  "files": [
    {
      "bucket": "protected",
      "regex": "^.*\\.hdf$",
      "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf"
    },
    {
      "bucket": "private",
      "regex": "^.*\\.hdf\\.met$",
      "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.met"
    },
    {
      "bucket": "protected",
      "regex": "^.*\\.cmr\\.xml$",
      "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.cmr.xml"
    },
    {
      "bucket": "public",
      "regex": "^.*\\.jpg$",
      "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_ndvi.jpg"
    }
  ]
}

    In this case, the path to all the files would be: MOD09GQ___006/<filename> in their respective buckets.

The url_path can be overridden directly on the file configuration. The example below produces the same result.

{
  "name": "MOD09GQ",
  "version": "006",
  "granuleId": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$",
  "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
  "files": [
    {
      "bucket": "protected",
      "regex": "^.*\\.hdf$",
      "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
      "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
    },
    {
      "bucket": "private",
      "regex": "^.*\\.hdf\\.met$",
      "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.met",
      "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
    },
    {
      "bucket": "protected-2",
      "regex": "^.*\\.cmr\\.xml$",
      "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.cmr.xml",
      "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
    },
    {
      "bucket": "public",
      "regex": "^.*\\.jpg$",
      "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_ndvi.jpg",
      "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
    }
  ]
}
    - + \ No newline at end of file diff --git a/docs/v13.4.0/configuration/data-management-types/index.html b/docs/v13.4.0/configuration/data-management-types/index.html index d22f6a0b8ea..429a9490140 100644 --- a/docs/v13.4.0/configuration/data-management-types/index.html +++ b/docs/v13.4.0/configuration/data-management-types/index.html @@ -5,13 +5,13 @@ Cumulus Data Management Types | Cumulus Documentation - +
    Version: v13.4.0

    Cumulus Data Management Types

    What Are The Cumulus Data Management Types

    • Collections: Collections are logical sets of data objects of the same data type and version. They provide contextual information used by Cumulus ingest.
    • Granules: Granules are the smallest aggregation of data that can be independently managed. They are always associated with a collection, which is a grouping of granules.
    • Providers: Providers generate and distribute input data that Cumulus obtains and sends to workflows.
    • Rules: Rules tell Cumulus how to associate providers and collections and when/how to start processing a workflow.
    • Workflows: Workflows are composed of one or more AWS Lambda Functions and ECS Activities to discover, ingest, process, manage, and archive data.
    • Executions: Executions are records of a workflow.
    • Reconciliation Reports: Reports are a comparison of data sets to check to see if they are in agreement and to help Cumulus users detect conflicts.

    Interaction

    • Providers tell Cumulus where to get new data - i.e. S3, HTTPS
    • Collections tell Cumulus where to store the data files
    • Rules tell Cumulus when to trigger a workflow execution and tie providers and collections together

    Managing Data Management Types

    The following are created via the dashboard or API:

    • Providers
    • Collections
    • Rules
    • Reconciliation reports

    Granules are created by workflow executions and then can be managed via the dashboard or API.

    An execution record is created for each workflow execution triggered and can be viewed in the dashboard or data can be retrieved via the API.

    Workflows are created and managed via the Cumulus deployment.

    Configuration Fields

    Schemas

Looking at our API schema definitions can provide us with some insight into collections, providers, rules, and their attributes (and whether those are required or not). The schemas for the different concepts will be referenced throughout this document.

    The schemas are extremely useful for understanding which attributes are configurable and which of those are required. Cumulus uses these schemas for validation.

    Providers

    Please note:

• While connection configuration is defined here, things that are specific to a particular ingest setup (e.g. 'What target directory should we be pulling from' or 'How is duplicate handling configured?') are generally defined in a Rule or Collection, not the Provider.
    • There is some provider behavior which is controlled by task-specific configuration and not the provider definition. This configuration has to be set on a per-workflow basis. For example, see the httpListTimeout configuration on the discover-granules task

    Provider Configuration

    The Provider configuration is defined by a JSON object that takes different configuration keys depending on the provider type. The following are definitions of typical configuration values relevant for the various providers:

Configuration by provider type

S3

• id (string, required): Unique identifier for the provider
• globalConnectionLimit (integer, optional): Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited
• protocol (string, required): The protocol for this provider. Must be s3 for this provider type.
• host (string, required): S3 Bucket to pull data from

http

• id (string, required): Unique identifier for the provider
• globalConnectionLimit (integer, optional): Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited
• protocol (string, required): The protocol for this provider. Must be http for this provider type
• host (string, required): The host to pull data from (e.g. nasa.gov)
• username (string, optional): Configured username for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication
• password (string, required only if username is specified): Configured password for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication
• port (integer, optional): Port to connect to the provider on. Defaults to 80
• allowedRedirects (string[], optional): Only hosts in this list will have the provider username/password forwarded for authentication. Entries should be specified as host.com or host.com:7000 if the redirect port is different than the provider port.
• certificateUri (string, optional): SSL Certificate S3 URI for custom or self-signed SSL (TLS) certificate

https

• id (string, required): Unique identifier for the provider
• globalConnectionLimit (integer, optional): Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited
• protocol (string, required): The protocol for this provider. Must be https for this provider type
• host (string, required): The host to pull data from (e.g. nasa.gov)
• username (string, optional): Configured username for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication
• password (string, required only if username is specified): Configured password for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication
• port (integer, optional): Port to connect to the provider on. Defaults to 443
• allowedRedirects (string[], optional): Only hosts in this list will have the provider username/password forwarded for authentication. Entries should be specified as host.com or host.com:7000 if the redirect port is different than the provider port.
• certificateUri (string, optional): SSL Certificate S3 URI for custom or self-signed SSL (TLS) certificate

ftp

• id (string, required): Unique identifier for the provider
• globalConnectionLimit (integer, optional): Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited
• protocol (string, required): The protocol for this provider. Must be ftp for this provider type
• host (string, required): The ftp host to pull data from (e.g. nasa.gov)
• username (string, optional): Username to use to connect to the ftp server. Cumulus encrypts this using KMS. Defaults to anonymous if not defined
• password (string, optional): Password to use to connect to the ftp server. Cumulus encrypts this using KMS. Defaults to password if not defined
• port (integer, optional): Port to connect to the provider on. Defaults to 21

sftp

• id (string, required): Unique identifier for the provider
• globalConnectionLimit (integer, optional): Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited
• protocol (string, required): The protocol for this provider. Must be sftp for this provider type
• host (string, required): The sftp host to pull data from (e.g. nasa.gov)
• username (string, optional): Username to use to connect to the sftp server.
• password (string, optional): Password to use to connect to the sftp server.
• port (integer, optional): Port to connect to the provider on. Defaults to 22
• privateKey (string, optional): Filename of a private key, assumed to be in s3://bucketInternal/stackName/crypto
• cmKeyId (string, optional): AWS KMS Customer Master Key arn or alias
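
Putting the S3 fields above together, a minimal S3 provider record might look like the following (the id and host values are examples):

{
  "id": "MODAPS_S3_provider",
  "protocol": "s3",
  "host": "sample-staging-bucket",
  "globalConnectionLimit": 10
}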

    Collections

Break down of s3_MOD09GQ_006.json (https://github.com/nasa/cumulus/blob/master/example/data/collections/s3_MOD09GQ_006/s3_MOD09GQ_006.json)

• name ("MOD09GQ", required): The name attribute designates the name of the collection. This is the name under which the collection will be displayed on the dashboard
• version ("006", required): A version tag for the collection
• granuleId ("^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$", required): The regular expression used to validate the granule ID extracted from filenames according to the granuleIdExtraction
• granuleIdExtraction ("(MOD09GQ\..*)(\.hdf|\.cmr|_ndvi\.jpg)", required): The regular expression used to extract the granule ID from filenames. The first capturing group extracted from the filename by the regex will be used as the granule ID.
• sampleFileName ("MOD09GQ.A2017025.h21v00.006.2017034065104.hdf", required): An example filename belonging to this collection
• files (<JSON Object> of files defined here, required): Describe the individual files that will exist for each granule in this collection (size, browse, meta, etc.)
• dataType ("MOD09GQ", optional): Can be specified, but this value will default to the collection_name if not
• duplicateHandling ("replace", optional): ("replace"|"version"|"skip") determines granule duplicate handling scheme
• ignoreFilesConfigForDiscovery (false by default, optional): By default, during discovery only files that match one of the regular expressions in this collection's files attribute (see above) are ingested. Setting this to true will ignore the files attribute during discovery, meaning that all files for a granule (i.e., all files with filenames matching granuleIdExtraction) will be ingested even when they don't match a regular expression in the files attribute at discovery time. (NOTE: this attribute does not appear in the example file, but is listed here for completeness.)
• process ("modis", optional): Example options for this are found in the ChooseProcess step definition in the IngestAndPublish workflow definition
• meta (<JSON Object> of MetaData for the collection, optional): MetaData for the collection. This metadata will be available to workflows for this collection via the Cumulus Message Adapter.
• url_path ("{cmrMetadata.Granule.Collection.ShortName}/{substring(file.fileName, 0, 3)}", optional): Filename without extension

    files-object

• regex ("^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\.hdf$", required): Regular expression used to identify the file
• sampleFileName ("MOD09GQ.A2017025.h21v00.006.2017034065104.hdf", required): Filename used to validate the provided regex
• type ("data", optional): Value to be assigned to the Granule File Type. CNM types are used by Cumulus CMR steps, non-CNM values will be treated as 'data' type. Currently only utilized in DiscoverGranules task
• bucket ("internal", required): Name of the bucket where the file will be stored
• url_path ("${collectionShortName}/{substring(file.fileName, 0, 3)}", optional): Folder used to save the granule in the bucket. Defaults to the collection url_path
• checksumFor ("^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\.hdf$", optional): If this is a checksum file, set checksumFor to the regex of the target file.

    Rules

Rules are used to start processing workflows and the transformation process. Rules can be invoked manually, based on a schedule, or can be configured to be triggered by either events in Kinesis, SNS messages, or SQS messages.

Rule configuration

• name ("L2_HR_PIXC_kinesisRule", required): Name of the rule. This is the name under which the rule will be listed on the dashboard
• workflow ("CNMExampleWorkflow", required): Name of the workflow to be run. A list of available workflows can be found on the Workflows page
• provider ("PODAAC_SWOT", optional): Configured provider's ID. This can be found on the Providers dashboard page
• collection (<JSON Object> collection object shown below, required): Name and version of the collection this rule will moderate. Relates to a collection configured and found in the Collections page
• payload (<JSON Object or Array>, optional): The payload to be passed to the workflow
• meta (<JSON Object> of MetaData for the rule, optional): MetaData for the rule. This metadata will be available to workflows for this rule via the Cumulus Message Adapter.
• rule (<JSON Object> rule type and associated values - discussed below, required): Object defining the type and subsequent attributes of the rule
• state ("ENABLED", optional): ("ENABLED"|"DISABLED") whether or not the rule will be active. Defaults to "ENABLED".
• queueUrl (https://sqs.us-east-1.amazonaws.com/1234567890/queue-name, optional): URL for SQS queue that will be used to schedule workflows for this rule
• tags (["kinesis", "podaac"], optional): An array of strings that can be used to simplify search

    collection-object

• name ("L2_HR_PIXC", required): Name of a collection defined/configured in the Collections dashboard page
• version ("000", required): Version number of a collection defined/configured in the Collections dashboard page

    meta-object

• retries (3, optional): Number of retries on errors, for sqs-type rule only. Defaults to 3.
• visibilityTimeout (900, optional): VisibilityTimeout in seconds for the inflight messages, for sqs-type rule only. Defaults to the visibility timeout of the SQS queue when the rule is created.

    rule-object

• type ("kinesis", required): ("onetime"|"scheduled"|"kinesis"|"sns"|"sqs") type of scheduling/workflow kick-off desired
• value (<String> Object, depends): Discussion of valid values is below

    rule-value

The rule value entry depends on the type of rule:

    • If this is a onetime rule this can be left blank. Example
    • If this is a scheduled rule this field must hold a valid cron-type expression or rate expression.
    • If this is a kinesis rule, this must be a configured ${Kinesis_stream_ARN}. Example
    • If this is an sns rule, this must be an existing ${SNS_Topic_Arn}. Example
    • If this is an sqs rule, this must be an existing ${SQS_QueueUrl} that your account has permissions to access, and also you must configure a dead-letter queue for this SQS queue. Example
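
Putting these fields together, a kinesis rule using the example values from the tables above might look like the following (the stream ARN is a placeholder):

{
  "name": "L2_HR_PIXC_kinesisRule",
  "workflow": "CNMExampleWorkflow",
  "provider": "PODAAC_SWOT",
  "collection": {
    "name": "L2_HR_PIXC",
    "version": "000"
  },
  "rule": {
    "type": "kinesis",
    "value": "arn:aws:kinesis:us-east-1:000000000000:stream/exampleStream"
  },
  "state": "ENABLED",
  "tags": ["kinesis", "podaac"]
}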

    sqs-type rule features

    • When an SQS rule is triggered, the SQS message remains on the queue.
    • The SQS message is not processed multiple times in parallel when visibility timeout is properly set. You should set the visibility timeout to the maximum expected length of the workflow with padding. Longer is better to avoid parallel processing.
    • The SQS message visibility timeout can be overridden by the rule.
    • Upon successful workflow execution, the SQS message is removed from the queue.
• Upon failed execution(s), the workflow is re-run 3 times (the default), or the configured number of times.
    • Upon failed execution(s), the visibility timeout will be set to 5s to allow retries.
    • After configured number of failed retries, the SQS message is moved to the dead-letter queue configured for the SQS queue.
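
A minimal sqs rule sketch using the meta options above might look like the following (the queue URL is the example value from the rule configuration table, and the queue must have a dead-letter queue configured):

{
  "name": "example_sqs_rule",
  "workflow": "CNMExampleWorkflow",
  "collection": {
    "name": "L2_HR_PIXC",
    "version": "000"
  },
  "meta": {
    "retries": 1,
    "visibilityTimeout": 3600
  },
  "rule": {
    "type": "sqs",
    "value": "https://sqs.us-east-1.amazonaws.com/1234567890/queue-name"
  },
  "state": "ENABLED"
}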

    Configuration Via Cumulus Dashboard

    Create A Provider

    • In the Cumulus dashboard, go to the Provider page.

    Screenshot of Create Provider form

    • Click on Add Provider.
    • Fill in the form and then submit it.

    Screenshot of Create Provider form

    Create A Collection

    • Go to the Collections page.

    Screenshot of the Collections page

    • Click on Add Collection.
    • Copy and paste or fill in the collection JSON object form.

    Screenshot of Add Collection form

    • Once you submit the form, you should be able to verify that your new collection is in the list.

    Create A Rule

    1. Go To Rules Page
    • Go to the Cumulus dashboard, click on Rules in the navigation.
    • Click Add Rule.

    Screenshot of Rules page

2. Complete Form
    • Fill out the template form.

    Screenshot of a Rules template for adding a new rule

    For more details regarding the field definitions and required information go to Data Cookbooks.

    Note: If the state field is left blank, it defaults to false.

    Rule Examples

    • A rule form with completed required fields:

    Screenshot of a completed rule form

    • A successfully added Rule:

    Screenshot of created rule

    - + \ No newline at end of file diff --git a/docs/v13.4.0/configuration/lifecycle-policies/index.html b/docs/v13.4.0/configuration/lifecycle-policies/index.html index 16083ab2435..4b0a3563c8f 100644 --- a/docs/v13.4.0/configuration/lifecycle-policies/index.html +++ b/docs/v13.4.0/configuration/lifecycle-policies/index.html @@ -5,13 +5,13 @@ Setting S3 Lifecycle Policies | Cumulus Documentation - +
    Version: v13.4.0

    Setting S3 Lifecycle Policies

    This document will outline, in brief, how to set data lifecycle policies so that you are more easily able to control data storage costs while keeping your data accessible. For more information on why you might want to do this, see the 'Additional Information' section at the end of the document.

    Requirements

    • The AWS CLI installed and configured (if you wish to run the CLI example). See AWS's guide to setting up the AWS CLI for more on this. Please ensure the AWS CLI is in your shell path.
• You will need an S3 bucket on AWS. You are strongly encouraged to use a bucket without voluminous amounts of data in it for experimenting/learning.
    • An AWS user with the appropriate roles to access the target bucket as well as modify bucket policies.

    Examples

    Walk-through on setting time-based S3 Infrequent Access (S3IA) bucket policy

    This example will give step-by-step instructions on updating a bucket's lifecycle policy to move all objects in the bucket from the default storage to S3 Infrequent Access (S3IA) after a period of 90 days. Below are instructions for walking through configuration via the command line and the management console.

    Command Line

    Please ensure you have the AWS CLI installed and configured for access prior to attempting this example.

    Create policy

From any directory you choose, open an editor and add the following to a file named exampleRule.json

{
  "Rules": [
    {
      "Status": "Enabled",
      "Filter": {
        "Prefix": ""
      },
      "Transitions": [
        {
          "Days": 90,
          "StorageClass": "STANDARD_IA"
        }
      ],
      "NoncurrentVersionTransitions": [
        {
          "NoncurrentDays": 90,
          "StorageClass": "STANDARD_IA"
        }
      ],
      "ID": "90DayS3IAExample"
    }
  ]
}

    Set policy

    On the command line run the following command (with the bucket you're working with substituted in place of yourBucketNameHere).

    aws s3api put-bucket-lifecycle-configuration --bucket yourBucketNameHere --lifecycle-configuration file://exampleRule.json

    Verify policy has been set

    To obtain all of the existing policies for a bucket, run the following command (again substituting the correct bucket name):

$ aws s3api get-bucket-lifecycle-configuration --bucket yourBucketNameHere
{
  "Rules": [
    {
      "Status": "Enabled",
      "Filter": {
        "Prefix": ""
      },
      "Transitions": [
        {
          "Days": 90,
          "StorageClass": "STANDARD_IA"
        }
      ],
      "NoncurrentVersionTransitions": [
        {
          "NoncurrentDays": 90,
          "StorageClass": "STANDARD_IA"
        }
      ],
      "ID": "90DayS3IAExample"
    }
  ]
}

    You have set a policy that transitions any version of an object in the bucket to S3IA after each object version has not been modified for 90 days.

    Management Console

    Create Policy

    To create the example policy on a bucket via the management console, go to the following URL (replacing 'yourBucketHere' with the bucket you intend to update):

    https://s3.console.aws.amazon.com/s3/buckets/yourBucketHere/?tab=overview

    You should see a screen similar to:

    Screenshot of AWS console for an S3 bucket

    Click the "Management" Tab, then lifecycle button and press + Add lifecycle rule:

    Screenshot of &quot;Management&quot; tab of AWS console for an S3 bucket

    Give the rule a name (e.g. '90DayRule'), leaving the filter blank:

    Screenshot of window for configuring the name and scope of a lifecycle rule on an S3 bucket in the AWS console

    Click next, and mark Current Version and Previous Versions.

Then for each, click + Add transition and select Transition to Standard-IA after for the Object creation field, and set 90 for the Days after creation/Days after objects become noncurrent field. Your screen should look similar to:

    Screenshot of window for configuring the storage class transitions of a lifecycle rule on an S3 bucket in the AWS console

    Click next, then next past the Configure expiration screen (we won't be setting this), and on the fourth page, click Save:

    Screenshot of window for reviewing the configuration of a lifecycle rule on an S3 bucket in the AWS console

    You should now see you have a rule configured for your bucket:

    Screenshot of lifecycle rule appearing in the &quot;Management&quot; tab of AWS console for an S3 bucket

    You have now set a policy that transitions any version of an object in the bucket to S3IA after each object has not been modified for 90 days.

    Additional Information

    This section lists information you may want prior to enacting lifecycle policies. It is not required content for working through the examples.

    Strategy Overview

    For a discussion of overall recommended strategy, please review the Methodology for Data Lifecycle Management on the EarthData wiki.

    AWS Documentation

The examples shown in this document are fairly basic cases. By using object tags, filters, and other configuration options you can enact far more complicated policies for various scenarios. For more reading on the topics presented on this page see:

    - + \ No newline at end of file diff --git a/docs/v13.4.0/configuration/monitoring-readme/index.html b/docs/v13.4.0/configuration/monitoring-readme/index.html index f44e6e05c08..35a0cbafba5 100644 --- a/docs/v13.4.0/configuration/monitoring-readme/index.html +++ b/docs/v13.4.0/configuration/monitoring-readme/index.html @@ -5,14 +5,14 @@ Monitoring Best Practices | Cumulus Documentation - +
    Version: v13.4.0

    Monitoring Best Practices

    This document intends to provide a set of recommendations and best practices for monitoring the state of a deployed Cumulus and diagnosing any issues.

    Cumulus-provided resources and integrations for monitoring

Cumulus provides a number of resources that are useful for monitoring the system and its operation.

    Cumulus Dashboard

    The primary tool for monitoring the Cumulus system is the Cumulus Dashboard. The dashboard is hosted on Github and includes instructions on how to deploy and link it into your core Cumulus deployment.

    The dashboard displays workflow executions, their status, inputs, outputs, and some diagnostic information such as logs. For further information on the dashboard, its usage, and the information it provides, see the documentation.

    Cumulus-provided AWS resources

    Cumulus sets up CloudWatch log groups for all Core-provided tasks.

    Monitoring Lambda Functions

    Logging for each Lambda Function is available in Lambda-specific CloudWatch log groups.

    Monitoring ECS services

    Each deployed cumulus_ecs_service module also includes a CloudWatch log group for the processes running on ECS.

    Monitoring workflows

For advanced debugging, we also configure dead letter queues on critical system functions. These will allow you to monitor and debug invalid inputs to the functions we use to start workflows, which can be helpful if you find that you are not seeing workflows being started as expected. More information on these can be found in the dead letter queue documentation.

    AWS recommendations

    AWS has a number of recommendations on system monitoring. Rather than reproduce those here and risk providing outdated guidance, we've documented the following links which will take you to available AWS docs on monitoring recommendations and best practices for the services used in Cumulus:

    Example: Setting up email notifications for CloudWatch logs

    Cumulus does not provide out-of-the-box support for email notifications at this time. However, setting up email notifications on AWS is fairly straightforward in that the operative components are an AWS SNS topic and a subscribed email address.

    In terms of Cumulus integration, forwarding CloudWatch logs requires creating a mechanism, most likely a Lambda Function subscribed to the log group that will receive, filter and forward these messages to the SNS topic.

    As a very simple example, we could create a function that filters CloudWatch logs created by the @cumulus/logger package and sends email notifications for error and fatal log levels, adapting the example linked above:

const zlib = require('zlib');
const aws = require('aws-sdk');
const { promisify } = require('util');

const gunzip = promisify(zlib.gunzip);
const sns = new aws.SNS();

exports.handler = async (event) => {
  const payload = Buffer.from(event.awslogs.data, 'base64');
  const decompressedData = await gunzip(payload);
  const logData = JSON.parse(decompressedData.toString('ascii'));
  return await Promise.all(logData.logEvents.map(async (logEvent) => {
    const logMessage = JSON.parse(logEvent.message);
    if (['error', 'fatal'].includes(logMessage.level)) {
      return sns.publish({
        TopicArn: process.env.EmailReportingTopicArn,
        Message: logEvent.message
      }).promise();
    }
    return Promise.resolve();
  }));
};

After creating the SNS topic, we can deploy this code as a lambda function, following the setup steps from Amazon. Make sure to include your SNS topic ARN as an environment variable on the lambda function by using the --environment option on aws lambda create-function.
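
A sketch of such a deployment command might look like the following (the function name, role ARN, runtime, zip file, and topic ARN are all placeholders):

aws lambda create-function \
  --function-name cloudwatch-error-emailer \
  --runtime nodejs16.x \
  --handler index.handler \
  --zip-file fileb://function.zip \
  --role arn:aws:iam::123456789012:role/example-lambda-role \
  --environment "Variables={EmailReportingTopicArn=arn:aws:sns:us-east-1:123456789012:email-reporting}"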

    You will need to create subscription filters for each log group you want to receive emails for. We recommend automating this as much as possible, and you could very well handle this via Terraform, such as using a module to deploy filters alongside log groups, or exporting the log group names to an all-in-one email notification module.
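
As a minimal Terraform sketch (resource and log group names are hypothetical, and you will also need an aws_lambda_permission granting CloudWatch Logs permission to invoke the function), a subscription filter might look like:

resource "aws_cloudwatch_log_subscription_filter" "email_error_logs" {
  name            = "email-error-logs"
  log_group_name  = "/aws/lambda/KinesisInboundLogger"
  filter_pattern  = ""
  destination_arn = aws_lambda_function.cloudwatch_error_emailer.arn
}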

    - + \ No newline at end of file diff --git a/docs/v13.4.0/configuration/server_access_logging/index.html b/docs/v13.4.0/configuration/server_access_logging/index.html index cad1b02d11c..f67d3f8d707 100644 --- a/docs/v13.4.0/configuration/server_access_logging/index.html +++ b/docs/v13.4.0/configuration/server_access_logging/index.html @@ -5,13 +5,13 @@ S3 Server Access Logging | Cumulus Documentation - +
    Version: v13.4.0

    S3 Server Access Logging

    Via AWS Console

    Enable server access logging for an S3 bucket

    Via AWS Command Line Interface

    1. Create a logging.json file with these contents, replacing <stack-internal-bucket> with your stack's internal bucket name, and <stack> with the name of your cumulus stack.

  {
    "LoggingEnabled": {
      "TargetBucket": "<stack-internal-bucket>",
      "TargetPrefix": "<stack>/ems-distribution/s3-server-access-logs/"
    }
  }
    2. Add the logging policy to each of your protected and public buckets by calling this command on each bucket.

      aws s3api put-bucket-logging --bucket <protected/public-bucket-name> --bucket-logging-status file://logging.json
    3. Verify the logging policy exists on your buckets.

      aws s3api get-bucket-logging --bucket <protected/public-bucket-name>
    - + \ No newline at end of file diff --git a/docs/v13.4.0/configuration/task-configuration/index.html b/docs/v13.4.0/configuration/task-configuration/index.html index 154bc6b8630..a6ef7a8e4e7 100644 --- a/docs/v13.4.0/configuration/task-configuration/index.html +++ b/docs/v13.4.0/configuration/task-configuration/index.html @@ -5,13 +5,13 @@ Configuration of Tasks | Cumulus Documentation - +
    Version: v13.4.0

    Configuration of Tasks

    The cumulus module exposes values for configuration for some of the provided archive and ingest tasks. Currently the following are available as configurable variables:

    cmr_search_client_config

    Configuration parameters for CMR search client for cumulus archive module tasks in the form:

<lambda_identifier>_report_cmr_limit = <maximum number of records that can be returned from a cmr-client search; this should be greater than cmr_page_size>
    <lambda_identifier>_report_cmr_page_size = <number of records for each page returned from CMR>
    type = map(string)

More information about the cmr limit and cmr page_size can be found in @cumulus/cmr-client and the CMR Search API documentation.

    Currently the following values are supported:

    • create_reconciliation_report_cmr_limit
    • create_reconciliation_report_cmr_page_size

    Example

    cmr_search_client_config = {
    create_reconciliation_report_cmr_limit = 2500
    create_reconciliation_report_cmr_page_size = 250
    }

    elasticsearch_client_config

    Configuration parameters for Elasticsearch client for cumulus archive module tasks in the form:

    <lambda_identifier>_es_scroll_duration = <duration>
    <lambda_identifier>_es_scroll_size = <size>
    type = map(string)

    Currently the following values are supported:

    • create_reconciliation_report_es_scroll_duration
    • create_reconciliation_report_es_scroll_size

    Example

    elasticsearch_client_config = {
    create_reconciliation_report_es_scroll_duration = "15m"
    create_reconciliation_report_es_scroll_size = 2000
    }

    lambda_timeouts

    A configurable map of timeouts (in seconds) for cumulus ingest module task lambdas in the form:

    <lambda_identifier>_timeout: <timeout>
    type = map(string)

    Currently the following values are supported:

    • discover_granules_task_timeout
    • discover_pdrs_task_timeout
    • fake_processing_task_timeout
    • files_to_granules_task_timeout
    • hello_world_task_timeout
    • hyrax_metadata_update_tasks_timeout
    • lzards_backup_task_timeout
    • move_granules_task_timeout
    • parse_pdr_task_timeout
    • pdr_status_check_task_timeout
    • post_to_cmr_task_timeout
    • queue_granules_task_timeout
    • queue_pdrs_task_timeout
    • queue_workflow_task_timeout
    • sf_sqs_report_task_timeout
    • sync_granule_task_timeout
    • update_granules_cmr_metadata_file_links_task_timeout

    Example

    lambda_timeouts = {
    discover_granules_task_timeout = 300
    }

    lambda_memory_sizes

    A configurable map of memory sizes (in MBs) for cumulus ingest module task lambdas in the form:

    <lambda_identifier>_memory_size: <memory_size>
    type = map(string)

    Currently the following values are supported:

    • add_missing_file_checksums_task_memory_size
    • discover_granules_task_memory_size
    • discover_pdrs_task_memory_size
    • fake_processing_task_memory_size
    • hyrax_metadata_updates_task_memory_size
    • lzards_backup_task_memory_size
    • move_granules_task_memory_size
    • parse_pdr_task_memory_size
    • pdr_status_check_task_memory_size
    • post_to_cmr_task_memory_size
    • queue_granules_task_memory_size
    • queue_pdrs_task_memory_size
    • queue_workflow_task_memory_size
    • sf_sqs_report_task_memory_size
    • sync_granule_task_memory_size
    • update_cmr_acess_constraints_task_memory_size
    • update_granules_cmr_metadata_file_links_task_memory_size

    Example

    lambda_memory_sizes = {
    queue_granules_task_memory_size = 1036
    }
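
These maps are set on the cumulus module in your deployment's Terraform configuration. A combined sketch (the module source and the other required variables are omitted) might look like:

module "cumulus" {
  # ... source and other required variables ...

  cmr_search_client_config = {
    create_reconciliation_report_cmr_limit     = 2500
    create_reconciliation_report_cmr_page_size = 250
  }

  lambda_timeouts = {
    discover_granules_task_timeout = 300
  }

  lambda_memory_sizes = {
    queue_granules_task_memory_size = 1036
  }
}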
    - + \ No newline at end of file diff --git a/docs/v13.4.0/data-cookbooks/about-cookbooks/index.html b/docs/v13.4.0/data-cookbooks/about-cookbooks/index.html index c0dcc3cd624..b2ee5a845ca 100644 --- a/docs/v13.4.0/data-cookbooks/about-cookbooks/index.html +++ b/docs/v13.4.0/data-cookbooks/about-cookbooks/index.html @@ -5,13 +5,13 @@ About Cookbooks | Cumulus Documentation - +
    Version: v13.4.0

    About Cookbooks

    Introduction

The following data cookbooks are documents containing examples and explanations of workflows in the Cumulus framework. Additionally, they should serve to help unify an institution/user group on a set of terms.

    Setup

    The data cookbooks assume you can configure providers, collections, and rules to run workflows. Visit Cumulus data management types for information on how to configure Cumulus data management types.

    Adding a page

    As shown in detail in the "Add a New Page and Sidebars" section in Cumulus Docs: How To's, you can add a new page to the data cookbook by creating a markdown (.md) file in the docs/data-cookbooks directory. The new page can then be linked to the sidebar by adding it to the Data-Cookbooks object in the website/sidebar.json file as data-cookbooks/${id}.

    More about workflows

    Workflow general information

    Input & Output

    Developing Workflow Tasks

    Workflow Configuration How-to's

    - + \ No newline at end of file diff --git a/docs/v13.4.0/data-cookbooks/browse-generation/index.html b/docs/v13.4.0/data-cookbooks/browse-generation/index.html index b647f4e185c..5662232bcc5 100644 --- a/docs/v13.4.0/data-cookbooks/browse-generation/index.html +++ b/docs/v13.4.0/data-cookbooks/browse-generation/index.html @@ -5,7 +5,7 @@ Ingest Browse Generation | Cumulus Documentation - + @@ -15,7 +15,7 @@ provider keys with the previously entered values) Note that you need to set the "provider_path" to the path on your bucket (e.g. "/data") that you've staged your mock/test data.:

{
  "name": "TestBrowseGeneration",
  "workflow": "DiscoverGranulesBrowseExample",
  "provider": "{{provider_from_previous_step}}",
  "collection": {
    "name": "MOD09GQ",
    "version": "006"
  },
  "meta": {
    "provider_path": "{{path_to_data}}"
  },
  "rule": {
    "type": "onetime"
  },
  "state": "ENABLED",
  "updatedAt": 1553053438767
}

    Run Workflows

    Once you've configured the Collection and Provider and added a onetime rule, you're ready to trigger your rule, and watch the ingest workflows process.

    Go to the Rules tab, click the rule you just created:

    Screenshot of the Rules overview page with a list of rules in the Cumulus dashboard

    Then click the gear in the upper right corner and click "Rerun":

    Screenshot of clicking the button to rerun a workflow rule from the rule edit page in the Cumulus dashboard

    Tab over to executions and you should see the DiscoverGranulesBrowseExample workflow run, succeed, and then moments later the CookbookBrowseExample should run and succeed.

    Screenshot of page listing executions in the Cumulus dashboard

    Results

    You can verify your data has ingested by clicking the successful workflow entry:

    Screenshot of individual entry from table listing executions in the Cumulus dashboard

    Select "Show Output" on the next page

    Screenshot of &quot;Show output&quot; button from individual execution page in the Cumulus dashboard

    and you should see in the payload from the workflow something similar to:

    "payload": {
    "process": "modis",
    "granules": [
    {
    "files": [
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "key": "MOD09GQ___006/2017/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "type": "data",
    "bucket": "cumulus-test-sandbox-protected",
    "path": "data",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}/{substring(file.fileName, 0, 3)}",
    "size": 1908635
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "key": "MOD09GQ___006/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "type": "metadata",
    "bucket": "cumulus-test-sandbox-private",
    "path": "data",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{substring(file.fileName, 0, 3)}",
    "size": 21708
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
    "key": "MOD09GQ___006/2017/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
    "type": "browse",
    "bucket": "cumulus-test-sandbox-protected",
    "path": "data",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}/{substring(file.fileName, 0, 3)}",
    "size": 1908635
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.cmr.xml",
    "key": "MOD09GQ___006/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.cmr.xml",
    "type": "metadata",
    "bucket": "cumulus-test-sandbox-protected-2",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{substring(file.fileName, 0, 3)}"
    }
    ],
    "cmrLink": "https://cmr.uat.earthdata.nasa.gov/search/granules.json?concept_id=G1222231611-CUMULUS",
    "cmrConceptId": "G1222231611-CUMULUS",
    "granuleId": "MOD09GQ.A2016358.h13v04.006.2016360104606",
    "cmrMetadataFormat": "echo10",
    "dataType": "MOD09GQ",
    "version": "006",
    "published": true
    }
    ]
    }

You can verify the granules exist within your cumulus instance (search using the Granules interface, check the S3 buckets, etc.) and validate the CMR entry shown above.


    Build Processing Lambda

    This section discusses the construction of a custom processing lambda to replace the contrived example from this entry for a real dataset processing task.

    To ingest your own data using this example, you will need to construct your own lambda to replace the source in ProcessingStep that will generate browse imagery and provide or update a CMR metadata export file.

You will then need to add the lambda to your Cumulus deployment as an aws_lambda_function Terraform resource.

    The discussion below outlines requirements for this lambda.

    Inputs

    The incoming message to the task defined in the ProcessingStep as configured will have the following configuration values (accessible inside event.config courtesy of the message adapter):

    Configuration

    • event.config.bucket -- the name of the bucket configured in terraform.tfvars as your internal bucket.

    • event.config.collection -- The full collection object we will configure in the Configure Ingest section. You can view the expected collection schema in the docs here or in the source code on github. You need this as available input and output so you can update as needed.

    event.config.additionalUrls, generateFakeBrowse and event.config.cmrMetadataFormat from the example can be ignored as they're configuration flags for the provided example script.

    Payload

    The 'payload' from the previous task is accessible via event.input. The expected payload output schema from SyncGranules can be viewed here.

    In our example, the payload would look like the following. Note: The types are set per-file based on what we configured in our collection, and were initially added as part of the DiscoverGranules step in the DiscoverGranulesBrowseExample workflow.

     "payload": {
    "process": "modis",
    "granules": [
    {
    "granuleId": "MOD09GQ.A2016358.h13v04.006.2016360104606",
    "dataType": "MOD09GQ",
    "version": "006",
    "files": [
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "bucket": "cumulus-test-sandbox-internal",
    "key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "size": 1908635
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "bucket": "cumulus-test-sandbox-internal",
    "key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "size": 21708
    }
    ]
    }
    ]
    }

    Generating Browse Imagery

The provided example script used in the example goes through all granules and adds a 'fake' .jpg browse file to the same staging location as the data staged by prior ingest tasks.

The processing lambda you construct will need to do the following (a minimal sketch follows this list):

• Create a browse image file based on the input data, and stage it to a location in an S3 bucket accessible to both this task and the FilesToGranules and MoveGranules tasks.
    • Add the browse file to the input granule files, making sure to set the granule file's type to browse.
    • Update meta.input_granules with the updated granules list, as well as provide the files to be integrated by FilesToGranules as output from the task.
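
A highly simplified skeleton of such a processing lambda is sketched below. It assumes the Cumulus Message Adapter has already populated event.config and event.input as described in this section; generateBrowse is a hypothetical stand-in for real browse-image generation and staging, and the output shape follows the "files"/"granules" mapping discussed later in this entry:

'use strict';

// Hypothetical helper: generate/stage a browse image for a granule and return
// a granule-file record for it. A real implementation would write a .jpg to S3.
const generateBrowse = async (granule, stagingBucket) => ({
  fileName: `${granule.granuleId}.jpg`,
  bucket: stagingBucket,
  key: `file-staging/${granule.granuleId}.jpg`,
});

exports.handler = async (event) => {
  const granules = event.input.granules;
  const files = [];

  for (const granule of granules) {
    // Add the browse file to the granule's file list, setting type to 'browse'.
    const browseFile = await generateBrowse(granule, event.config.bucket);
    granule.files.push({ ...browseFile, type: 'browse' });

    // Collect s3 URIs of all files for the downstream FilesToGranules task.
    for (const f of granule.files) {
      files.push(`s3://${f.bucket}/${f.key}`);
    }
  }

  // 'files' is mapped to the payload and 'granules' to meta.input_granules by
  // the workflow's task configuration.
  return { files, granules };
};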

    Generating/updating CMR metadata

    If you do not already have a CMR file in the granules list, you will need to generate one for valid export. This example's processing script generates and adds it to the FilesToGranules file list via the payload but it can be present in the InputGranules from the DiscoverGranules task as well if you'd prefer to pre-generate it.

The downstream tasks MoveGranules, UpdateGranulesCmrMetadataFileLinks, and PostToCmr all expect a valid CMR file to be available if you want to export to CMR.

    Expected Outputs for processing task/tasks

    In the above example, the critical portion of the output to FilesToGranules is the payload and meta.input_granules.

In the example provided, the processing task is set up to return an object with the keys "files" and "granules". In the cumulus_message configuration, these outputs are mapped: files to the payload, and granules to meta.input_granules:

              "task_config": {
    "inputGranules": "{$.meta.input_granules}",
    "granuleIdExtraction": "{$.meta.collection.granuleIdExtraction}"
    }

    Their expected values from the example above may be useful in constructing a processing task:

    payload

The payload includes a full list of files to be 'moved' into the Cumulus archive. The FilesToGranules task will take this list, merge it with the information from InputGranules, then pass that list to the MoveGranules task. The MoveGranules task will then move the files to their targets. The UpdateGranulesCmrMetadataFileLinks task will update the CMR metadata file, if it exists, with the updated granule locations and update the CMR file etags.

    In the provided example, a payload being passed to the FilesToGranules task should be expected to look like:

      "payload": [
    "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
    "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.cmr.xml"
    ]

This is the list of files that FilesToGranules will act upon to add/merge with the input_granules object.

The paths shown are generated by SyncGranules, but in principle the files can be staged wherever you like, so long as the processing/MoveGranules task roles have access and the filenames match the collection configuration.

    input_granules

The FilesToGranules task utilizes the incoming payload to choose which files to move, but pulls all other metadata from meta.input_granules. As such, the meta.input_granules output in the example would look like:

    "input_granules": [
    {
    "granuleId": "MOD09GQ.A2016358.h13v04.006.2016360104606",
    "dataType": "MOD09GQ",
    "version": "006",
    "files": [
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "bucket": "cumulus-test-sandbox-internal",
    "key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "size": 1908635
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "bucket": "cumulus-test-sandbox-internal",
    "key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "size": 21708
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
    "bucket": "cumulus-test-sandbox-internal",
    "key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.jpg"
    }
    ]
    }
    ],
    Version: v13.4.0

    Choice States

    Cumulus supports AWS Step Function Choice states. A Choice state enables branching logic in Cumulus workflows.

    Choice state definitions include a list of Choice Rules. Each Choice Rule defines a logical operation which compares an input value against a value using a comparison operator. For available comparison operators, review the AWS docs.

    If the comparison evaluates to true, the Next state is followed.

    Example

    In examples/cumulus-tf/parse_pdr_workflow.tf the ParsePdr workflow uses a Choice state, CheckAgainChoice, to terminate the workflow once meta.isPdrFinished: true is returned by the CheckStatus state.

    The CheckAgainChoice state definition requires an input object of the following structure:

    {
    "meta": {
    "isPdrFinished": false
    }
    }

    Given the above input to the CheckAgainChoice state, the workflow would transition to the PdrStatusReport state.

    "CheckAgainChoice": {
    "Type": "Choice",
    "Choices": [
    {
    "Variable": "$.meta.isPdrFinished",
    "BooleanEquals": false,
    "Next": "PdrStatusReport"
    },
    {
    "Variable": "$.meta.isPdrFinished",
    "BooleanEquals": true,
    "Next": "WorkflowSucceeded"
    }
    ],
    "Default": "WorkflowSucceeded"
    }

    Advanced: Loops in Cumulus Workflows

    Understanding the complete ParsePdr workflow is not necessary to understanding how Choice states work, but ParsePdr provides an example of how Choice states can be used to create a loop in a Cumulus workflow.

In the complete ParsePdr workflow definition, the state QueueGranules is followed by CheckStatus. From CheckStatus a loop starts: as long as CheckStatus returns meta.isPdrFinished: false, CheckStatus is followed by CheckAgainChoice, then PdrStatusReport, then WaitForSomeTime, which returns to CheckStatus. Once CheckStatus returns meta.isPdrFinished: true, CheckAgainChoice proceeds to WorkflowSucceeded.

    Execution graph of SIPS ParsePdr workflow in AWS Step Functions console

    Further documentation

    For complete details on Choice state configuration options, see the Choice state documentation.

    Version: v13.4.0

    CNM Workflow

This entry documents how to set up a workflow that utilizes the built-in CNM/Kinesis functionality in Cumulus.

    Prior to working through this entry you should be familiar with the Cloud Notification Mechanism.

    Sections


    Prerequisites

    Cumulus

This entry assumes you have a deployed instance of Cumulus (version >= 1.16.0), deployed via the cumulus terraform module sourced from the release page.

    AWS CLI

    This entry assumes you have the AWS CLI installed and configured. If you do not, please take a moment to review the documentation - particularly the examples relevant to Kinesis - and install it now.

    Kinesis

This entry assumes you already have two Kinesis data streams created for use as CNM notification and response data streams.

If you do not have two streams set up, please take a moment to review the Kinesis documentation and set up two basic single-shard streams for this example:

    Using the "Create Data Stream" button on the Kinesis Dashboard, work through the dialogue.

    You should be able to quickly use the "Create Data Stream" button on the Kinesis Dashboard, and setup streams that are similar to the following example:

    Screenshot of AWS console page for creating a Kinesis stream
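If you prefer the AWS CLI over the console, a minimal sketch for creating the two single-shard streams could look like the following (the stream names are placeholders; use whatever names suit your deployment):

aws kinesis create-stream --stream-name <prefix>-cnm-notification --shard-count 1
aws kinesis create-stream --stream-name <prefix>-cnm-response --shard-count 1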

Please bear in mind that your {{prefix}}-lambda-processing IAM role will need permissions to write to the response stream for this workflow to succeed if you create the Kinesis stream with a dashboard user. If you are using the cumulus top-level module for your deployment, this should be set properly.

If not, the most straightforward approach is to attach the AmazonKinesisFullAccess policy for the stream resource to whatever role your Lambdas are using; however, your environment/security policies may require an approach specific to your deployment environment.
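As an illustrative sketch only (the role name is a placeholder, and your security policies may call for a narrower, stream-scoped policy instead of the managed policy), attaching the policy via the AWS CLI could look like:

aws iam attach-role-policy \
  --role-name <prefix>-lambda-processing \
  --policy-arn arn:aws:iam::aws:policy/AmazonKinesisFullAccess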

In operational environments, science data providers would typically be responsible for providing a Kinesis stream with the appropriate permissions.

    For more information on how this process works and how to develop a process that will add records to a stream, read the Kinesis documentation and the developer guide.

    Source Data

This entry will run the SyncGranule task against a single target data file. To that end, it will require a single data file to be present in an S3 bucket matching the Provider configured in the next section.
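For example, you could stage a test file with the AWS CLI; the bucket and key below are placeholders and should match the provider you configure in the next section:

aws s3 cp ./MOD09GQ.A2016358.h13v04.006.2016360104606.hdf \
  s3://<your-provider-bucket>/cumulus-test-data/pdrs/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf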

    Collection and Provider

    Cumulus will need to be configured with a Collection and Provider entry of your choosing. The provider should match the location of the source data from the Ingest Source Data section.

This can be done via the Cumulus Dashboard, if installed, or via the API. It is strongly recommended to use the dashboard if possible.


    Configure the Workflow

    Provided the prerequisites have been fulfilled, you can begin adding the needed values to your Cumulus configuration to configure the example workflow.

    The following are steps that are required to set up your Cumulus instance to run the example workflow:

    Example CNM Workflow

    In this example, we're going to trigger a workflow by creating a Kinesis rule and sending a record to a Kinesis stream.

    The following workflow definition should be added to a new .tf workflow resource (e.g. cnm_workflow.tf) in your deployment directory. For the complete CNM workflow example, see examples/cumulus-tf/cnm_workflow.tf.

    Add the following to the new terraform file in your deployment directory, updating the following:

    • Set the response-endpoint key in the CnmResponse task in the workflow JSON to match the name of the Kinesis response stream you configured in the prerequisites section
• Update the source key of the workflow module to match the Cumulus release associated with your deployment.
    module "cnm_workflow" {
    source = "https://github.com/nasa/cumulus/releases/download/{version}/terraform-aws-cumulus-workflow.zip"

    prefix = var.prefix
    name = "CNMExampleWorkflow"
    workflow_config = module.cumulus.workflow_config
    system_bucket = var.system_bucket

state_machine_definition = <<JSON
{
    "CNMExampleWorkflow": {
    "Comment": "CNMExampleWorkflow",
    "StartAt": "TranslateMessage",
    "States": {
    "TranslateMessage": {
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_to_cma_task.arn}",
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "collection": "{$.meta.collection}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.cnm}",
    "destination": "{$.meta.cnm}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "CnmResponse"
    }
    ],
    "Next": "SyncGranule"
    },
    "SyncGranule": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "provider": "{$.meta.provider}",
    "buckets": "{$.meta.buckets}",
    "collection": "{$.meta.collection}",
    "downloadBucket": "{$.meta.buckets.private.name}",
    "stack": "{$.meta.stack}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.granules}",
    "destination": "{$.meta.input_granules}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${module.cumulus.sync_granule_task.task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "IntervalSeconds": 10,
    "MaxAttempts": 3
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "CnmResponse"
    }
    ],
    "Next": "CnmResponse"
    },
    "CnmResponse": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "OriginalCNM": "{$.meta.cnm}",
    "distribution_endpoint": "{$.meta.distribution_endpoint}",
    "response-endpoint": "ADD YOUR RESPONSE STREAM NAME HERE",
    "region": "us-east-1",
    "type": "kinesis",
    "WorkflowException": "{$.exception}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.cnm}",
    "destination": "{$.meta.cnmResponse}"
    },
    {
    "source": "{$.input.input}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_response_task.arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "IntervalSeconds": 5,
    "MaxAttempts": 3
    }
    ],
    "End": true
    }
    }
    }
    }
JSON
}

Again, please make sure to modify the response-endpoint value to match the stream name (not the ARN) of your Kinesis response stream.

    Lambda Configuration

    To execute this workflow, you're required to include several Lambda resources in your deployment. To do this, add the following task (Lambda) definitions to your deployment along with the workflow you created above:

    Please note: To utilize these tasks you need to ensure you have a compatible CMA layer. See the deployment instructions for more details on how to deploy a CMA layer.

    Below is a description of each of these tasks:

    CNMToCMA

    CNMToCMA is meant for the beginning of a workflow: it maps CNM granule information to a payload for downstream tasks. For other CNM workflows, you would need to ensure that downstream tasks in your workflow either understand the CNM message or include a translation task like this one.

    You can also manipulate the data sent to downstream tasks using task_config for various states in your workflow resource configuration. Read more about how to configure data on the Workflow Input & Output page.

    CnmResponse

    The CnmResponse Lambda generates a CNM response message and puts it on the response-endpoint Kinesis stream.

    You can read more about the expected schema of a CnmResponse record in the Cloud Notification Mechanism schema repository.

    Additional Tasks

    Lastly, this entry also makes use of the SyncGranule task from the cumulus module.

    Redeploy

    Once the above configuration changes have been made, redeploy your stack.

    Please refer to Update Cumulus resources in the deployment documentation if you are unfamiliar with redeployment.

    Rule Configuration

    Cumulus includes a messageConsumer Lambda function (message-consumer). Cumulus kinesis-type rules create the event source mappings between Kinesis streams and the messageConsumer Lambda. The messageConsumer Lambda consumes records from one or more Kinesis streams, as defined by enabled kinesis-type rules. When new records are pushed to one of these streams, the messageConsumer triggers workflows associated with the enabled kinesis-type rules.

    To add a rule via the dashboard (if you'd like to use the API, see the docs here), navigate to the Rules page and click Add a rule, then configure the new rule using the following template (substituting correct values for parameters denoted by ${}):

    {
    "collection": {
    "name": "L2_HR_PIXC",
    "version": "000"
    },
    "name": "L2_HR_PIXC_kinesisRule",
    "provider": "PODAAC_SWOT",
    "rule": {
    "type": "kinesis",
    "value": "arn:aws:kinesis:{{awsRegion}}:{{awsAccountId}}:stream/{{streamName}}"
    },
    "state": "ENABLED",
    "workflow": "CNMExampleWorkflow"
    }

    Please Note:

• The rule's value attribute must match the Amazon Resource Name (ARN) for the Kinesis data stream you've preconfigured. You should be able to obtain this ARN from the Kinesis Dashboard entry for the selected stream.
    • The collection and provider should match the collection and provider you setup in the Prerequisites section.

Once you've clicked 'Submit', a new rule should appear in the dashboard's Rule Overview.


    Execute the Workflow

    Once Cumulus has been redeployed and a rule has been added, we're ready to trigger the workflow and watch it execute.

    How to Trigger the Workflow

    To trigger matching workflows, you will need to put a record on the Kinesis stream that the message-consumer Lambda will recognize as a matching event. Most importantly, it should include a collection name that matches a valid collection.

    For the purpose of this example, the easiest way to accomplish this is using the AWS CLI.

    Create Record JSON

    Construct a JSON file containing an object that matches the values that have been previously setup. This JSON object should be a valid Cloud Notification Mechanism message.

    Please note: this example is somewhat contrived, as the downstream tasks don't care about most of these fields. A 'real' data ingest workflow would.

    The following values (denoted by ${} in the sample below) should be replaced to match values we've previously configured:

    • TEST_DATA_FILE_NAME: The filename of the test data that is available in the S3 (or other) provider we created earlier.
    • TEST_DATA_URI: The full S3 path to the test data (e.g. s3://bucket-name/path/granule)
    • COLLECTION: The collection name defined in the prerequisites for this product
    {
    "product": {
    "files": [
    {
    "checksumType": "md5",
    "name": "${TEST_DATA_FILE_NAME}",
    "checksum": "bogus_checksum_value",
    "uri": "${TEST_DATA_URI}",
    "type": "data",
    "size": 12345678
    }
    ],
    "name": "${TEST_DATA_FILE_NAME}",
    "dataVersion": "006"
    },
    "identifier ": "testIdentifier123456",
    "collection": "${COLLECTION}",
    "provider": "TestProvider",
    "version": "001",
    "submissionTime": "2017-09-30T03:42:29.791198"
    }
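One way (among many) to fill in the ${} placeholders is with envsubst from GNU gettext, assuming it is installed and that you saved the template above as cnm-record-template.json; all names and values below are illustrative:

export TEST_DATA_FILE_NAME="MOD09GQ.A2016358.h13v04.006.2016360104606.hdf"
export TEST_DATA_URI="s3://<your-provider-bucket>/cumulus-test-data/pdrs/${TEST_DATA_FILE_NAME}"
export COLLECTION="<your-collection-name>"
envsubst < cnm-record-template.json > cnm-record.json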

    Add Record to Kinesis Data Stream

    Using the JSON file you created, push it to the Kinesis notification stream:

    aws kinesis put-record --stream-name YOUR_KINESIS_NOTIFICATION_STREAM_NAME_HERE --partition-key 1 --data file:///path/to/file.json

    Please note: The above command uses the stream name, not the ARN.

    The command should return output similar to:

    {
    "ShardId": "shardId-000000000000",
    "SequenceNumber": "42356659532578640215890215117033555573986830588739321858"
    }

    This command will put a record containing the JSON from the --data flag onto the Kinesis data stream. The messageConsumer Lambda will consume the record and construct a valid CMA payload to trigger workflows. For this example, the record will trigger the CNMExampleWorkflow workflow as defined by the rule previously configured.

    You can view the current running executions on the Executions dashboard page which presents a list of all executions, their status (running, failed, or completed), to which workflow the execution belongs, along with other information.

    Verify Workflow Execution

As detailed above, once the record is added to the Kinesis data stream, the messageConsumer Lambda will trigger the CNMExampleWorkflow.

    TranslateMessage

    TranslateMessage (which corresponds to the CNMToCMA Lambda) will take the CNM object payload and add a granules object to the CMA payload that's consistent with other Cumulus ingest tasks, and add a meta.cnm key (as well as the payload) to store the original message.

    For more on the Message Adapter, please see the Message Flow documentation.

    An example of what is happening in the CNMToCMA Lambda is as follows:

    Example Input Payload:

    "payload": {
    "identifier ": "testIdentifier123456",
    "product": {
    "files": [
    {
    "checksumType": "md5",
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "checksum": "bogus_checksum_value",
    "uri": "s3://some_bucket/cumulus-test-data/pdrs/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "type": "data",
    "size": 12345678
    }
    ],
    "name": "TestGranuleUR",
    "dataVersion": "006"
    },
    "version": "123456",
    "collection": "MOD09GQ",
    "provider": "TestProvider",
    "submissionTime": "2017-09-30T03:42:29.791198"
    }

    Example Output Payload:

      "payload": {
    "cnm": {
    "identifier ": "testIdentifier123456",
    "product": {
    "files": [
    {
    "checksumType": "md5",
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "checksum": "bogus_checksum_value",
    "uri": "s3://some-bucket/cumulus-test-data/data/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "type": "data",
    "size": 12345678
    }
    ],
    "name": "TestGranuleUR",
    "dataVersion": "006"
    },
    "version": "123456",
    "collection": "MOD09GQ",
    "provider": "TestProvider",
    "submissionTime": "2017-09-30T03:42:29.791198",
    "receivedTime": "2017-09-30T03:42:31.634552"
    },
    "output": {
    "granules": [
    {
    "granuleId": "TestGranuleUR",
    "files": [
    {
    "path": "some-bucket/data",
    "url_path": "s3://some-bucket/cumulus-test-data/data/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "bucket": "some-bucket",
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "size": 12345678
    }
    ]
    }
    ]
    }
    }

    SyncGranules

    This Lambda will take the files listed in the payload and move them to s3://{deployment-private-bucket}/file-staging/{deployment-name}/{COLLECTION}/{file_name}.

    CnmResponse

Assuming a successful execution of the workflow, this task will recover the meta.cnm key from the CMA output, and add a "SUCCESS" record to the response Kinesis stream.

    If a prior step in the workflow has failed, this will add a "FAILURE" record to the stream instead.

    The data written to the response-endpoint should adhere to the Response Message Fields schema.

    Example CNM Success Response:

    {
    "provider": "PODAAC_SWOT",
    "collection": "SWOT_Prod_l2:1",
    "processCompleteTime": "2017-09-30T03:45:29.791198",
    "submissionTime": "2017-09-30T03:42:29.791198",
    "receivedTime": "2017-09-30T03:42:31.634552",
    "identifier": "1234-abcd-efg0-9876",
    "response": {
    "status": "SUCCESS"
    }
    }

    Example CNM Error Response:

    {
    "provider": "PODAAC_SWOT",
    "collection": "SWOT_Prod_l2:1",
    "processCompleteTime": "2017-09-30T03:45:29.791198",
    "submissionTime": "2017-09-30T03:42:29.791198",
    "receivedTime": "2017-09-30T03:42:31.634552",
    "identifier": "1234-abcd-efg0-9876",
    "response": {
    "status": "FAILURE",
    "errorCode": "PROCESSING_ERROR",
    "errorMessage": "File [cumulus-dev-a4d38f59-5e57-590c-a2be-58640db02d91/prod_20170926T11:30:36/production_file.nc] did not match gve checksum value."
    }
    }

    Note the CnmResponse state defined in the .tf workflow definition above configures $.exception to be passed to the CnmResponse Lambda keyed under config.WorkflowException. This is required for the CnmResponse code to deliver a failure response.

    To test the failure scenario, send a record missing the product.name key.


    Verify results

    Check for successful execution on the dashboard

    Following the successful execution of this workflow, you should expect to see the workflow complete successfully on the dashboard:

    Screenshot of a successful CNM workflow appearing on the executions page of the Cumulus dashboard

    Check the test granule has been delivered to S3 staging

    The test granule identified in the Kinesis record should be moved to the deployment's private staging area.

    Check for Kinesis records

    A SUCCESS notification should be present on the response-endpoint Kinesis stream.

You should be able to validate that the notification and response streams have the expected records with the following steps (the AWS CLI Kinesis Basic Stream Operations documentation is useful to review before proceeding):

    Get a shard iterator (substituting your stream name as appropriate):

    aws kinesis get-shard-iterator \
    --shard-id shardId-000000000000 \
    --shard-iterator-type LATEST \
    --stream-name NOTIFICATION_OR_RESPONSE_STREAM_NAME

which should return output similar to:

    {
    "ShardIterator": "VeryLongString=="
    }
• Re-trigger the workflow by using the put-record command from the Execute the Workflow section above.
    • As the workflow completes, use the output from the get-shard-iterator command to request data from the stream:
    aws kinesis get-records --shard-iterator SHARD_ITERATOR_VALUE

    This should result in output similar to:

    {
    "Records": [
    {
    "SequenceNumber": "49586720336541656798369548102057798835250389930873978882",
    "ApproximateArrivalTimestamp": 1532664689.128,
    "Data": "eyJpZGVudGlmaWVyICI6InRlc3RJZGVudGlmaWVyMTIzNDU2IiwidmVyc2lvbiI6IjAwNiIsImNvbGxlY3Rpb24iOiJNT0QwOUdRIiwicHJvdmlkZXIiOiJUZXN0UHJvdmlkZXIiLCJwcm9kdWN0U2l6ZSI6MTkwODYzNS4wLCJyZXNwb25zZSI6eyJzdGF0dXMiOiJTVUNDRVNTIn0sInByb2Nlc3NDb21wbGV0ZVRpbWUiOiIyMDE4LTA3LTI3VDA0OjExOjI4LjkxOSJ9",
    "PartitionKey": "1"
    },
    {
    "SequenceNumber": "49586720336541656798369548102059007761070005796999266306",
    "ApproximateArrivalTimestamp": 1532664707.149,
    "Data": "eyJpZGVudGlmaWVyICI6InRlc3RJZGVudGlmaWVyMTIzNDU2IiwidmVyc2lvbiI6IjAwNiIsImNvbGxlY3Rpb24iOiJNT0QwOUdRIiwicHJvdmlkZXIiOiJUZXN0UHJvdmlkZXIiLCJwcm9kdWN0U2l6ZSI6MTkwODYzNS4wLCJyZXNwb25zZSI6eyJzdGF0dXMiOiJTVUNDRVNTIn0sInByb2Nlc3NDb21wbGV0ZVRpbWUiOiIyMDE4LTA3LTI3VDA0OjExOjQ2Ljk1OCJ9",
    "PartitionKey": "1"
    }
    ],
    "NextShardIterator": "AAAAAAAAAAFo9SkF8RzVYIEmIsTN+1PYuyRRdlj4Gmy3dBzsLEBxLo4OU+2Xj1AFYr8DVBodtAiXbs3KD7tGkOFsilD9R5tA+5w9SkGJZ+DRRXWWCywh+yDPVE0KtzeI0andAXDh9yTvs7fLfHH6R4MN9Gutb82k3lD8ugFUCeBVo0xwJULVqFZEFh3KXWruo6KOG79cz2EF7vFApx+skanQPveIMz/80V72KQvb6XNmg6WBhdjqAA==",
    "MillisBehindLatest": 0
    }

Note the Data field is base64-encoded and would need to be decoded and parsed to be interpretable. There are many options for building a Kinesis consumer, such as the KCL.
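For a quick look at the payloads, one option (assuming jq and a GNU-style base64 are available) is to pipe the get-records output through a decoder; this is a convenience sketch, not a substitute for a real consumer:

aws kinesis get-records --shard-iterator SHARD_ITERATOR_VALUE \
  | jq -r '.Records[].Data' \
  | base64 --decode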

For purposes of validating the workflow, it may be simpler to locate the workflow execution in the Step Functions Management Console and assert that the expected output is similar to the examples below.

    Successful CNM Response Object Example:

    {
    "cnmResponse": {
    "provider": "TestProvider",
    "collection": "MOD09GQ",
    "version": "123456",
    "processCompleteTime": "2017-09-30T03:45:29.791198",
    "submissionTime": "2017-09-30T03:42:29.791198",
    "receivedTime": "2017-09-30T03:42:31.634552",
    "identifier ": "testIdentifier123456",
    "response": {
    "status": "SUCCESS"
    }
    }
    }

    Kinesis Record Error Handling

    messageConsumer

    The default Kinesis stream processing in the Cumulus system is configured for record error tolerance.

    When the messageConsumer fails to process a record, the failure is captured and the record is published to the kinesisFallback SNS Topic. The kinesisFallback SNS topic broadcasts the record and a subscribed copy of the messageConsumer Lambda named kinesisFallback consumes these failures.

At this point, the normal Lambda asynchronous invocation retry behavior will attempt to process the record 3 more times. After this, if the record cannot successfully be processed, it is written to a dead letter queue. Cumulus' dead letter queue is an SQS Queue named kinesisFailure. Operators can use this queue to inspect failed records.

This system ensures that when the messageConsumer fails to process a record and trigger a workflow, the record is retried 3 times. This retry behavior improves system reliability in case of any external service failure outside of Cumulus control.

The Kinesis error handling system (the kinesisFallback SNS topic, messageConsumer Lambda, and kinesisFailure SQS queue) comes with the API package and does not need to be configured by the operator.

To examine records that could not be processed at any step, you need to look at the dead letter queue {{prefix}}-kinesisFailure. Check the Simple Queue Service (SQS) console, select your queue, and under the Queue Actions tab choose View/Delete Messages. Start polling for messages and you will see records that failed to process through the messageConsumer.
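The same inspection can be sketched with the AWS CLI (the queue name follows the prefix convention above; the queue URL is a placeholder returned by the first command):

aws sqs get-queue-url --queue-name <prefix>-kinesisFailure
aws sqs receive-message \
  --queue-url https://sqs.<region>.amazonaws.com/<account-id>/<prefix>-kinesisFailure \
  --max-number-of-messages 10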

    Note, these are only records that occurred when processing records from Kinesis streams. Workflow failures are handled differently.

    Kinesis Stream logging

    Notification Stream messages

    Cumulus includes two Lambdas (KinesisInboundEventLogger and KinesisOutboundEventLogger) that utilize the same code to take a Kinesis record event as input, deserialize the data field and output the modified event to the logs.

    When a kinesis rule is created, in addition to the messageConsumer event mapping, an event mapping is created to trigger KinesisInboundEventLogger to record a log of the inbound record, to allow for analysis in case of unexpected failure.

    Response Stream messages

    Cumulus also supports this feature for all outbound messages. To take advantage of this feature, you will need to set an event mapping on the KinesisOutboundEventLogger Lambda that targets your response-endpoint. You can do this in the Lambda management page for KinesisOutboundEventLogger. Add a Kinesis trigger, and configure it to target the cnmResponseStream for your workflow:

    Screenshot of the AWS console showing configuration for Kinesis stream trigger on KinesisOutboundEventLogger Lambda
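Alternatively, an event source mapping can be created with the AWS CLI; the function name below assumes the deployment prefix naming convention, and the stream ARN is a placeholder:

aws lambda create-event-source-mapping \
  --function-name <prefix>-KinesisOutboundEventLogger \
  --event-source-arn arn:aws:kinesis:<region>:<account-id>:stream/<response-stream-name> \
  --starting-position LATEST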

    Once this is done, all records sent to the response-endpoint will also be logged in CloudWatch. For more on configuring Lambdas to trigger on Kinesis events, please see creating an event source mapping.

Error Handling in Workflows

...Service Exception. See this documentation on configuring your workflow to handle transient lambda errors.

    Example state machine definition:

    {
    "Comment": "Tests Workflow from Kinesis Stream",
    "StartAt": "TranslateMessage",
    "States": {
    "TranslateMessage": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.cnm}",
    "destination": "{$.meta.cnm}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_to_cma_task.arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": ["States.ALL"],
    "ResultPath": "$.exception",
    "Next": "CnmResponseFail"
    }
    ],
    "Next": "SyncGranule"
    },
    "SyncGranule": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "Path": "$.payload",
    "TargetPath": "$.payload"
    },
    "task_config": {
    "provider": "{$.meta.provider}",
    "buckets": "{$.meta.buckets}",
    "collection": "{$.meta.collection}",
    "downloadBucket": "{$.meta.buckets.private.name}",
    "stack": "{$.meta.stack}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.granules}",
    "destination": "{$.meta.input_granules}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${module.cumulus.sync_granule_task.task_arn}",
    "Retry": [
    {
    "ErrorEquals": ["States.ALL"],
    "IntervalSeconds": 10,
    "MaxAttempts": 3
    }
    ],
    "Catch": [
    {
    "ErrorEquals": ["States.ALL"],
    "ResultPath": "$.exception",
    "Next": "CnmResponseFail"
    }
    ],
    "Next": "CnmResponse"
    },
    "CnmResponse": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "OriginalCNM": "{$.meta.cnm}",
    "CNMResponseStream": "{$.meta.cnmResponseStream}",
    "region": "us-east-1",
    "WorkflowException": "{$.exception}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$}",
    "destination": "{$.meta.cnmResponse}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_response_task.arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": ["States.ALL"],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "Next": "WorkflowSucceeded"
    },
    "CnmResponseFail": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "OriginalCNM": "{$.meta.cnm}",
    "CNMResponseStream": "{$.meta.cnmResponseStream}",
    "region": "us-east-1",
    "WorkflowException": "{$.exception}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$}",
    "destination": "{$.meta.cnmResponse}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_response_task.arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": ["States.ALL"],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "Next": "WorkflowFailed"
    },
    "WorkflowSucceeded": {
    "Type": "Succeed"
    },
    "WorkflowFailed": {
    "Type": "Fail",
    "Cause": "Workflow failed"
    }
    }
    }

    The above results in a workflow which is visualized in the diagram below:

    Screenshot of a visualization of an AWS Step Function workflow definition with branching logic for failures

    Summary

    Error handling should (mostly) be the domain of workflow configuration.

    Version: v13.4.0

    HelloWorld Workflow

    Example task meant to be a sanity check/introduction to the Cumulus workflows.

    Pre-Deployment Configuration

    Workflow Configuration

    A workflow definition can be found in the template repository hello_world_workflow module.

    {
    "Comment": "Returns Hello World",
    "StartAt": "HelloWorld",
    "States": {
    "HelloWorld": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "buckets": "{$.meta.buckets}",
    "provider": "{$.meta.provider}",
    "collection": "{$.meta.collection}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${module.cumulus.hello_world_task.task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "End": true
    }
    }
    }

    Workflow error-handling can be configured as discussed in the Error-Handling cookbook.

    Task Configuration

    The HelloWorld task is provided for you as part of the cumulus terraform module, no configuration is needed.

If you want to manually deploy your own version of this Lambda for testing, you can copy the Lambda resource definition located in the Cumulus source code at cumulus/tf-modules/ingest/hello-world-task.tf. The Lambda source itself is located at cumulus/tasks/hello-world.

    Execution

    We will focus on using the Cumulus dashboard to schedule the execution of a HelloWorld workflow.

    Our goal here is to create a rule through the Cumulus dashboard that will define the scheduling and execution of our HelloWorld workflow. Let's navigate to the Rules page and click Add a rule.

    {
    "collection": { # collection values can be configured and found on the Collections page
    "name": "${collection_name}",
    "version": "${collection_version}"
    },
    "name": "helloworld_rule",
    "provider": "${provider}", # found on the Providers page
    "rule": {
    "type": "onetime"
    },
    "state": "ENABLED",
    "workflow": "HelloWorldWorkflow" # This can be found on the Workflows page
    }

    Screenshot of AWS Step Function execution graph for the HelloWorld workflow Executed workflow as seen in AWS Console

    Output/Results

    The Executions page presents a list of all executions, their status (running, failed, or completed), to which workflow the execution belongs, along with other information. The rule defined in the previous section should start an execution of its own accord, and the status of that execution can be tracked here.

    To get some deeper information on the execution, click on the value in the Name column of your execution of interest. This should bring up a visual representation of the workflow similar to that shown above, execution details, and a list of events.

    Summary

    Setting up the HelloWorld workflow on the Cumulus dashboard is the tip of the iceberg, so to speak. The task and step-function need to be configured before Cumulus deployment. A compatible collection and provider must be configured and applied to the rule. Finally, workflow execution status can be viewed via the workflows tab on the dashboard.

    Version: v13.4.0

    Ingest Notification in Workflows

On deployment, an SQS queue and three SNS topics (one each for executions, granules, and PDRs) are created and used for handling notification messages related to workflows.

The ingest notification reporting SQS queue is populated via a Cloudwatch rule for any Step Function execution state transitions. The sfEventSqsToDbRecords Lambda consumes this queue. The queue and Lambda are included in the cumulus module, the Cloudwatch rule in the workflow module, and all are included by default in a Cumulus deployment.

    The sfEventSqsToDbRecords Lambda function reads from the sfEventSqsToDbRecordsInputQueue queue and updates the RDS database records for granules, executions, and PDRs. When the records are updated, messages are posted to the three SNS topics. This Lambda is invoked both when the workflow starts and when it reaches a terminal state (completion or failure).

    Diagram of architecture for reporting workflow ingest notifications from AWS Step Functions

    Sending SQS messages to report status

    Publishing granule/PDR reports directly to the SQS queue

If you have a non-Cumulus workflow or process ingesting data and would like to update the status of your granules or PDRs, you can publish directly to the reporting SQS queue. Publishing messages to this queue will result in those messages being stored as granule/PDR records in the Cumulus database and the status of those granules/PDRs being visible on the Cumulus dashboard. The queue does have certain expectations, as it expects a Cumulus Message nested within a Cloudwatch Step Function Event object.

Posting directly to the queue will require knowing the queue URL. Assuming that you are using the cumulus module for your deployment, you can get the queue URL (and the topic ARNs) by adding them to outputs.tf for your Terraform deployment, as in our example deployment:

    output "stepfunction_event_reporter_queue_url" {
    value = module.cumulus.stepfunction_event_reporter_queue_url
    }

    output "report_executions_sns_topic_arn" {
    value = module.cumulus.report_executions_sns_topic_arn
    }
    output "report_granules_sns_topic_arn" {
value = module.cumulus.report_granules_sns_topic_arn
    }
    output "report_pdrs_sns_topic_arn" {
    value = module.cumulus.report_pdrs_sns_topic_arn
    }

Then, when you run terraform apply, you should see the queue URL and topic ARNs printed to your console:

    Outputs:
    ...
    stepfunction_event_reporter_queue_url = https://sqs.us-east-1.amazonaws.com/xxxxxxxxx/<prefix>-sfEventSqsToDbRecordsInputQueue
    report_executions_sns_topic_arn = arn:aws:sns:us-east-1:xxxxxxxxx:<prefix>-report-executions-topic
report_granules_sns_topic_arn = arn:aws:sns:us-east-1:xxxxxxxxx:<prefix>-report-granules-topic
    report_pdrs_sns_topic_arn = arn:aws:sns:us-east-1:xxxxxxxxx:<prefix>-report-pdrs-topic

Once you have the queue URL, you can use the AWS SDK for your language of choice to send messages to the queue. The expected format of these messages is that of a Cloudwatch Step Function event containing a Cumulus message. For SUCCEEDED events, the Cumulus message is expected to be in detail.output. For all other event statuses, a Cumulus Message is expected in detail.input. The Cumulus Message populating these fields MUST be a JSON string, not an object. Messages that do not conform to the schemas will fail to be created as records.
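As a rough sketch of posting such a message from the command line, the queue URL below follows the output format shown above and status-message.json is a placeholder file whose contents must follow the Cloudwatch Step Function event shape just described:

aws sqs send-message \
  --queue-url "https://sqs.us-east-1.amazonaws.com/xxxxxxxxx/<prefix>-sfEventSqsToDbRecordsInputQueue" \
  --message-body file://status-message.json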

    If you are not seeing records persist to the database or show up in the Cumulus dashboard, you can investigate the Cloudwatch logs of the SQS consumer Lambda:

    • /aws/lambda/<prefix>-sfEventSqsToDbRecords
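For example, with AWS CLI v2 you could tail that log group directly (the prefix is a placeholder):

aws logs tail /aws/lambda/<prefix>-sfEventSqsToDbRecords --follow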

    In a workflow

    As described above, ingest notifications will automatically be published to the SNS topics on workflow start and completion/failure, so you should not include a workflow step to publish the initial or final status of your workflows.

    However, if you want to report your ingest status at any point during a workflow execution, you can add a workflow step using the SfSqsReport Lambda. In the following example from cumulus-tf/parse_pdr_workflow.tf, the ParsePdr workflow is configured to use the SfSqsReport Lambda, primarily to update the PDR ingestion status.

    Note: ${sf_sqs_report_task_arn} is an interpolated value referring to a Terraform resource. See the example deployment code for the ParsePdr workflow.

      "PdrStatusReport": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "cumulus_message": {
    "input": "{$}"
    }
    }
    }
    },
    "ResultPath": null,
    "Type": "Task",
    "Resource": "${sf_sqs_report_task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "Next": "WaitForSomeTime"
    },

    Subscribing additional listeners to SNS topics

    Additional listeners to SNS topics can be configured in a .tf file for your Cumulus deployment. Shown below is configuration that subscribes an additional Lambda function (test_lambda) to receive messages from the report_executions SNS topic. To subscribe to the report_granules or report_pdrs SNS topics instead, simply replace report_executions in the code block below with either of those values.

    resource "aws_lambda_function" "test_lambda" {
    function_name = "${var.prefix}-testLambda"
    filename = "./testLambda.zip"
    source_code_hash = filebase64sha256("./testLambda.zip")
    handler = "index.handler"
    role = module.cumulus.lambda_processing_role_arn
    runtime = "nodejs10.x"
    }

    resource "aws_sns_topic_subscription" "test_lambda" {
    topic_arn = module.cumulus.report_executions_sns_topic_arn
    protocol = "lambda"
    endpoint = aws_lambda_function.test_lambda.arn
    }

    resource "aws_lambda_permission" "test_lambda" {
    action = "lambda:InvokeFunction"
    function_name = aws_lambda_function.test_lambda.arn
    principal = "sns.amazonaws.com"
    source_arn = module.cumulus.report_executions_sns_topic_arn
    }

    SNS message format

Subscribers to the SNS topics can expect to find the published message in the SNS event at Records[0].Sns.Message. The message will be a JSON stringified version of the ingest notification record for an execution or a PDR. For granules, the message will be a JSON stringified object with the ingest notification record in the record property and the event type in the event property.

    The ingest notification record of the execution, granule, or PDR should conform to the data model schema for the given record type.
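As a rough sketch of where a granule subscriber would find the payload (field contents are illustrative; consult the record schemas referenced above for the authoritative shape):

{
  "Records": [
    {
      "Sns": {
        "Message": "{\"event\": \"<event type>\", \"record\": { ...granule ingest notification record... }}"
      }
    }
  ]
}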

    Summary

    Workflows can be configured to send SQS messages at any point using the sf-sqs-report task.

    Additional listeners can be easily configured to trigger when messages are sent to the SNS topics.

    Version: v13.4.0

    Queue PostToCmr

In this document, we walk through handling CMR errors in workflows by queueing PostToCmr. We assume that the user already has an ingest workflow set up.

    Overview

    The general concept is that the last task of the ingest workflow will be QueueWorkflow, which queues the publish workflow. The publish workflow contains the PostToCmr task and if a CMR error occurs during PostToCmr, the publish workflow will add itself back onto the queue so that it can be executed when CMR is back online. This is achieved by leveraging the QueueWorkflow task again in the publish workflow. The following diagram demonstrates this queueing process.

    Diagram of workflow queueing

    Ingest Workflow

    The last step should be the QueuePublishWorkflow step. It should be configured with a queueUrl and workflow. In this case, the queueUrl is a throttled queue. Any queueUrl can be specified here which is useful if you would like to use a lower priority queue. The workflow is the unprefixed workflow name that you would like to queue (e.g. PublishWorkflow).

      "QueuePublishWorkflowStep": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "workflow": "{$.meta.workflow}",
    "queueUrl": "${start_sf_queue_url}",
    "provider": "{$.meta.provider}",
    "collection": "{$.meta.collection}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${queue_workflow_task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "End": true
    },

    Publish Workflow

    Configure the Catch section of your PostToCmr task to proceed to QueueWorkflow if a CMRInternalError is caught. Any other error will cause the workflow to fail.

      "Catch": [
    {
    "ErrorEquals": [
    "CMRInternalError"
    ],
    "Next": "RequeueWorkflow"
    },
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "Next": "WorkflowFailed",
    "ResultPath": "$.exception"
    }
    ],

    Then, configure the QueueWorkflow task similarly to its configuration in the ingest workflow. This time, pass the current publish workflow to the task config. This allows for the publish workflow to be requeued when there is a CMR error.

    {
    "RequeueWorkflow": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "buckets": "{$.meta.buckets}",
    "distribution_endpoint": "{$.meta.distribution_endpoint}",
    "workflow": "PublishGranuleQueue",
    "queueUrl": "${start_sf_queue_url}",
    "provider": "{$.meta.provider}",
    "collection": "{$.meta.collection}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${queue_workflow_task_arn}",
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "Next": "WorkflowFailed",
    "ResultPath": "$.exception"
    }
    ],
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "End": true
    }
    }
    Version: v13.4.0

    Run Step Function Tasks in AWS Lambda or Docker

    Overview

    AWS Step Function Tasks can run tasks on AWS Lambda or on AWS Elastic Container Service (ECS) as a Docker container.

    Lambda provides serverless architecture, providing the best option for minimizing cost and server management. ECS provides the fullest extent of AWS EC2 resources via the flexibility to execute arbitrary code on any AWS EC2 instance type.

    When to use Lambda

    You should use AWS Lambda whenever all of the following are true:

• The task runs on one of the supported Lambda Runtimes. At the time of this writing, supported runtimes include versions of Python, Java, Ruby, Node.js, Go, and .NET.
    • The lambda package is less than 50 MB in size, zipped.
    • The task consumes less than each of the following resources:
      • 3008 MB memory allocation
      • 512 MB disk storage (must be written to /tmp)
      • 15 minutes of execution time

    See this page for a complete and up-to-date list of AWS Lambda limits.

    If your task requires more than any of these resources or an unsupported runtime, creating a Docker image which can be run on ECS is the way to go. Cumulus supports running any lambda package (and its configured layers) as a Docker container with cumulus-ecs-task.

    Step Function Activities and cumulus-ecs-task

    Step Function Activities enable a state machine task to "publish" an activity task which can be picked up by any activity worker. Activity workers can run pretty much anywhere, but Cumulus workflows support the cumulus-ecs-task activity worker. The cumulus-ecs-task worker runs as a Docker container on the Cumulus ECS cluster.

    The cumulus-ecs-task container takes an AWS Lambda Amazon Resource Name (ARN) as an argument (see --lambdaArn in the example below). This ARN argument is defined at deployment time. The cumulus-ecs-task worker polls for new Step Function Activity Tasks. When a Step Function executes, the worker (container) picks up the activity task and runs the code contained in the lambda package defined on deployment.

    Example: Replacing AWS Lambda with a Docker container run on ECS

    This example will use an already-defined workflow from the cumulus module that includes the QueueGranules task in its configuration.

    The following example is an excerpt from the Discover Granules workflow containing the step definition for the QueueGranules step:

    Note: ${ingest_granule_workflow_name} and ${queue_granules_task_arn} are interpolated values that refer to Terraform resources. See the example deployment code for the Discover Granules workflow.

      "QueueGranules": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "provider": "{$.meta.provider}",
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "granuleIngestWorkflow": "${ingest_granule_workflow_name}",
    "queueUrl": "{$.meta.queues.startSF}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${queue_granules_task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "End": true
    },

If it has been discovered that this task can no longer run in AWS Lambda, you can instead run it on the Cumulus ECS cluster by adding the following resources to your terraform deployment (by either adding a new .tf file or updating an existing one):

    • A aws_sfn_activity resource:
    resource "aws_sfn_activity" "queue_granules" {
    name = "${var.prefix}-QueueGranules"
    }
• An instance of the cumulus_ecs_service module (found on the Cumulus releases page) configured to provide the QueueGranules task:

    module "queue_granules_service" {
    source = "https://github.com/nasa/cumulus/releases/download/{version}/terraform-aws-cumulus-ecs-service.zip"

    prefix = var.prefix
    name = "QueueGranules"

    cluster_arn = module.cumulus.ecs_cluster_arn
    desired_count = 1
    image = "cumuluss/cumulus-ecs-task:1.7.0"

    cpu = 400
    memory_reservation = 700

    environment = {
    AWS_DEFAULT_REGION = data.aws_region.current.name
    }
    command = [
    "cumulus-ecs-task",
    "--activityArn",
    aws_sfn_activity.queue_granules.id,
    "--lambdaArn",
    module.cumulus.queue_granules_task.task_arn,
    "--lastModified",
    module.cumulus.queue_granules_task.last_modified_date
    ]
    alarms = {
    MemoryUtilizationHigh = {
    comparison_operator = "GreaterThanThreshold"
    evaluation_periods = 1
    metric_name = "MemoryUtilization"
    statistic = "SampleCount"
    threshold = 75
    }
    }
    }

    Please note: If you have updated the code for the Lambda specified by --lambdaArn, you will have to manually restart the tasks in your ECS service before invocation of the Step Function activity will use the updated Lambda code.
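One way to force that restart is with the AWS CLI; the cluster and service names below are placeholders that depend on how your deployment names the ECS service:

aws ecs update-service \
  --cluster <your-ecs-cluster-name> \
  --service <prefix>-QueueGranules \
  --force-new-deployment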

• An updated Discover Granules workflow to utilize the new resource (the Resource key in the QueueGranules step has been updated to:

    "Resource": "${aws_sfn_activity.queue_granules.id}")`

    If you then run this workflow in place of the DiscoverGranules workflow, the QueueGranules step would run as an ECS task instead of a lambda.

    Final note

    Step Function Activities and AWS Lambda are not the only ways to run tasks in an AWS Step Function. Learn more about other service integrations, including direct ECS integration via the AWS Service Integrations page.

Science Investigator-led Processing Systems (SIPS)

...we're just going to create a onetime throw-away rule that will be easy to test with. This rule will kick off the DiscoverAndQueuePdrs workflow, which is the beginning of a Cumulus SIPS workflow:

    Screenshot of a Cumulus rule configuration

    Note: A list of configured workflows exists under the "Workflows" in the navigation bar on the Cumulus dashboard. Additionally, one can find a list of executions and their respective status in the "Executions" tab in the navigation bar.

    DiscoverAndQueuePdrs Workflow

    This workflow will discover PDRs and queue them to be processed. Duplicate PDRs will be dealt with according to the configured duplicate handling setting in the collection. The lambdas below are included in the cumulus terraform module for use in your workflows:

    1. DiscoverPdrs - source
    2. QueuePdrs - source

    Screenshot of execution graph for discover and queue PDRs workflow in the AWS Step Functions console

    An example workflow module configuration can be viewed in the Cumulus source for the discover_and_queue_pdrs_workflow.

    Please note: To use this example workflow module as a template for a new workflow in your deployment the source key for the workflow module would need to point to a release of the cumulus-workflow (terraform-aws-cumulus-workflow.zip) module on our release page, as all of the provided Cumulus workflows are internally self-referential.

    ParsePdr Workflow

    The ParsePdr workflow will parse a PDR, queue the specified granules (duplicates are handled according to the duplicate handling setting) and periodically check the status of those queued granules. This workflow will not succeed until all the granules included in the PDR are successfully ingested. If one of those fails, the ParsePdr workflow will fail. NOTE that ParsePdr may spin up multiple IngestGranule workflows in parallel, depending on the granules included in the PDR.

    The lambdas below are included in the cumulus terraform module for use in your workflows:

    1. ParsePdr - source
    2. QueueGranules - source
    3. CheckStatus - source

    Screenshot of execution graph for SIPS Parse PDR workflow in AWS Step Functions console

    An example workflow module configuration can be viewed in the Cumulus source for the parse_pdr_workflow.

    Please note: To use this example workflow module as a template for a new workflow in your deployment the source key for the workflow module would need to point to a release of the cumulus-workflow (terraform-aws-cumulus-workflow.zip) module on our release page, as all of the provided Cumulus workflows are internally self-referential.

    IngestGranule Workflow

    The IngestGranule workflow processes and ingests a granule and posts the granule metadata to CMR.

    The lambdas below are included in the cumulus terraform module for use in your workflows:

    1. SyncGranule - source.
    2. CmrStep - source

Additionally, this workflow requires a processing step that you must provide. The ProcessingStep step in the workflow picture below is an example of a custom processing step.

    Note: Using the CmrStep is not required and can be left out of the processing trajectory if desired (for example, in testing situations).

    Screenshot of execution graph for SIPS IngestGranule workflow in AWS Step Functions console

    An example workflow module configuration can be viewed in the Cumulus source for the ingest_and_publish_granule_workflow.

    Please note: To use this example workflow module as a template for a new workflow in your deployment the source key for the workflow module would need to point to a release of the cumulus-workflow (terraform-aws-cumulus-workflow.zip) module on our release page, as all of the provided Cumulus workflows are internally self-referential.

    Summary

    In this cookbook we went over setting up a collection, rule, and provider for a SIPS workflow. Once we had the setup completed, we looked over the Cumulus workflows that participate in parsing PDRs, ingesting and processing granules, and updating CMR.

    Version: v13.4.0

    Throttling queued executions

In this entry, we will walk through how to create an SQS queue for scheduling executions, which will be used to limit those executions to a maximum concurrency, and how to configure our Cumulus workflows/rules to use this queue.

    We will also review the architecture of this feature and highlight some implementation notes.

    Limiting the number of executions that can be running from a given queue is useful for controlling the cloud resource usage of workflows that may be lower priority, such as granule reingestion or reprocessing campaigns. It could also be useful for preventing workflows from exceeding known resource limits, such as a maximum number of open connections to a data provider.

    Implementing the queue

    Create and deploy the queue

    Add a new queue

    In a .tf file for your Cumulus deployment, add a new SQS queue:

    resource "aws_sqs_queue" "background_job_queue" {
    name = "${var.prefix}-backgroundJobQueue"
    receive_wait_time_seconds = 20
    visibility_timeout_seconds = 60
    }
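If you want to confirm the deployed queue's configuration, one option is to inspect it with the AWS CLI (substituting the queue URL from your deployment):

# Check the long-polling and visibility timeout settings on the new queue.
aws sqs get-queue-attributes \
  --queue-url <backgroundJobQueue_SQS_URL> \
  --attribute-names ReceiveMessageWaitTimeSeconds VisibilityTimeout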

    Set maximum executions for the queue

    Define the throttled_queues variable for the cumulus module in your Cumulus deployment to specify the maximum concurrent executions for the queue.

    module "cumulus" {
    # ... other variables

    throttled_queues = [{
    url = aws_sqs_queue.background_job_queue.id,
    execution_limit = 5
    }]
    }

    Setup consumer for the queue

    Add the sqs2sfThrottle Lambda as the consumer for the queue and add a Cloudwatch event rule/target to read from the queue on a scheduled basis.

    Please note: You must use the sqs2sfThrottle Lambda as the consumer for any queue with a queue execution limit or else the execution throttling will not work correctly. Additionally, please allow at least 60 seconds after creation before using the queue while associated infrastructure and triggers are set up and made ready.

    aws_sqs_queue.background_job_queue.id refers to the queue resource defined above.

    resource "aws_cloudwatch_event_rule" "background_job_queue_watcher" {
    schedule_expression = "rate(1 minute)"
    }

    resource "aws_cloudwatch_event_target" "background_job_queue_watcher" {
    rule = aws_cloudwatch_event_rule.background_job_queue_watcher.name
    arn = module.cumulus.sqs2sfThrottle_lambda_function_arn
    input = jsonencode({
    messageLimit = 500
    queueUrl = aws_sqs_queue.background_job_queue.id
    timeLimit = 60
    })
    }

    resource "aws_lambda_permission" "background_job_queue_watcher" {
    action = "lambda:InvokeFunction"
    function_name = module.cumulus.sqs2sfThrottle_lambda_function_arn
    principal = "events.amazonaws.com"
    source_arn = aws_cloudwatch_event_rule.background_job_queue_watcher.arn
    }

    Re-deploy your Cumulus application

Follow the instructions to re-deploy your Cumulus application. After you have re-deployed, your workflow template will be updated to include information about the queue (the output below is partial output from an expected workflow template):

{
  "cumulus_meta": {
    "queueExecutionLimits": {
      "<backgroundJobQueue_SQS_URL>": 5
    }
  }
}

    Integrate your queue with workflows and/or rules

    Integrate queue with queuing steps in workflows

For any workflows using QueueGranules or QueuePdrs where you want to use your new queue, update the Cumulus configuration of those steps in your workflows.

    As seen in this partial configuration for a QueueGranules step, update the queueUrl to reference the new throttled queue:

    Note: ${ingest_granule_workflow_name} is an interpolated value referring to a Terraform resource. See the example deployment code for the DiscoverGranules workflow.

{
  "QueueGranules": {
    "Parameters": {
      "cma": {
        "event.$": "$",
        "ReplaceConfig": {
          "FullMessage": true
        },
        "task_config": {
          "queueUrl": "${aws_sqs_queue.background_job_queue.id}",
          "provider": "{$.meta.provider}",
          "internalBucket": "{$.meta.buckets.internal.name}",
          "stackName": "{$.meta.stack}",
          "granuleIngestWorkflow": "${ingest_granule_workflow_name}"
        }
      }
    }
  }
}

    Similarly, for a QueuePdrs step:

    Note: ${parse_pdr_workflow_name} is an interpolated value referring to a Terraform resource. See the example deployment code for the DiscoverPdrs workflow.

{
  "QueuePdrs": {
    "Parameters": {
      "cma": {
        "event.$": "$",
        "ReplaceConfig": {
          "FullMessage": true
        },
        "task_config": {
          "queueUrl": "${aws_sqs_queue.background_job_queue.id}",
          "provider": "{$.meta.provider}",
          "collection": "{$.meta.collection}",
          "internalBucket": "{$.meta.buckets.internal.name}",
          "stackName": "{$.meta.stack}",
          "parsePdrWorkflow": "${parse_pdr_workflow_name}"
        }
      }
    }
  }
}

    After making these changes, re-deploy your Cumulus application for the execution throttling to take effect on workflow executions queued by these workflows.

    Create/update a rule to use your new queue

    Create or update a rule definition to include a queueUrl property that refers to your new queue:

{
  "name": "s3_provider_rule",
  "workflow": "DiscoverAndQueuePdrs",
  "provider": "s3_provider",
  "collection": {
    "name": "MOD09GQ",
    "version": "006"
  },
  "rule": {
    "type": "onetime"
  },
  "state": "ENABLED",
  "queueUrl": "<backgroundJobQueue_SQS_URL>" // configure rule to use your queue URL
}

    After creating/updating the rule, any subsequent invocations of the rule should respect the maximum number of executions when starting workflows from the queue.
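If you manage rules through the Cumulus archive API rather than the dashboard, a hedged example of replacing the rule definition (assuming the API's PUT /rules/<name> endpoint, your archive API root, and a valid access token):

curl -X PUT "https://<archive_api_uri>/rules/s3_provider_rule" \
  -H "Authorization: Bearer <access_token>" \
  -H "Content-Type: application/json" \
  -d '{
    "name": "s3_provider_rule",
    "workflow": "DiscoverAndQueuePdrs",
    "provider": "s3_provider",
    "collection": { "name": "MOD09GQ", "version": "006" },
    "rule": { "type": "onetime" },
    "state": "ENABLED",
    "queueUrl": "<backgroundJobQueue_SQS_URL>"
  }'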

    Architecture

    Architecture diagram showing how executions started from a queue are throttled to a maximum concurrent limit

Execution throttling based on the queue works by manually keeping a count (semaphore) of how many executions are running for the queue at a time. The key operation that prevents the number of executions from exceeding the maximum for the queue is that, before starting new executions, the sqs2sfThrottle Lambda attempts to increment the semaphore and responds as follows (a rough sketch of this conditional increment appears after the list):

    • If the increment operation is successful, then the count was not at the maximum and an execution is started
    • If the increment operation fails, then the count was already at the maximum so no execution is started
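As a rough illustration only (the table and attribute names below are hypothetical placeholders, not the actual Cumulus internals), the conditional increment behaves like a DynamoDB update that fails once the count reaches the maximum:

# Hypothetical sketch of a conditional semaphore increment; names are placeholders.
aws dynamodb update-item \
  --table-name <prefix>-semaphores \
  --key '{"key": {"S": "<backgroundJobQueue_SQS_URL>"}}' \
  --update-expression "ADD semvalue :one" \
  --condition-expression "attribute_not_exists(semvalue) OR semvalue < :max" \
  --expression-attribute-values '{":one": {"N": "1"}, ":max": {"N": "5"}}'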

    Final notes

    Limiting the number of concurrent executions for work scheduled via a queue has several consequences worth noting:

    • The number of executions that are running for a given queue will be limited to the maximum for that queue regardless of which workflow(s) are started.
    • If you use the same queue to schedule executions across multiple workflows/rules, then the limit on the total number of executions running concurrently will be applied to all of the executions scheduled across all of those workflows/rules.
    • If you are scheduling the same workflow both via a queue with a maxExecutions value and a queue without a maxExecutions value, only the executions scheduled via the queue with the maxExecutions value will be limited to the maximum.
Tracking Ancillary Files | Cumulus Documentation

…The UMM-G column reflects the RelatedURL's Type derived from the CNM type, whereas the ECHO10 column shows how the CNM type affects the destination element.

CNM Type  | UMM-G RelatedUrl.Type                                           | ECHO10 Location
ancillary | 'VIEW RELATED INFORMATION'                                      | OnlineResource
data      | 'GET DATA' (HTTPS URL) or 'GET DATA VIA DIRECT ACCESS' (S3 URI) | OnlineAccessURL
browse    | 'GET RELATED VISUALIZATION'                                     | AssociatedBrowseImage
linkage   | 'EXTENDED METADATA'                                             | OnlineResource
metadata  | 'EXTENDED METADATA'                                             | OnlineResource
qa        | 'EXTENDED METADATA'                                             | OnlineResource

    Common Use Cases

    This section briefly documents some common use cases and the recommended configuration for the file. The examples shown here are for the DiscoverGranules use case, which allows configuration at the Cumulus dashboard level. The other two cases covered in the ancillary metadata documentation require configuration at the provider notification level (either CNM message or PDR) and are not covered here.

    Configuring browse imagery:

{
  "bucket": "public",
  "regex": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\_[\\d]{1}.jpg$",
  "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_1.jpg",
  "type": "browse"
}

    Configuring a documentation entry:

{
  "bucket": "protected",
  "regex": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\_README.pdf$",
  "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_README.pdf",
  "type": "metadata"
}

    Configuring other associated files (use types metadata or qa as appropriate):

{
  "bucket": "protected",
  "regex": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\_QA.txt$",
  "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_QA.txt",
  "type": "qa"
}
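To sanity-check that a sample file name actually matches a configured regex before deploying, you can test it locally. A rough equivalent of the QA regex above, translated to POSIX extended syntax for grep (\d becomes [0-9] and \S is approximated by a non-space class):

echo "MOD09GQ.A2017025.h21v00.006.2017034065104_QA.txt" \
  | grep -E '^MOD09GQ\.A[0-9]{7}\.[^ ]{6}\.006\.[0-9]{13}_QA\.txt$'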
    Version: v13.4.0

    API Gateway Logging

    Enabling API Gateway logging

    In order to enable distribution API Access and execution logging, configure the TEA deployment by setting log_api_gateway_to_cloudwatch on the thin_egress_app module:

    log_api_gateway_to_cloudwatch = true

    This enables the distribution API to send its logs to the default CloudWatch location: API-Gateway-Execution-Logs_<RESTAPI_ID>/<STAGE>
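Once logging is enabled and the distribution API has served some requests, you can confirm that log events are arriving (AWS CLI v2, substituting your REST API ID and stage):

aws logs tail "API-Gateway-Execution-Logs_<RESTAPI_ID>/<STAGE>" --follow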

    Configure Permissions for API Gateway Logging to CloudWatch

    Instructions for enabling account level logging from API Gateway to CloudWatch

    This is a one time operation that must be performed on each AWS account to allow API Gateway to push logs to CloudWatch.

    Create a policy document

    The AmazonAPIGatewayPushToCloudWatchLogs managed policy, with an ARN of arn:aws:iam::aws:policy/service-role/AmazonAPIGatewayPushToCloudWatchLogs, has all the required permissions to enable API Gateway logging to CloudWatch. To grant these permissions to your account, first create an IAM role with apigateway.amazonaws.com as its trusted entity.

    Save this snippet as apigateway-policy.json.

{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Sid": "",
      "Effect": "Allow",
      "Principal": {
        "Service": "apigateway.amazonaws.com"
      },
      "Action": "sts:AssumeRole"
    }
  ]
}

    Create an account role to act as ApiGateway and write to CloudWatchLogs

    NASA users in NGAP: be sure to use your account's permission boundary.

    aws iam create-role \
    --role-name ApiGatewayToCloudWatchLogs \
    [--permissions-boundary <permissionBoundaryArn>] \
    --assume-role-policy-document file://apigateway-policy.json

    Note the ARN of the returned role for the last step.

    Attach correct permissions to role

    Next attach the AmazonAPIGatewayPushToCloudWatchLogs policy to the IAM role.

    aws iam attach-role-policy \
    --role-name ApiGatewayToCloudWatchLogs \
    --policy-arn "arn:aws:iam::aws:policy/service-role/AmazonAPIGatewayPushToCloudWatchLogs"
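If you want to confirm that the policy was attached, one way is to list the role's attached policies:

aws iam list-attached-role-policies --role-name ApiGatewayToCloudWatchLogs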

    Update Account API Gateway settings with correct permissions

    Finally, set the IAM role ARN on the cloudWatchRoleArn property on your API Gateway Account settings.

    aws apigateway update-account \
    --patch-operations op='replace',path='/cloudwatchRoleArn',value='<ApiGatewayToCloudWatchLogs ARN>'
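To verify that the account-level setting took effect, you can read back the API Gateway account configuration, which should show the role ARN you just set:

aws apigateway get-account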

    Configure API Gateway CloudWatch Logs Delivery

    See Configure Cloudwatch Logs Delivery

Choosing and configuring your RDS database | Cumulus Documentation

…using this module to create your RDS cluster, you can configure the autoscaling timeout action, the cluster minimum and maximum capacity, and more as seen in the supported variables for the module.

    Unfortunately, Terraform currently doesn't allow specifying the autoscaling timeout itself, so that value will have to be manually configured in the AWS console or CLI.

    Version: v13.4.0

    Configure Cloudwatch Logs Delivery

    As an optional configuration step, it is possible to deliver CloudWatch logs to a cross-account shared AWS::Logs::Destination. An operator does this by configuring the cumulus module for your deployment as shown below. The value of the log_destination_arn variable is the ARN of a writeable log destination.

    The value can be either an AWS::Logs::Destination or a Kinesis Stream ARN to which your account can write.

    log_destination_arn           = arn:aws:[kinesis|logs]:us-east-1:123456789012:[streamName|destination:logDestinationName]

    Logs Sent

By default, the following logs will be sent to the destination when one is given.

    • Ingest logs
    • Async Operation logs
    • Thin Egress App API Gateway logs (if configured)

    Additional Logs

If additional logs are needed, you can configure additional_log_groups_to_elk with the CloudWatch log groups you want to send to the destination. additional_log_groups_to_elk is a map whose keys are descriptors and whose values are the CloudWatch log group names.

additional_log_groups_to_elk = {
  "HelloWorldTask" = "/aws/lambda/cumulus-example-HelloWorld"
  "MyCustomTask"   = "my-custom-task-log-group"
}
Component-based Cumulus Deployment | Cumulus Documentation

…Terraform at the same time.

    With remote state, Terraform writes the state data to a remote data store, which can then be shared between all members of a team.

    The recommended approach for handling remote state with Cumulus is to use the S3 backend. This backend stores state in S3 and uses a DynamoDB table for locking.

    See the deployment documentation for a walk-through of creating resources for your remote state using an S3 backend.

    Version: v13.4.0

    Creating an S3 Bucket

    Buckets can be created on the command line with AWS CLI or via the web interface on the AWS console.

    When creating a protected bucket (a bucket containing data which will be served through the distribution API), make sure to enable S3 server access logging. See S3 Server Access Logging for more details.
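For reference, server access logging can also be enabled from the command line. This is a sketch with placeholder bucket names and prefix; the target bucket must already exist and must allow the S3 logging service to write to it:

aws s3api put-bucket-logging \
  --bucket <prefix>-protected \
  --bucket-logging-status '{
    "LoggingEnabled": {
      "TargetBucket": "<prefix>-logs",
      "TargetPrefix": "s3-access-logs/<prefix>-protected/"
    }
  }'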

    Command line

Using the AWS CLI s3api create-bucket subcommand:

$ aws s3api create-bucket \
    --bucket foobar-internal \
    --region us-west-2 \
    --create-bucket-configuration LocationConstraint=us-west-2
{
    "Location": "/foobar-internal"
}

    Note: The region and create-bucket-configuration arguments are only necessary if you are creating a bucket outside of the us-east-1 region.

Please note: security settings and other bucket options can be set via the options listed in the s3api documentation.

    Repeat the above step for each bucket to be created.
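Since a deployment typically needs several buckets (for example internal, private, protected, and public), a small shell loop can save repetition. A sketch assuming the us-west-2 example above and your own prefix:

for suffix in internal private protected public; do
  aws s3api create-bucket \
    --bucket "<prefix>-${suffix}" \
    --region us-west-2 \
    --create-bucket-configuration LocationConstraint=us-west-2
done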

    Web interface

    See: AWS "Creating a Bucket" documentation

    Version: v13.4.0

    Using the Cumulus Distribution API

    The Cumulus Distribution API is a set of endpoints that can be used to enable AWS Cognito authentication when downloading data from S3.

    Configuring a Cumulus Distribution deployment

    The Cumulus Distribution API is included in the main Cumulus repo. It is available as part of the terraform-aws-cumulus.zip archive in the latest release.

    These steps assume you're using the Cumulus Deployment Template but can also be used for custom deployments.

    To configure a deployment to use Cumulus Distribution:

    1. Remove or comment the "Thin Egress App Settings" in the Cumulus Template Deploy and enable the Cumulus Distribution settings.
    2. Delete or comment the contents of thin_egress_app.tf and the corresponding Thin Egress App outputs in outputs.tf. These are not necessary for a Cumulus Distribution deployment.
    3. Uncomment the Cumulus Distribution outputs in outputs.tf.
    4. Rename cumulus-template-deploy/cumulus-tf/cumulus_distribution.tf.example to cumulus-template-deploy/cumulus-tf/cumulus_distribution.tf.

    Cognito Application and User Credentials

    The major prerequisite for using the Cumulus Distribution API is to set up Cognito. If operating within NGAP, this should already be done for you. If operating outside of NGAP, you must set up Cognito yourself, which is beyond the scope of this documentation.

    Given that Cognito is set up, in order to be able to download granule files via the Cumulus Distribution API, you must obtain Cognito user credentials, because any attempt to download such files (that will be, or have been, published to the CMR via your Cumulus deployment) will result in a prompt for you to supply Cognito user credentials. To obtain your own user credentials, talk to your product owner or scrum master for additional information. They should either know how to create the credentials, know who can create them for the team, or be the liaison to the Cognito team.

    Further, whoever helps to obtain your Cognito user credentials should also be able to supply you with the values for the following new variables that you must add to your cumulus-tf/terraform.tfvars file:

    • csdap_host_url: The URL of the Cognito service to which your Cumulus deployment will make Cognito API calls during a distribution (download) event
    • csdap_client_id: The client ID for the Cumulus application registered within the Cognito service
    • csdap_client_password: The client password for the Cumulus application registered within the Cognito service

    Although you might have to wait a bit for your Cognito user credentials, the remaining instructions do not depend upon having them, so you may continue with these instructions while waiting for your credentials.

    Cumulus Distribution URL

    Your Cumulus Distribution URL is used by Cumulus to generate download URLs as part of the granule metadata generated and published to the CMR. For example, a granule download URL will be of the form <distribution url>/<protected bucket>/<key> (or <distribution url>/path/to/file, if using a custom bucket map, as explained further below).

    By default, the value of your distribution URL is the URL of your private Cumulus Distribution API Gateway (the API Gateway named <prefix>-distribution, once you deploy the Cumulus Distribution module). Therefore, by default, the generated download URLs are private, and thus inaccessible directly, but there are 2 ways to address this issue (both of which are detailed below): (a) use tunneling (typically in development) or (b) put a CloudFront URL in front of your API Gateway (typically in production, and perhaps UAT and/or SIT).

    In either case, you must first know the default URL (i.e., the URL for the private Cumulus Distribution API Gateway). In order to obtain this default URL, you must first deploy your cumulus-tf module with the new Cumulus Distribution module, and once your initial deployment is complete, one of the Terraform outputs will be cumulus_distribution_api_uri, which is the URL for the private API Gateway.

    You may override this default URL by adding a cumulus_distribution_url variable to your cumulus-tf/terraform.tfvars file, and setting it to one of the following values (both of which are explained below):

    1. The default URL, but with a port added to it, in order to allow you to configure tunneling (typically only in development)
    2. A CloudFront URL placed in front of your Cumulus Distribution API Gateway (typically only for Production, but perhaps also for a UAT or SIT environment)

    The following subsections explain these approaches, in turn.

    Using your Cumulus Distribution API Gateway URL as your distribution URL

    Since your Cumulus Distribution API Gateway URL is private, the only way you can use it to confirm that your integration with Cognito is working is by using tunneling (again, generally for development), as described here. Here is an outline of the required steps, with details provided further below:

    1. Create/import a key pair into your AWS EC2 service (if you haven't already done so)
    2. Add a reference to the name of the key pair to your Terraform variables (we'll set the key_name Terraform variable)
    3. Choose an open local port on your machine (we'll use 9000 in the following details)
    4. Add a reference to the value of your cumulus_distribution_api_uri (mentioned earlier), including your chosen port (we'll set the cumulus_distribution_url Terraform variable)
    5. Redeploy Cumulus
    6. Add an entry to your /etc/hosts file
    7. Add a redirect URI to Cognito, via the Cognito API
    8. Install the Session Manager Plugin for the AWS CLI (if you haven't already done so; assuming you have already installed the AWS CLI)
    9. Add a sample file to S3 to test downloading via Cognito

To create a new key pair or import an existing one, you can use the AWS CLI (see aws ec2 import-key-pair) or the AWS Console (see Amazon EC2 key pairs and Linux instances).

    Once your key pair is added to AWS, add the following to your cumulus-tf/terraform.tfvars file:

    key_name = "<name>"
    cumulus_distribution_url = "https://<id>.execute-api.<region>.amazonaws.com:<port>/dev/"

    where:

    • <name> is the name of the key pair you just added to AWS
    • <id> and <region> are the corresponding parts from your cumulus_distribution_api_uri output variable
    • <port> is your open local port of choice (9000 is typically a good choice)

    Once you save your variable changes, redeploy your cumulus-tf module.

    While your deployment runs, add the following entry to your /etc/hosts file, replacing <hostname> with the host name of the cumulus_distribution_url Terraform variable you just added above:

    localhost <hostname>

    Next, you'll need to use the Cognito API to add the value of your cumulus_distribution_url Terraform variable as a Cognito redirect URI. To do so, use your favorite tool (e.g., curl, wget, Postman, etc.) to make a BasicAuth request to the Cognito API, using the following details:

    • method: POST
    • base URL: the value of your csdap_host_url Terraform variable
    • path: /authclient/updateRedirectUri
    • username: the value of your csdap_client_id Terraform variable
    • password: the value of your csdap_client_password Terraform variable
    • headers: Content-Type='application/x-www-form-urlencoded'
    • body: redirect_uri=<cumulus_distribution_url>/login

    where <cumulus_distribution_url> is the value of your cumulus_distribution_url Terraform variable. Note the /login path at the end of the redirect_uri value.

    For reference, see the Cognito Authentication Service API.

    Next, install the Session Manager Plugin for the AWS CLI. If running on macOS, and you use Homebrew, you can install it simply as follows:

    brew install --cask session-manager-plugin --no-quarantine

    As your final setup step, add a sample file to one of the protected buckets listed in your buckets Terraform variable in your cumulus-tf/terraform.tfvars file. The key for the S3 object doesn't matter, nor does it matter what file you use. All that matters is that the file is an S3 object in one of your protected buckets, because Cognito is triggered when attempting to download from one of those buckets.

    At this point, you should be ready to open a tunnel and attempt to download your sample file via your browser, summarized as follows:

    1. Determine your ec2 instance ID
    2. Connect to the NASA VPN
    3. Start an AWS SSM session
    4. Open an ssh tunnel
    5. Use a browser to navigate to your file

To determine the EC2 instance ID for your Cumulus deployment, run the following command, where <profile> is the name of the appropriate AWS profile to use and <prefix> is the value of your prefix Terraform variable:

    aws --profile <profile> ec2 describe-instances --filters Name=tag:Deployment,Values=<prefix> Name=instance-state-name,Values=running --query "Reservations[0].Instances[].InstanceId" --output text

    IMPORTANT: Before proceeding with the remaining steps, make sure you're connected to the NASA VPN.

    Use the value output from the command above in place of <id> in the following command, which will start an SSM session:

    aws ssm start-session --target <id> --document-name AWS-StartPortForwardingSession --parameters portNumber=22,localPortNumber=6000

    If successful, you should see output similar to the following:

    Starting session with SessionId: NGAPShApplicationDeveloper-***
    Port 6000 opened for sessionId NGAPShApplicationDeveloper-***.
    Waiting for connections...

    Open another terminal window, and open a tunnel with port forwarding, using your chosen port from above (e.g., 9000):

    ssh -4 -p 6000 -N -L <port>:<api-gateway-host>:443 ec2-user@127.0.0.1

    where:

    • <port> is the open local port you chose earlier (e.g., 9000)
    • <api-gateway-host> is the hostname of your private API Gateway (i.e., the host portion of the URL you used as the value of your cumulus_distribution_url Terraform variable above)

    Finally, use your chosen browser to navigate to <cumulus_distribution_url>/<bucket>/<key>, where <bucket> and <key> reference the sample file you added to S3 above.

    If all goes well, you should be prompted for your Cognito username and password. If you have obtained your Cognito user credentials, enter them, followed by entering a code generated by the authenticator application you registered at the time you completed your Cognito registration process. Once your credentials and auth code are correctly supplied, after a few moments, the download process will begin.

    Once you're finished testing, clean up as follows:

    1. Kill your ssh tunnel (Ctrl-C)
    2. Kill your AWS SSM session (Ctrl-C)
3. If you like, disconnect from the NASA VPN

    While this is a relatively lengthy process, things are much easier when using CloudFront, such as in Production (OPS), SIT, or UAT, as explained next.

    Using a CloudFront URL as your distribution URL

    In Production (OPS), and perhaps in other environments, such as UAT and SIT, you'll need to provide a publicly accessible URL for users to use for downloading (distributing) granule files.

    This is generally done by placing a CloudFront URL in front of your private Cumulus Distribution API Gateway. In order to create such a CloudFront URL, contact the person who helped you obtain your Cognito credentials, and request a CloudFront URL with the following details:

    • The private, backing URL, which is the value of your cumulus_distribution_api_uri Terraform output value
    • A request to add the AWS account's VPC to the whitelist

    Once this request is completed, and you obtain the new CloudFront URL, override your default distribution URL with the CloudFront URL by adding the following to your cumulus-tf/terraform.tfvars file:

    cumulus_distribution_url = <cloudfront_url>

    In addition, add a Cognito redirect URI, as detailed in the previous section. Note that in this case, the value you'll use for redirect_uri is <cloudfront_url>/login since the value of your cumulus_distribution_url is now your CloudFront URL.

    At this point, it is assumed that you have added the appropriate values for this environment for the variables described at the top (csdap_host_url, csdap_client_id, and csdap_client_password).

    Redeploy Cumulus with your new/updated Terraform variables.

    As your final setup step, add a sample file to one of the protected buckets listed in your buckets Terraform variable in your cumulus-tf/terraform.tfvars file. The key for the S3 object doesn't matter, nor does it matter what file you use. All that matters is that the file is an S3 object in one of your protected buckets, because Cognito is triggered when attempting to download from one of those buckets.

    Finally, use your chosen browser to navigate to <cumulus_distribution_url>/<bucket>/<key>, where <bucket> and <key> reference the sample file you added to S3.

    If all goes well, you should be prompted for your Cognito username and password. If you have obtained your Cognito user credentials, enter them, followed by entering a code generated by the authenticator application you registered at the time you completed your Cognito registration process. Once your credentials and auth code are correctly supplied, after a few moments, the download process will begin.

    S3 Bucket Mapping

    An S3 Bucket map allows users to abstract bucket names. If the bucket names change at any point, only the bucket map would need to be updated instead of every S3 link.

    The Cumulus Distribution API uses a bucket_map.yaml or bucket_map.yaml.tmpl file to determine which buckets to serve. See the examples.

    The default Cumulus module generates a file at s3://${system_bucket}/distribution_bucket_map.json.

The configuration file is a simple JSON mapping of the form:

{
  "daac-public-data-bucket": "/path/to/this/kind/of/data"
}

    Note: Cumulus only supports a one-to-one mapping of bucket -> Cumulus Distribution path for 'distribution' buckets. Also, the bucket map must include mappings for all of the protected and public buckets specified in the buckets variable in cumulus-tf/terraform.tfvars, otherwise Cumulus may not be able to determine the correct distribution URL for ingested files and you may encounter errors.

    Switching from the Thin Egress App to Cumulus Distribution

    If you have previously deployed the Thin Egress App (TEA) as your distribution app, you can switch to Cumulus Distribution by following the steps above.

    Note, however, that the cumulus_distribution module will generate a bucket map cache and overwrite any existing bucket map caches created by TEA.

    There will also be downtime while your API gateway is updated.

How to Deploy Cumulus | Cumulus Documentation

…for deployment's EC2 instances and allows you to connect to them via SSH/SSM.

    Consider the sizing of your Cumulus instance when configuring your variables.

    Choose a distribution API

    Cumulus can be configured to use either the Thin Egress App (TEA) or the Cumulus Distribution API. The default selection is the Thin Egress App if you're using the Deployment Template.

    IMPORTANT! If you already have a deployment using the TEA distribution and want to switch to Cumulus Distribution, there will be an API Gateway change. This means that there will be downtime while you update your CloudFront endpoint to use the new API gateway.

    Configure the Thin Egress App

    The Thin Egress App can be used for Cumulus distribution and is the default selection. It allows authentication using Earthdata Login. Follow the steps in the documentation to configure distribution in your cumulus-tf deployment.

    Configure the Cumulus Distribution API (optional)

    If you would prefer to use the Cumulus Distribution API, which supports AWS Cognito authentication, follow these steps to configure distribution in your cumulus-tf deployment.

    Initialize Terraform

Follow the above instructions to initialize Terraform using terraform init [3].

    Deploy

    Run terraform apply to deploy the resources. Type yes when prompted to confirm that you want to create the resources. Assuming the operation is successful, you should see output like this:

    Apply complete! Resources: 292 added, 0 changed, 0 destroyed.

    Outputs:

    archive_api_redirect_uri = https://abc123.execute-api.us-east-1.amazonaws.com/dev/token
    archive_api_uri = https://abc123.execute-api.us-east-1.amazonaws.com/dev/
    distribution_redirect_uri = https://abc123.execute-api.us-east-1.amazonaws.com/DEV/login
    distribution_url = https://abc123.execute-api.us-east-1.amazonaws.com/DEV/

    Note: Be sure to copy the redirect URLs, as you will use them to update your Earthdata application.

    Update Earthdata Application

    You will need to add two redirect URLs to your EarthData login application.

    1. Login to URS.
    2. Under My Applications -> Application Administration -> use the edit icon of your application.
    3. Under Manage -> redirect URIs, add the Archive API url returned from the stack deployment
      • e.g. archive_api_redirect_uri = https://<czbbkscuy6>.execute-api.us-east-1.amazonaws.com/dev/token.
    4. Also add the Distribution url
  • e.g. distribution_redirect_uri = https://<kido2r7kji>.execute-api.us-east-1.amazonaws.com/dev/login [1].
    5. You may delete the placeholder url you used to create the application.

If you've lost track of the needed redirect URIs, they can be located on the API Gateway. Once there, select <prefix>-archive and/or <prefix>-thin-egress-app-EgressGateway, go to Dashboard, and use the base URL at the top of the page that is accompanied by the text Invoke this API at:. Make sure to append /token for the archive URL and /login for the Thin Egress App URL.
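Alternatively, you can look up the API IDs from the command line and reconstruct the invoke URLs yourself (a sketch, assuming your prefix and the default API names):

aws apigateway get-rest-apis \
  --query "items[?name=='<prefix>-archive' || name=='<prefix>-thin-egress-app-EgressGateway'].{name:name,id:id}" \
  --output table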


    Deploy Cumulus dashboard

    Dashboard Requirements

    Please note that the requirements are similar to the Cumulus stack deployment requirements. The installation instructions below include a step that will install/use the required node version referenced in the .nvmrc file in the dashboard repository.

    Prepare AWS

    Create S3 bucket for dashboard:

    • Create it, e.g. <prefix>-dashboard. Use the command line or console as you did when preparing AWS configuration.
    • Configure the bucket to host a website:
      • AWS S3 console: Select <prefix>-dashboard bucket then, "Properties" -> "Static Website Hosting", point to index.html
      • CLI: aws s3 website s3://<prefix>-dashboard --index-document index.html
    • The bucket's url will be http://<prefix>-dashboard.s3-website-<region>.amazonaws.com or you can find it on the AWS console via "Properties" -> "Static website hosting" -> "Endpoint"
    • Ensure the bucket's access permissions allow your deployment user access to write to the bucket

    Install dashboard

    To install the dashboard, clone the Cumulus dashboard repository into the root deploy directory and install dependencies with npm install:

      git clone https://github.com/nasa/cumulus-dashboard
    cd cumulus-dashboard
    nvm use
    npm install

    If you do not have the correct version of node installed, replace nvm use with nvm install $(cat .nvmrc) in the above example.

    Dashboard versioning

    By default, the master branch will be used for dashboard deployments. The master branch of the dashboard repo contains the most recent stable release of the dashboard.

    If you want to test unreleased changes to the dashboard, use the develop branch.

    Each release/version of the dashboard will have a tag in the dashboard repo. Release/version numbers will use semantic versioning (major/minor/patch).

    To checkout and install a specific version of the dashboard:

      git fetch --tags
    git checkout <version-number> # e.g. v1.2.0
    nvm use
    npm install

    If you do not have the correct version of node installed, replace nvm use with nvm install $(cat .nvmrc) in the above example.

    Building the dashboard

    Note: These environment variables are available during the build: APIROOT, DAAC_NAME, STAGE, HIDE_PDR. Any of these can be set on the command line to override the values contained in config.js when running the build below.

To configure your dashboard for deployment, set the APIROOT environment variable to your app's API root. [2]

    Build the dashboard from the dashboard repository root directory, cumulus-dashboard:

      APIROOT=<your_api_root> npm run build

    Dashboard deployment

    Deploy dashboard to s3 bucket from the cumulus-dashboard directory:

    Using AWS CLI:

      aws s3 sync dist s3://<prefix>-dashboard --acl public-read

    From the S3 Console:

    • Open the <prefix>-dashboard bucket, click 'upload'. Add the contents of the 'dist' subdirectory to the upload. Then select 'Next'. On the permissions window allow the public to view. Select 'Upload'.

    You should be able to visit the dashboard website at http://<prefix>-dashboard.s3-website-<region>.amazonaws.com or find the url <prefix>-dashboard -> "Properties" -> "Static website hosting" -> "Endpoint" and login with a user that you configured for access in the Configure and Deploy the Cumulus Stack step.


    Cumulus Instance Sizing

The Cumulus deployment's default sizing for Elasticsearch instances, EC2 instances, and Autoscaling Groups is small and designed for testing and cost savings. The default settings are likely not suitable for production workloads. Sizing is highly individual and dependent on expected load and archive size.

    Please be cognizant of costs as any change in size will affect your AWS bill. AWS provides a pricing calculator for estimating costs.

    Elasticsearch

    The mappings file contains all of the data types that will be indexed into Elasticsearch. Elasticsearch sizing is tied to your archive size, including your collections, granules, and workflow executions that will be stored.

    AWS provides documentation on calculating and configuring for sizing.

    In addition to size you'll want to consider the number of nodes which determine how the system reacts in the event of a failure.

    Configuration can be done in the data persistence module in elasticsearch_config and the cumulus module in es_index_shards.

    If you make changes to your Elasticsearch configuration you will need to reindex for those changes to take effect.
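Reindexing is performed through the Cumulus archive API. A hedged example, assuming the elasticsearch/reindex endpoint documented for the Cumulus API, your archive API root, and a valid access token:

curl -X POST "https://<archive_api_uri>/elasticsearch/reindex" \
  -H "Authorization: Bearer <access_token>"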

    EC2 instances and autoscaling groups

EC2 instances are used for long-running operations (e.g., generating a reconciliation report) and long-running workflow tasks. Configuration for your ECS cluster is achieved via Cumulus deployment variables.

    When configuring your ECS cluster consider:

    • The EC2 instance type and EBS volume size needed to accommodate your workloads. Configured as ecs_cluster_instance_type and ecs_cluster_instance_docker_volume_size.
    • The minimum and desired number of instances on hand to accommodate your workloads. Configured as ecs_cluster_min_size and ecs_cluster_desired_size.
    • The maximum number of instances you will need and are willing to pay for to accommodate your heaviest workloads. Configured as ecs_cluster_max_size.
    • Your autoscaling parameters: ecs_cluster_scale_in_adjustment_percent, ecs_cluster_scale_out_adjustment_percent, ecs_cluster_scale_in_threshold_percent, and ecs_cluster_scale_out_threshold_percent.

    Footnotes


    1. Run terraform init if:

      • This is the first time deploying the module
      • You have added any additional child modules, including Cumulus components
      • You have updated the source for any of the child modules

2. To add another redirect URI to your application: on the Earthdata home page, select "My Applications", scroll down to "Application Administration", and use the edit icon for your application. Then go to Manage -> Redirect URIs.

3. The API root can be found in a number of ways. The easiest is to note it in the output of the app deployment step, but you can also find it from the AWS console -> Amazon API Gateway -> APIs -> <prefix>-archive -> Dashboard, reading the URL at the top after "Invoke this API at".

PostgreSQL Database Deployment | Cumulus Documentation

…cumulus-rds-tf that will deploy an AWS RDS Aurora Serverless PostgreSQL 10.2 compatible database cluster, and optionally provision a single deployment database with credentialed secrets for use with Cumulus.

    We have provided an example terraform deployment using this module in the Cumulus template-deploy repository on github.

    Use of this example involves:

    • Creating/configuring a Terraform module directory
    • Using Terraform to deploy resources to AWS

    Requirements

    Configuration/installation of this module requires the following:

    • Terraform
    • git
    • A VPC configured for use with Cumulus Core. This should match the subnets you provide when Deploying Cumulus to allow Core's lambdas to properly access the database.
    • At least two subnets across multiple AZs. These should match the subnets you provide as configuration when Deploying Cumulus, and should be within the same VPC.

    Needed Git Repositories

    Assumptions

    OS/Environment

    The instructions in this module require Linux/MacOS. While deployment via Windows is possible, it is unsupported.

    Terraform

    This document assumes knowledge of Terraform. If you are not comfortable working with Terraform, the following links should bring you up to speed:

    For Cumulus specific instructions on installation of Terraform, refer to the main Cumulus Installation Documentation

    Aurora/RDS

    This document also assumes some basic familiarity with PostgreSQL databases, and Amazon Aurora/RDS. If you're unfamiliar consider perusing the AWS docs, and the Aurora Serverless V1 docs.

    Prepare deployment repository

    If you already are working with an existing repository that has a configured rds-cluster-tf deployment for the version of Cumulus you intend to deploy or update, or just need to configure this module for your repository, skip to Prepare AWS configuration.

    Clone the cumulus-template-deploy repo and name appropriately for your organization:

      git clone https://github.com/nasa/cumulus-template-deploy <repository-name>

    We will return to configuring this repo and using it for deployment below.

    Optional: Create a new repository

    Create a new repository on Github so that you can add your workflows and other modules to source control:

      git remote set-url origin https://github.com/<org>/<repository-name>
    git push origin master

    You can then add/commit changes as needed.

    Note: If you are pushing your deployment code to a git repo, make sure to add terraform.tf and terraform.tfvars to .gitignore, as these files will contain sensitive data related to your AWS account.


    Prepare AWS configuration

To deploy this module, make sure that you have completed the following steps from the Cumulus deployment instructions, in a similar fashion, for this module:

    --

    Configure and deploy the module

    When configuring this module, please keep in mind that unlike Cumulus deployment, this module should be deployed once to create the database cluster and only thereafter to make changes to that configuration/upgrade/etc. This module does not need to be re-deployed for each Core update.

    These steps should be executed in the rds-cluster-tf directory of the template deploy repo that you previously cloned. Run the following to copy the example files:

    cd rds-cluster-tf/
    cp terraform.tf.example terraform.tf
    cp terraform.tfvars.example terraform.tfvars

    In terraform.tf, configure the remote state settings by substituting the appropriate values for:

    • bucket
    • dynamodb_table
    • PREFIX (whatever prefix you've chosen for your deployment)

    Fill in the appropriate values in terraform.tfvars. See the rds-cluster-tf module variable definitions for more detail on all of the configuration options. A few notable configuration options are documented in the next section.

    Configuration Options

    • deletion_protection -- defaults to true. Set it to false if you want to be able to delete your cluster with a terraform destroy without manually updating the cluster.
    • db_admin_username -- cluster database administration username. Defaults to postgres.
    • db_admin_password -- required variable that specifies the admin user password for the cluster. To randomize this on each deployment, consider using a random_string resource as input.
    • region -- defaults to us-east-1.
    • subnets -- requires at least 2 across different AZs. For use with Cumulus, these AZs should match the values you configure for your lambda_subnet_ids.
    • max_capacity -- the max ACUs the cluster is allowed to use. Carefully consider cost/performance concerns when setting this value.
    • min_capacity -- the minimum ACUs the cluster will scale to
    • provision_user_database -- Optional flag to allow module to provision a user database in addition to creating the cluster. Described in the next section.

    Provision user and user database

    If you wish for the module to provision a PostgreSQL database on your new cluster and provide a secret for access in the module output, in addition to managing the cluster itself, the following configuration keys are required:

    • provision_user_database -- must be set to true, this configures the module to deploy a lambda that will create the user database, and update the provided configuration on deploy.
• permissions_boundary_arn -- the permissions boundary to use when creating the roles the provisioning lambda will need for access. This should, in most use cases, be the same one used for the Cumulus Core deployment.
    • rds_user_password -- the value to set the user password to
• prefix -- this value will be used to set a unique identifier for the ProvisionDatabase lambda, as well as to name the provisioned user/database.

    Once configured, the module will deploy the lambda, and run it on each provision, creating the configured database if it does not exist, updating the user password if that value has been changed, and updating the output user database secret.

    Setting provision_user_database to false after provisioning will not result in removal of the configured database, as the lambda is non-destructive as configured in this module.

    Please Note: This functionality is limited in that it will only provision a single database/user and configure a basic database, and should not be used in scenarios where more complex configuration is required.

    Initialize Terraform

    Run terraform init

    You should see output like:

    * provider.aws: version = "~> 2.32"

    Terraform has been successfully initialized!

    Deploy

    Run terraform apply to deploy the resources.

If re-applying this module, variables (e.g. engine_version, snapshot_identifier) that force a recreation of the database cluster may result in data loss if deletion protection is disabled. Examine the changeset carefully for resources that will be re-created/destroyed before applying.

    Review the changeset, and assuming it looks correct, type yes when prompted to confirm that you want to create all of the resources.

    Assuming the operation is successful, you should see output similar to the following (this example omits the creation of a user database/lambdas/security groups):

    terraform apply

    An execution plan has been generated and is shown below.
    Resource actions are indicated with the following symbols:
    + create

    Terraform will perform the following actions:

    # module.rds_cluster.aws_db_subnet_group.default will be created
    + resource "aws_db_subnet_group" "default" {
    + arn = (known after apply)
    + description = "Managed by Terraform"
    + id = (known after apply)
    + name = (known after apply)
    + name_prefix = "xxxxxxxxx"
    + subnet_ids = [
    + "subnet-xxxxxxxxx",
    + "subnet-xxxxxxxxx",
    ]
    + tags = {
    + "Deployment" = "xxxxxxxxx"
    }
    }

    # module.rds_cluster.aws_rds_cluster.cumulus will be created
    + resource "aws_rds_cluster" "cumulus" {
    + apply_immediately = true
    + arn = (known after apply)
    + availability_zones = (known after apply)
    + backup_retention_period = 1
    + cluster_identifier = "xxxxxxxxx"
    + cluster_identifier_prefix = (known after apply)
    + cluster_members = (known after apply)
    + cluster_resource_id = (known after apply)
    + copy_tags_to_snapshot = false
    + database_name = "xxxxxxxxx"
    + db_cluster_parameter_group_name = (known after apply)
    + db_subnet_group_name = (known after apply)
    + deletion_protection = true
    + enable_http_endpoint = true
    + endpoint = (known after apply)
    + engine = "aurora-postgresql"
    + engine_mode = "serverless"
    + engine_version = "10.12"
    + final_snapshot_identifier = "xxxxxxxxx"
    + hosted_zone_id = (known after apply)
    + id = (known after apply)
    + kms_key_id = (known after apply)
    + master_password = (sensitive value)
    + master_username = "xxxxxxxxx"
    + port = (known after apply)
    + preferred_backup_window = "07:00-09:00"
    + preferred_maintenance_window = (known after apply)
    + reader_endpoint = (known after apply)
    + skip_final_snapshot = false
    + storage_encrypted = (known after apply)
    + tags = {
    + "Deployment" = "xxxxxxxxx"
    }
    + vpc_security_group_ids = (known after apply)

    + scaling_configuration {
    + auto_pause = true
    + max_capacity = 4
    + min_capacity = 2
    + seconds_until_auto_pause = 300
    + timeout_action = "RollbackCapacityChange"
    }
    }

    # module.rds_cluster.aws_secretsmanager_secret.rds_login will be created
    + resource "aws_secretsmanager_secret" "rds_login" {
    + arn = (known after apply)
    + id = (known after apply)
    + name = (known after apply)
    + name_prefix = "xxxxxxxxx"
    + policy = (known after apply)
    + recovery_window_in_days = 30
    + rotation_enabled = (known after apply)
    + rotation_lambda_arn = (known after apply)
    + tags = {
    + "Deployment" = "xxxxxxxxx"
    }

    + rotation_rules {
    + automatically_after_days = (known after apply)
    }
    }

    # module.rds_cluster.aws_secretsmanager_secret_version.rds_login will be created
    + resource "aws_secretsmanager_secret_version" "rds_login" {
    + arn = (known after apply)
    + id = (known after apply)
    + secret_id = (known after apply)
    + secret_string = (sensitive value)
    + version_id = (known after apply)
    + version_stages = (known after apply)
    }

    # module.rds_cluster.aws_security_group.rds_cluster_access will be created
    + resource "aws_security_group" "rds_cluster_access" {
    + arn = (known after apply)
    + description = "Managed by Terraform"
    + egress = (known after apply)
    + id = (known after apply)
    + ingress = (known after apply)
    + name = (known after apply)
    + name_prefix = "cumulus_rds_cluster_access_ingress"
    + owner_id = (known after apply)
    + revoke_rules_on_delete = false
    + tags = {
    + "Deployment" = "xxxxxxxxx"
    }
    + vpc_id = "vpc-xxxxxxxxx"
    }

    # module.rds_cluster.aws_security_group_rule.rds_security_group_allow_PostgreSQL will be created
    + resource "aws_security_group_rule" "rds_security_group_allow_postgres" {
    + from_port = 5432
    + id = (known after apply)
    + protocol = "tcp"
    + security_group_id = (known after apply)
    + self = true
    + source_security_group_id = (known after apply)
    + to_port = 5432
    + type = "ingress"
    }

    Plan: 6 to add, 0 to change, 0 to destroy.

    Do you want to perform these actions?
    Terraform will perform the actions described above.
    Only 'yes' will be accepted to approve.

    Enter a value: yes

    module.rds_cluster.aws_db_subnet_group.default: Creating...
    module.rds_cluster.aws_security_group.rds_cluster_access: Creating...
    module.rds_cluster.aws_secretsmanager_secret.rds_login: Creating...

    Then, after the resources are created:

    Apply complete! Resources: X added, 0 changed, 0 destroyed.
    Releasing state lock. This may take a few moments...

    Outputs:

    admin_db_login_secret_arn = arn:aws:secretsmanager:us-east-1:xxxxxxxxx:secret:xxxxxxxxxx20210407182709367700000002-dpmdR
    admin_db_login_secret_version = xxxxxxxxx
    rds_endpoint = xxxxxxxxx.us-east-1.rds.amazonaws.com
    security_group_id = xxxxxxxxx
    user_credentials_secret_arn = arn:aws:secretsmanager:us-east-1:xxxxx:secret:xxxxxxxxxx20210407182709367700000002-dpmpXA

    Note the output values for admin_db_login_secret_arn (and optionally user_credentials_secret_arn), as these provide the AWS Secrets Manager secrets required to access the database as the administrative user and, optionally, the user database credentials Cumulus requires.

    The content of each of these secrets is of the form:

    {
    "database": "postgres",
    "dbClusterIdentifier": "clusterName",
    "engine": "postgres",
    "host": "xxx",
    "password": "defaultPassword",
    "port": 5432,
    "username": "xxx"
    }
    • database -- the PostgreSQL database used by the configured user
    • dbClusterIdentifier -- the value set by the cluster_identifier variable in the terraform module
    • engine -- the Aurora/RDS database engine
    • host -- the RDS service host for the database in the form (dbClusterIdentifier)-(AWS ID string).(region).rds.amazonaws.com
    • password -- the database password
    • username -- the account username
    • port -- the database connection port (should always be 5432)
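
    To verify what was stored, you can retrieve one of these secrets with the AWS CLI. The following is a minimal sketch assuming the AWS CLI is installed and configured for the deployment account; replace the placeholder ARN with the admin_db_login_secret_arn output shown above:

    # Print the administrative login secret as a JSON string
    aws secretsmanager get-secret-value \
    --secret-id "<admin_db_login_secret_arn>" \
    --query SecretString \
    --output text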

    Next Steps

    The database cluster has been created/updated! From here you can continue to add additional user accounts, databases and other database configuration.

    - + \ No newline at end of file diff --git a/docs/v13.4.0/deployment/share-s3-access-logs/index.html b/docs/v13.4.0/deployment/share-s3-access-logs/index.html index 0bf8127797a..d99e97192a5 100644 --- a/docs/v13.4.0/deployment/share-s3-access-logs/index.html +++ b/docs/v13.4.0/deployment/share-s3-access-logs/index.html @@ -5,14 +5,14 @@ Share S3 Access Logs | Cumulus Documentation - +
    Version: v13.4.0

    Share S3 Access Logs

    It is possible through Cumulus to share S3 access logs across multiple S3 packages using the S3 replicator package.

    S3 Replicator

    The S3 Replicator is a node package that contains a simple lambda function, associated permissions, and the Terraform instructions to replicate create-object events from one S3 bucket to another.

    First ensure that you have enabled S3 Server Access Logging.

    Next configure your config.tfvars as described in the s3-replicator/README.md to correspond to your deployment. The source_bucket and source_prefix are determined by how you enabled the S3 Server Access Logging.

    In order to deploy the s3-replicator with Cumulus, you will need to add the module to your Terraform main.tf definition, e.g.:

    module "s3-replicator" {
    source = "<path to s3-replicator.zip>"
    prefix = var.prefix
    vpc_id = var.vpc_id
    subnet_ids = var.subnet_ids
    permissions_boundary = var.permissions_boundary_arn
    source_bucket = var.s3_replicator_config.source_bucket
    source_prefix = var.s3_replicator_config.source_prefix
    target_bucket = var.s3_replicator_config.target_bucket
    target_prefix = var.s3_replicator_config.target_prefix
    }

    The terraform source package can be found on the Cumulus github release page under the asset tab terraform-aws-cumulus-s3-replicator.zip.

    ESDIS Metrics

    In the NGAP environment, the ESDIS Metrics team has set up an ELK stack to process logs from Cumulus instances. To use this system, you must deliver any S3 Server Access logs that Cumulus creates.

    Configure the S3 replicator as described above using the target_bucket and target_prefix provided by the metrics team.

    The metrics team has taken care of setting up Logstash to ingest the files that get delivered to their bucket into their Elasticsearch instance.

    - + \ No newline at end of file diff --git a/docs/v13.4.0/deployment/terraform-best-practices/index.html b/docs/v13.4.0/deployment/terraform-best-practices/index.html index b6043f97afe..e2b51c7853c 100644 --- a/docs/v13.4.0/deployment/terraform-best-practices/index.html +++ b/docs/v13.4.0/deployment/terraform-best-practices/index.html @@ -5,7 +5,7 @@ Terraform Best Practices | Cumulus Documentation - + @@ -88,7 +88,7 @@ AWS CLI command, replacing PREFIX with your deployment prefix name:

    aws resourcegroupstaggingapi get-resources \
    --query "ResourceTagMappingList[].ResourceARN" \
    --tag-filters Key=Deployment,Values=PREFIX

    Ideally, the output should be an empty list, but if it is not, then you may need to manually delete the listed resources.

    • Configuring the Cumulus deployment: link
    • Restoring a previous version: link

    - + \ No newline at end of file diff --git a/docs/v13.4.0/deployment/thin_egress_app/index.html b/docs/v13.4.0/deployment/thin_egress_app/index.html index 6d7849f2264..38be82e3580 100644 --- a/docs/v13.4.0/deployment/thin_egress_app/index.html +++ b/docs/v13.4.0/deployment/thin_egress_app/index.html @@ -5,7 +5,7 @@ Using the Thin Egress App for Cumulus distribution | Cumulus Documentation - + @@ -13,7 +13,7 @@
    Version: v13.4.0

    Using the Thin Egress App for Cumulus distribution

    The Thin Egress App (TEA) is an app running in Lambda that allows retrieving data from S3 using temporary links and provides URS integration.

    Configuring a TEA deployment

    TEA is deployed using Terraform modules. Refer to these instructions for guidance on how to integrate new components with your deployment.

    The cumulus-template-deploy repository cumulus-tf/main.tf contains a thin_egress_app module for distribution.

    The TEA module provides these instructions showing how to add it to your deployment and the following are instructions to configure the thin_egress_app module in your Cumulus deployment.

    Create a secret for signing Thin Egress App JWTs

    The Thin Egress App uses JWTs internally to authenticate requests and requires a secret stored in AWS Secrets Manager containing SSH keys that are used to sign the JWTs.

    See the Thin Egress App documentation on how to create this secret with the correct values. It will be used later to set the thin_egress_jwt_secret_name variable when deploying the Cumulus module.

    bucket_map.yaml

    The Thin Egress App uses a bucket_map.yaml file to determine which buckets to serve. Documentation of the file format is available here.

    The default Cumulus module generates a file at s3://${system_bucket}/distribution_bucket_map.json.

    The configuration file is a simple json mapping of the form:

    {
    "daac-public-data-bucket": "/path/to/this/kind/of/data"
    }

    Please note: Cumulus only supports a one-to-one mapping of bucket->TEA path for 'distribution' buckets.
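
    To inspect the generated mapping, you can copy it from S3 to stdout. This is a sketch assuming the AWS CLI is configured and <system_bucket> is replaced with your deployment's system bucket name:

    # Stream the generated bucket map to stdout
    aws s3 cp "s3://<system_bucket>/distribution_bucket_map.json" -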

    Optionally configure a custom bucket map

    A simple config would look something like this:

    bucket_map.yaml
    MAP:
      my-protected: my-protected
      my-public: my-public

    PUBLIC_BUCKETS:
      - my-public

    Please note: your custom bucket map must include mappings for all of the protected and public buckets specified in the buckets variable in cumulus-tf/terraform.tfvars, otherwise Cumulus may not be able to determine the correct distribution URL for ingested files and you may encounter errors.

    Optionally configure shared variables

    The cumulus module deploys certain components that interact with TEA. As a result, the cumulus module requires that if you are specifying a value for the stage_name variable to the TEA module, you must use the same value for the tea_api_gateway_stage variable to the cumulus module.

    One way to keep these variable values in sync across the modules is to use Terraform local values to define values to use for the variables for both modules. This approach is shown in the Cumulus core example deployment code.

    - + \ No newline at end of file diff --git a/docs/v13.4.0/deployment/upgrade-readme/index.html b/docs/v13.4.0/deployment/upgrade-readme/index.html index 37be7274f20..48f64d31805 100644 --- a/docs/v13.4.0/deployment/upgrade-readme/index.html +++ b/docs/v13.4.0/deployment/upgrade-readme/index.html @@ -5,7 +5,7 @@ Upgrading Cumulus | Cumulus Documentation - + @@ -15,7 +15,7 @@ deployment functions correctly. Please refer to some recommended smoke tests given above, and consider additional tests appropriate for your particular deployment and environment.

    Update Cumulus Dashboard

    If there are breaking (or otherwise significant) changes to the Cumulus API, you should also upgrade your Cumulus Dashboard deployment to use the version of the Cumulus API matching the version of Cumulus to which you are migrating.

    - + \ No newline at end of file diff --git a/docs/v13.4.0/development/forked-pr/index.html b/docs/v13.4.0/development/forked-pr/index.html index aabf0fdd5be..6a6515ca45d 100644 --- a/docs/v13.4.0/development/forked-pr/index.html +++ b/docs/v13.4.0/development/forked-pr/index.html @@ -5,13 +5,13 @@ Issuing PR From Forked Repos | Cumulus Documentation - +
    Version: v13.4.0

    Issuing PR From Forked Repos

    Fork the Repo

    • Fork the Cumulus repo
    • Create a new branch from the branch you'd like to contribute to
    • If an issue doesn't already exist, submit one (see above)

    Create a Pull Request

    Reviewing PRs from Forked Repos

    Upon submission of a pull request, the Cumulus development team will review the code.

    Once the code passes an initial review, the team will run the CI tests against the proposed update.

    The request will then either be merged, declined, or an adjustment to the code will be requested via the issue opened with the original PR request.

    PRs from forked repos cannot be merged directly to master. Cumulus reviewers must follow these steps before completing the review process:

    1. Create a new branch:

        git checkout -b from-<name-of-the-branch> master
    2. Push the new branch to GitHub

    3. Change the destination of the forked PR to the new branch that was just pushed

      Screenshot of Github interface showing how to change the base branch of a pull request

    4. After code review and approval, merge the forked PR to the new branch.

    5. Create a PR for the new branch to master.

    6. If the CI tests pass, merge the new branch to master and close the issue. If the CI tests do not pass, request an amended PR from the original author or resolve the failures as appropriate.

    - + \ No newline at end of file diff --git a/docs/v13.4.0/development/integration-tests/index.html b/docs/v13.4.0/development/integration-tests/index.html index d1f0bf25eca..23d11b9d09e 100644 --- a/docs/v13.4.0/development/integration-tests/index.html +++ b/docs/v13.4.0/development/integration-tests/index.html @@ -5,7 +5,7 @@ Integration Tests | Cumulus Documentation - + @@ -19,7 +19,7 @@ in the commit message.

    If you create a new stack and want to be able to run integration tests against it in CI, you will need to add it to bamboo/select-stack.js.

    - + \ No newline at end of file diff --git a/docs/v13.4.0/development/quality-and-coverage/index.html b/docs/v13.4.0/development/quality-and-coverage/index.html index a634a0aabe7..5df1a49fed7 100644 --- a/docs/v13.4.0/development/quality-and-coverage/index.html +++ b/docs/v13.4.0/development/quality-and-coverage/index.html @@ -5,7 +5,7 @@ Code Coverage and Quality | Cumulus Documentation - + @@ -23,7 +23,7 @@ here.

    To run linting on the markdown files, run npm run lint-md.

    Audit

    This project uses audit-ci to run a security audit on the package dependency tree. This must pass prior to merge. The configured rules for audit-ci can be found here.

    To execute an audit, run npm run audit.

    - + \ No newline at end of file diff --git a/docs/v13.4.0/development/release/index.html b/docs/v13.4.0/development/release/index.html index 7d81cee79d4..d479382d59f 100644 --- a/docs/v13.4.0/development/release/index.html +++ b/docs/v13.4.0/development/release/index.html @@ -5,7 +5,7 @@ Versioning and Releases | Cumulus Documentation - + @@ -15,7 +15,7 @@ It's useful to use the search feature of your code editor or grep to see if there any references to the old package versions. In bash shell you can run

        find . -name package.json -exec grep -nH "@cumulus/.*[0-9]*\.[0-9]\.[0-9].*" {} \; | grep -v "@cumulus/.*MAJOR\.MINOR\.PATCH.*"

    e.g.:
    find . -name package.json -exec grep -nH "@cumulus/.*[0-9]*\.[0-9]\.[0-9].*" {} \; | grep -v "@cumulus/.*13\.1\.0.*"

    Verify that no results are returned where MAJOR, MINOR, or PATCH differ from the intended version, and no outdated -alpha or -beta versions are specified.

    3. Check Cumulus Dashboard PRs for Version Bump

    There may be unreleased changes in the Cumulus Dashboard project that rely on this unreleased Cumulus Core version.

    If there exists a PR in the cumulus-dashboard repo with a name containing "Version Bump for Next Cumulus API Release":

    • There will be a placeholder change-me value that should be replaced with the Cumulus Core to-be-released-version.
    • Mark that PR as ready to be reviewed.

    4. Update CHANGELOG.md

    Update the CHANGELOG.md. Put a header under the Unreleased section with the new version number and the date.

    Add a link reference for the github "compare" view at the bottom of the CHANGELOG.md, following the existing pattern. This link reference should create a link in the CHANGELOG's release header to changes in the corresponding release.
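
    For example, a release header for a hypothetical v9.1.0 could be backed by a link reference like the following at the bottom of CHANGELOG.md (the version numbers here are illustrative only):

    [v9.1.0]: https://github.com/nasa/cumulus/compare/v9.0.1...v9.1.0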

    5. Update DATA_MODEL_CHANGELOG.md

    Similar to #4, make sure the DATA_MODEL_CHANGELOG is updated if there are data model changes in the release, and the link reference at the end of the document is updated as appropriate.

    6. Update CONTRIBUTORS.md

    ./bin/update-contributors.sh
    git add CONTRIBUTORS.md

    Commit and push these changes, if any.

    7. Update Cumulus package API documentation

    Update auto-generated API documentation for any Cumulus packages that have it:

    npm run docs-build-packages

    Commit and push these changes, if any.

    8. Cut new version of Cumulus Documentation

    If this is a backport, do not create a new version of the documentation. For various reasons, we do not merge backports back to master, other than changelog notes. Documentation changes for backports will not be published to our documentation website.

    cd website
    npm run version ${release_version}
    git add .

    Where ${release_version} corresponds to the version tag v1.2.3, for example.

    Commit and push these changes.

    9. Create a pull request against the minor version branch

    1. Push the release branch (e.g. release-1.2.3) to GitHub.

    2. Create a PR against the minor version base branch (e.g. release-1.2.x).

    3. Configure Bamboo to run automated tests against this PR by finding the branch plan for the release branch (release-1.2.3) and setting only these variables:

      • GIT_PR: true
      • SKIP_AUDIT: true

      IMPORTANT: Do NOT set the PUBLISH_FLAG variable to true for this branch plan. The actual publishing of the release will be handled by a separate, manually triggered branch plan.

      Screenshot of Bamboo CI interface showing the configuration of the GIT_PR branch variable to have a value of &quot;true&quot;

    4. Verify that the Bamboo build for the PR succeeds and then merge to the minor version base branch (release-1.2.x).

      • It is safe to do a squash merge in this instance, but not required
    5. You may delete your release branch (release-1.2.3) after merging to the base branch.

    10. Create a git tag for the release

    Check out the minor version base branch (release-1.2.x) now that your changes are merged in and do a git pull.

    Ensure you are on the latest commit.

    Create and push a new git tag:

        git tag -a vMAJOR.MINOR.PATCH -m "Release MAJOR.MINOR.PATCH"
    git push origin vMAJOR.MINOR.PATCH

    e.g.:
    git tag -a v9.1.0 -m "Release 9.1.0"
    git push origin v9.1.0

    11. Publishing the release

    Publishing of new releases is handled by a custom Bamboo branch plan and is manually triggered.

    The reasons for using a separate branch plan to handle releases instead of the branch plan for the minor version (e.g. release-1.2.x) are:

    • The Bamboo build for the minor version release branch is triggered automatically on any commits to that branch, whereas we want to manually control when the release is published.
    • We want to verify that integration tests have passed on the Bamboo build for the minor version release branch before we manually trigger the release, so that we can be sure that our code is safe to release.

    If this is a new minor version branch, then you will need to create a new Bamboo branch plan for publishing the release following the instructions below:

    Creating a Bamboo branch plan for the release

    • In the Cumulus Core project (https://ci.earthdata.nasa.gov/browse/CUM-CBA), click Actions -> Configure Plan in the top right.

    • Next to Plan branch click the rightmost button that displays Create Plan Branch upon hover.

    • Click Create plan branch manually.

    • Add the values in that list. Choose a display name that makes it very clear this is a deployment branch plan. Release (minor version branch name) seems to work well (e.g. Release (1.2.x)).

      • Make sure you enter the correct branch name (e.g. release-1.2.x).
    • Important Deselect Enable Branch - if you do not do this, it will immediately fire off a build.

    • Do this immediately: on the Branch Details page, enable Change trigger and set the Trigger type to manual; this will prevent commits to the branch from triggering the build plan. You should have been redirected to the Branch Details tab after creating the plan. If not, navigate to the branch from the list where you clicked Create Plan Branch in the previous step.

    • Go to the Variables tab. Ensure that you are on your branch plan and not the master plan: you should not see a large list of configured variables, but instead a dropdown allowing you to select variables to override, and the tab title will be Branch Variables. Then set the branch variables as follows:

      • DEPLOYMENT: cumulus-from-npm-tf (except in special cases such as incompatible backport branches)
        • If this variable is not set, it will default to the deployment name for the last committer on the branch
      • USE_CACHED_BOOTSTRAP: false
      • USE_TERRAFORM_ZIPS: true (IMPORTANT: MUST be set in order to run integration tests against the .zip files published during the build so that we are actually testing our released files)
      • GIT_PR: true
      • SKIP_AUDIT: true
      • PUBLISH_FLAG: true
    • Enable the branch from the Branch Details page.

    • Run the branch using the Run button in the top right.

    Bamboo will build and run lint and unit tests against that tagged release, publish the new packages to NPM, and then run the integration tests using those newly released packages.

    12. Create a new Cumulus release on github

    The CI release scripts will automatically create a GitHub release based on the release version tag, as well as upload artifacts to the Github release for the Terraform modules provided by Cumulus. The Terraform release artifacts include:

    • A multi-module Terraform .zip artifact containing filtered copies of the tf-modules, packages, and tasks directories for use as Terraform module sources.
    • An S3 replicator module
    • A workflow module
    • A distribution API module
    • An ECS service module

    Just make sure to verify the appropriate .zip files are present on Github after the release process is complete.

    13. Merge base branch back to master

    Finally, you need to reproduce the version update changes back to master.

    If this is the latest version, you can simply create a PR to merge the minor version base branch back to master.

    Do not merge master back into the release branch since we want the release branch to just have the code from the release. Instead, create a new branch off of the release branch and merge that to master. You can freely merge master into this branch and delete it when it is merged to master.

    If this is a backport, you will need to create a PR that ports the changelog updates back to master. It is important in this changelog note to call it out as a backport. For example, fixes in backport version 1.14.5 may not be available in 1.15.0 because the fix was introduced in 1.15.3.

    Troubleshooting

    Delete and regenerate the tag

    To delete a published tag to re-tag, follow these steps:

      git tag -d vMAJOR.MINOR.PATCH
    git push -d origin vMAJOR.MINOR.PATCH

    e.g.:
    git tag -d v9.1.0
    git push -d origin v9.1.0
    - + \ No newline at end of file diff --git a/docs/v13.4.0/docs-how-to/index.html b/docs/v13.4.0/docs-how-to/index.html index acfaf7f6d55..32380789ee3 100644 --- a/docs/v13.4.0/docs-how-to/index.html +++ b/docs/v13.4.0/docs-how-to/index.html @@ -5,13 +5,13 @@ Cumulus Documentation: How To's | Cumulus Documentation - +
    Version: v13.4.0

    Cumulus Documentation: How To's

    Cumulus Docs Installation

    Run a Local Server

    Environment variables DOCSEARCH_API_KEY and DOCSEARCH_INDEX_NAME must be set for search to work. At the moment, search is only truly functional on prod because that is the only website we have registered to be indexed with DocSearch (see below on search).
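
    For example, you might export placeholder values in your shell before building or serving the site (the values shown are hypothetical and must be replaced with the credentials provided by DocSearch):

    # DocSearch credentials required by the docs build/serve scripts
    export DOCSEARCH_API_KEY="<your-docsearch-api-key>"
    export DOCSEARCH_INDEX_NAME="<your-docsearch-index-name>"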

    git clone git@github.com:nasa/cumulus
    cd cumulus
    npm run docs-install
    npm run docs-serve

    Note: docs-build will build the documents into website/build.

    Cumulus Documentation

    Our project documentation is hosted on GitHub Pages. The resources published to this website are housed in the docs/ directory at the top of the Cumulus repository. Those resources primarily consist of markdown files and images.

    We use the open-source static website generator Docusaurus to build html files from our markdown documentation, add some organization and navigation, and provide some other niceties in the final website (search, easy templating, etc.).

    Add a New Page and Sidebars

    Adding a new page should be as simple as writing some documentation in markdown, placing it under the correct directory in the docs/ folder and adding some configuration values wrapped by --- at the top of the file. There are many files that already have this header which can be used as reference.

    ---
    id: doc-unique-id # unique id for this document. This must be unique across ALL documentation under docs/
    title: Title Of Doc # Whatever title you feel like adding. This will show up as the index to this page on the sidebar.
    hide_title: false
    ---

    Note: To have the new page show up in a sidebar the designated id must be added to a sidebar in the website/sidebars.js file. Docusaurus has an in depth explanation of sidebars here.

    Versioning Docs

    We lean heavily on Docusaurus for versioning. Their suggestions and walk-through can be found here. It is worth noting that we would like the Documentation versions to match up directly with release versions. Cumulus versioning is explained in the Versioning Docs.

    Search on our documentation site is taken care of by DocSearch. We have been provided with an apiKey and an indexName by DocSearch that we include in our website/siteConfig.js file. The rest, indexing and actual searching, we leave to DocSearch. Our builds expect environment variables for both these values to exist - DOCSEARCH_API_KEY and DOCSEARCH_INDEX_NAME.

    Add a new task

    The tasks list in docs/tasks.md is generated from the list of task packages in the tasks folder. Do not edit the docs/tasks.md file directly.

    Read more about adding a new task.

    Editing the tasks.md header or template

    Look at the bin/build-tasks-doc.js and bin/tasks-header.md files to edit the output of the tasks build script.

    Editing diagrams

    For some diagrams included in the documentation, the raw source is included in the docs/assets/raw directory to allow for easy updating in the future:

    • assets/interfaces.svg -> assets/raw/interfaces.drawio (generated using draw.io)

    Deployment

    The master branch is automatically built and deployed to the gh-pages branch, which is served by GitHub Pages. Do not make edits to the gh-pages branch.

    - + \ No newline at end of file diff --git a/docs/v13.4.0/external-contributions/index.html b/docs/v13.4.0/external-contributions/index.html index 9103d729a76..822d6256991 100644 --- a/docs/v13.4.0/external-contributions/index.html +++ b/docs/v13.4.0/external-contributions/index.html @@ -5,13 +5,13 @@ External Contributions | Cumulus Documentation - +
    Version: v13.4.0

    External Contributions

    Contributions to Cumulus may be made in the form of PRs to the repositories directly or through externally developed tasks and components. Cumulus is designed as an ecosystem that leverages Terraform deployments and AWS Step Functions to easily integrate external components.

    This list may not be exhaustive and represents components that are open source, owned externally, and that have been tested with the Cumulus system. For more information and contributing guidelines, visit the respective GitHub repositories.

    Distribution

    The ASF Thin Egress App is used by Cumulus for distribution. TEA can be deployed with Cumulus or as part of other applications to distribute data.

    Operational Cloud Recovery Archive (ORCA)

    ORCA can be deployed with Cumulus to provide a customizable baseline for creating and managing operational backups.

    Workflow Tasks

    CNM

    PO.DAAC provides two workflow tasks to be used with the Cloud Notification Mechanism (CNM) Schema: CNM to Granule and CNM Response.

    See the CNM workflow data cookbook for an example of how these can be used in a Cumulus ingest workflow.

    DMR++ Generation

    GHRC has provided a DMR++ Generation workflow task. This task is meant to be used in conjunction with Cumulus' Hyrax Metadata Updates workflow task.

    - + \ No newline at end of file diff --git a/docs/v13.4.0/faqs/index.html b/docs/v13.4.0/faqs/index.html index 21ad3911cd2..393093e0cce 100644 --- a/docs/v13.4.0/faqs/index.html +++ b/docs/v13.4.0/faqs/index.html @@ -5,13 +5,13 @@ Frequently Asked Questions | Cumulus Documentation - +
    Version: v13.4.0

    Frequently Asked Questions

    Below are some commonly asked questions that you may encounter that can assist you along the way when working with Cumulus.

    General

    How do I deploy a new instance in Cumulus?

    Answer: For steps on the Cumulus deployment process go to How to Deploy Cumulus.

    What prerequisites are needed to setup Cumulus?

    Answer: You will need access to the AWS console and an Earthdata login before you can deploy Cumulus.

    What is the preferred web browser for the Cumulus environment?

    Answer: Our preferred web browser is the latest version of Google Chrome.

    How do I quickly troubleshoot an issue in Cumulus?

    Answer: To troubleshoot and fix issues in Cumulus reference our recommended solutions in Troubleshooting Cumulus.

    Where can I get support help?

    Answer: The following options are available for assistance:

    • Cumulus: Users outside NASA should file a GitHub issue and users inside NASA should file a JIRA issue.
    • AWS: You can create a case in the AWS Support Center, accessible via your AWS Console.

    Integrators & Developers

    What is a Cumulus integrator?

    Answer: Those who are working within Cumulus and AWS for deployments and to manage workflows. They may perform the following functions:

    • Configure and deploy Cumulus to the AWS environment
    • Configure Cumulus workflows
    • Write custom workflow tasks
    What are the steps if I run into an issue during deployment?

    Answer: If you encounter an issue with your deployment go to the Troubleshooting Deployment guide.

    Is Cumulus customizable and flexible?

    Answer: Yes. Cumulus is a modular architecture that allows you to decide which components that you want/need to deploy. These components are maintained as Terraform modules.

    What are Terraform modules?

    Answer: They are modules that are composed to create a Cumulus deployment, which gives integrators the flexibility to choose the components of Cumulus that they want/need. To view Cumulus-maintained modules or steps on how to create a module, go to Terraform modules.

    Where do I find Terraform module variables?

    Answer: Go here for a list of Cumulus maintained variables.

    What is a Cumulus workflow?

    Answer: A workflow is a provider-configured set of steps that describe the process to ingest data. Workflows are defined using AWS Step Functions. For more details, we suggest visiting here.

    How do I set up a Cumulus workflow?

    Answer: You will need to create a provider, have an associated collection (add a new one), and generate a new rule first. Then you can set up a Cumulus workflow by following these steps here.

    What are the common use cases that a Cumulus integrator encounters?

    Answer: The following are some examples of possible use cases you may see:


    Operators

    What is a Cumulus operator?

    Answer: Those who ingest, archive, and troubleshoot datasets (called collections in Cumulus). Your daily activities might include, but are not limited to, the following:

    • Ingesting datasets
    • Maintaining historical data ingest
    • Starting and stopping data handlers
    • Managing collections
    • Managing provider definitions
    • Creating, enabling, and disabling rules
    • Investigating errors for granules and deleting or re-ingesting granules
    • Investigating errors in executions and isolating failed workflow step(s)
    What are the common use cases that a Cumulus operator encounters?

    Answer: The following are some examples of possible use cases you may see:

    Can you re-run a workflow execution in AWS?

    Answer: Yes. For steps on how to re-run a workflow execution go to Re-running workflow executions in the Cumulus Operator Docs.

    - + \ No newline at end of file diff --git a/docs/v13.4.0/features/ancillary_metadata/index.html b/docs/v13.4.0/features/ancillary_metadata/index.html index 4e519f892b8..556eaa23d26 100644 --- a/docs/v13.4.0/features/ancillary_metadata/index.html +++ b/docs/v13.4.0/features/ancillary_metadata/index.html @@ -5,7 +5,7 @@ Ancillary Metadata Export | Cumulus Documentation - + @@ -13,7 +13,7 @@
    Version: v13.4.0

    Ancillary Metadata Export

    This feature utilizes the type key on a files object in a Cumulus granule. It uses the key to provide a mechanism where granule discovery, processing and other tasks can set and use this value to facilitate metadata export to CMR.

    Tasks setting type

    Discover Granules

    Uses the Collection type key to set the value for files on discovered granules in its output.

    Parse PDR

    Uses a task-specific mapping to map PDR 'FILE_TYPE' to a CNM type to set type on granules from the PDR.

    CNMToCMALambdaFunction

    Natively supports types that are included in incoming messages to a CNM Workflow.

    Tasks using type

    Move Granules

    Uses the granule file type key to update UMM/ECHO 10 CMR files passed in as candidates to the task. This task adds the external facing URLs to the CMR metadata file based on the type. See the file tracking data cookbook for a detailed mapping. If a non-CNM type is specified, the task assumes it is a 'data' file.

    - + \ No newline at end of file diff --git a/docs/v13.4.0/features/backup_and_restore/index.html b/docs/v13.4.0/features/backup_and_restore/index.html index 00d1d6f1ffd..84b8190a5d4 100644 --- a/docs/v13.4.0/features/backup_and_restore/index.html +++ b/docs/v13.4.0/features/backup_and_restore/index.html @@ -5,7 +5,7 @@ Cumulus Backup and Restore | Cumulus Documentation - + @@ -52,7 +52,7 @@ writing to the old cluster.

  • Set the snapshot_identifier variable to the snapshot you wish to create, and configure the module like a new deployment, with a unique cluster_identifier

  • Deploy the module using terraform apply

  • Once deployed, verify the cluster has the expected data

  • Redeploy the data persistence and Cumulus deployments - You should not need to reconfigure either, as the secret ARN and the security group should not change, however double-check the configured values are as expected

  • - + \ No newline at end of file diff --git a/docs/v13.4.0/features/dead_letter_archive/index.html b/docs/v13.4.0/features/dead_letter_archive/index.html index 01c6d49f9c9..e8135e2afba 100644 --- a/docs/v13.4.0/features/dead_letter_archive/index.html +++ b/docs/v13.4.0/features/dead_letter_archive/index.html @@ -5,13 +5,13 @@ Cumulus Dead Letter Archive | Cumulus Documentation - +
    Version: v13.4.0

    Cumulus Dead Letter Archive

    This documentation explains the Cumulus dead letter archive and associated functionality.

    DB Records DLQ Archive

    The Cumulus system contains a number of dead letter queues. Perhaps the most important system lambda function supported by a DLQ is the sfEventSqsToDbRecords lambda function which parses Cumulus messages from workflow executions to generate and write database records to the Cumulus database.

    As of Cumulus v9+, the dead letter queue for this lambda (named sfEventSqsToDbRecordsDeadLetterQueue) has been updated with a consumer lambda that will automatically write any incoming records to the S3 system bucket, under the path <stackName>/dead-letter-archive/sqs/. This will allow integrators and operators engaged in debugging missing records to inspect any Cumulus messages which failed to process and did not result in the successful creation of database records.
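
    For example, archived messages for a deployment could be listed with the AWS CLI as sketched below, where the bucket and stack name are placeholders for your deployment's values:

    # List Cumulus messages that failed processing and were archived to S3
    aws s3 ls "s3://<system-bucket>/<stackName>/dead-letter-archive/sqs/"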

    Dead Letter Archive recovery

    In addition to the above, as of Cumulus v9+, the Cumulus API also contains a new endpoint at /deadLetterArchive/recoverCumulusMessages.

    Sending a POST request to this endpoint will trigger a Cumulus AsyncOperation that will attempt to reprocess (and if successful delete) all Cumulus messages in the dead letter archive, using the same underlying logic as the existing sfEventSqsToDbRecords. Otherwise, all Cumulus messages that fail to be reprocessed will be moved to a new archive location under the path <stackName>/dead-letter-archive/failed-sqs/<YYYY-MM-DD>.
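
    A minimal sketch of triggering the recovery with curl is shown below; CUMULUS_API_URL and TOKEN are assumed placeholders for your Cumulus archive API URL and a valid access token:

    # Start an async operation to reprocess the dead letter archive
    curl -X POST \
    -H "Authorization: Bearer $TOKEN" \
    "$CUMULUS_API_URL/deadLetterArchive/recoverCumulusMessages"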

    This endpoint may prove particularly useful when recovering from an extended or unexpected database outage, where messages failed to process due to the external outage and there is no essential malformation of each Cumulus message.

    - + \ No newline at end of file diff --git a/docs/v13.4.0/features/dead_letter_queues/index.html b/docs/v13.4.0/features/dead_letter_queues/index.html index ff520762e1a..a562b4e7ebf 100644 --- a/docs/v13.4.0/features/dead_letter_queues/index.html +++ b/docs/v13.4.0/features/dead_letter_queues/index.html @@ -5,13 +5,13 @@ Dead Letter Queues | Cumulus Documentation - +
    Version: v13.4.0

    Dead Letter Queues

    startSF SQS queue

    The workflow-trigger for the startSF queue has a Redrive Policy set up that directs any failed attempts to pull from the workflow start queue to an SQS Dead Letter Queue.

    This queue can then be monitored for failures to initiate a workflow. Please note that workflow failures will not show up in this queue, only repeated failure to trigger a workflow.

    Named Lambda Dead Letter Queues

    Cumulus provides configured Dead Letter Queues (DLQ) for non-workflow Lambdas (such as ScheduleSF) to capture Lambda failures for further processing.

    These DLQs are setup with the following configuration:

      receive_wait_time_seconds  = 20
    message_retention_seconds = 1209600
    visibility_timeout_seconds = 60

    Default Lambda Configuration

    The following built-in Cumulus Lambdas are setup with DLQs to allow handling of process failures:

    • dbIndexer (Updates Elasticsearch)
    • JobsLambda (writes logs outputs to Elasticsearch)
    • ScheduleSF (the SF Scheduler Lambda that places messages on the queue that is used to start workflows, see Workflow Triggers)
    • publishReports (Lambda that publishes messages to the SNS topics for execution, granule and PDR reporting)
    • reportGranules, reportExecutions, reportPdrs (Lambdas responsible for updating records based on messages in the queues published by publishReports)

    Troubleshooting/Utilizing messages in a Dead Letter Queue

    Ideally an automated process should be configured to poll the queue and process messages off a dead letter queue.

    For aid in manually troubleshooting, you can utilize the SQS Management console to view messages available in the queues set up for a particular stack. The dead letter queues will have a Message Body containing the Lambda payload, as well as Message Attributes that reference both the error returned and a RequestID which can be cross-referenced to the associated Lambda's CloudWatch logs for more information:

    Screenshot of the AWS SQS console showing how to view SQS message attributes

    - + \ No newline at end of file diff --git a/docs/v13.4.0/features/distribution-metrics/index.html b/docs/v13.4.0/features/distribution-metrics/index.html index 83b6a51c385..dff2844bec2 100644 --- a/docs/v13.4.0/features/distribution-metrics/index.html +++ b/docs/v13.4.0/features/distribution-metrics/index.html @@ -5,13 +5,13 @@ Cumulus Distribution Metrics | Cumulus Documentation - +
    Version: v13.4.0

    Cumulus Distribution Metrics

    It is possible to configure Cumulus and the Cumulus Dashboard to display information about the successes and failures of requests for data. This requires the Cumulus instance to deliver Cloudwatch Logs and S3 Server Access logs to an ELK stack.

    ESDIS Metrics in NGAP

    Work with the ESDIS metrics team to set up permissions and access to forward Cloudwatch Logs to a shared AWS:Logs:Destination as well as transferring your S3 Server Access logs to a metrics team bucket.

    The metrics team has taken care of setting up logstash to ingest the files that get delivered to their bucket into their Elasticsearch instance.

    Once Cumulus has been configured to deliver Cloudwatch logs to the ESDIS Metrics team, you can use the Elasticsearch indexes to create the necessary target patterns on the dashboard. These are often <daac>-cloudwatch-cumulus-<env>-* and <daac>-distribution-<env>-*, but they will depend on your specific Elasticsearch setup.

    Cumulus / ESDIS Metrics distribution system

    Architecture diagram showing how logs are replicated from a Cumulus instance to the ESDIS Metrics account and accessed by the Cumulus dashboard

    - + \ No newline at end of file diff --git a/docs/v13.4.0/features/execution_payload_retention/index.html b/docs/v13.4.0/features/execution_payload_retention/index.html index 718959d42fa..dff5e2730e3 100644 --- a/docs/v13.4.0/features/execution_payload_retention/index.html +++ b/docs/v13.4.0/features/execution_payload_retention/index.html @@ -5,13 +5,13 @@ Execution Payload Retention | Cumulus Documentation - +
    Version: v13.4.0

    Execution Payload Retention

    In addition to CloudWatch logs and AWS StepFunction API records, Cumulus automatically stores the initial and 'final' (the last update to the execution record) payload values as part of the Execution record in your RDS database and Elasticsearch.

    This allows access via the API (or optionally direct DB/Elasticsearch querying) for debugging/reporting purposes. The data is stored in the "originalPayload" and "finalPayload" fields.

    Payload record cleanup

    To reduce storage requirements, a CloudWatch rule ({stack-name}-dailyExecutionPayloadCleanupRule) triggering a daily run of the provided cleanExecutions lambda has been added. This lambda will remove all 'completed' and 'non-completed' payload records in the database that are older than the configured thresholds described below.

    Configuration

    The following configuration flags have been made available in the cumulus module. They may be overridden in your deployment's instance of the cumulus module by adding the following configuration options:

    daily_execution_payload_cleanup_schedule_expression (string)

    This configuration option sets the execution times for this Lambda to run, using a Cloudwatch cron expression.

    Default value is "cron(0 4 * * ? *)".

    complete_execution_payload_timeout_disable (bool)

    This configuration option, when set to true, will disable all cleanup of completed execution payloads.

    Default value is false.

    complete_execution_payload_timeout (number)

    This flag defines the cleanup threshold for executions with a 'completed' status in days. Records with updatedAt values older than this with payload information will have that information removed.

    Default value is 10.

    non_complete_execution_payload_timeout_disable (bool)

    This configuration option, when set to true, will disable all cleanup of "non-complete" (any status other than completed) execution payloads.

    Default value is false.

    non_complete_execution_payload_timeout (number)

    This flag defines the cleanup threshold, in days, for executions with a status other than 'completed'. Records with updatedAt values older than this with payload information will have that information removed.

    Default value is 30 days.

    • complete_execution_payload_disable/non_complete_execution_payload_disable

    These flags (true/false) determine if the cleanup script's logic for 'complete' and 'non-complete' executions will run. Default value is false for both.

    - + \ No newline at end of file diff --git a/docs/v13.4.0/features/logging-esdis-metrics/index.html b/docs/v13.4.0/features/logging-esdis-metrics/index.html index e999d4dbae1..550bf37cb1f 100644 --- a/docs/v13.4.0/features/logging-esdis-metrics/index.html +++ b/docs/v13.4.0/features/logging-esdis-metrics/index.html @@ -5,13 +5,13 @@ Writing logs for ESDIS Metrics | Cumulus Documentation - +
    Version: v13.4.0

    Writing logs for ESDIS Metrics

    Note: This feature is only available for Cumulus deployments in NGAP environments.

    Prerequisite: You must configure your Cumulus deployment to deliver your logs to the correct shared logs destination for ESDIS metrics.

    Log messages delivered to the ESDIS metrics logs destination conforming to an expected format will be automatically ingested and parsed to enable helpful searching/filtering of your logs via the ESDIS metrics Kibana dashboard.

    Expected log format

    The ESDIS metrics pipeline expects a log message to be a JSON string representation of an object (dict in Python or map in Java). An example log message might look like:

    {
    "level": "info",
    "executions": "arn:aws:states:us-east-1:000000000000:execution:MySfn:abcd1234",
    "granules": "[\"granule-1\",\"granule-2\"]",
    "message": "hello world",
    "sender": "greetingFunction",
    "stackName": "myCumulus",
    "timestamp": "2018-10-19T19:12:47.501Z"
    }

    A log message can contain the following properties:

    • executions: The AWS Step Function execution name in which this task is executing, if any
    • granules: A JSON string of the array of granule IDs being processed by this code, if any
    • level: A string identifier for the type of message being logged. Possible values:
      • debug
      • error
      • fatal
      • info
      • warn
      • trace
    • message: String containing your actual log message
    • parentArn: The parent AWS Step Function execution ARN that triggered the current execution, if any
    • sender: The name of the resource generating the log message (e.g. a library name, a Lambda function name, an ECS activity name)
    • stackName: The unique prefix for your Cumulus deployment
    • timestamp: An ISO-8601 formatted timestamp
    • version: The version of the resource generating the log message, if any

    None of these properties are explicitly required for ESDIS metrics to parse your log correctly. However, a log without a message has no informational content. And having level, sender, and timestamp properties is very useful for filtering your logs. Including a stackName in your logs is helpful as it allows you to distinguish between logs generated by different deployments.

    Using Cumulus Message Adapter libraries

    If you are writing a custom task that is integrated with the Cumulus Message Adapter, then some of the language-specific client libraries can be used to write logs compatible with ESDIS metrics.

    The usage of each library differs slightly, but in general a logger is initialized with a Cumulus workflow message to determine the contextual information for the task (e.g. granules, executions). Then, after the logger is initialized, writing logs only requires specifying a message, but the logged output will include the contextual information as well.

    Writing logs using custom code

    Any code that produces logs matching the expected log format can be processed by ESDIS metrics.

    Node.js

    Cumulus core provides a @cumulus/logger library that writes logs in the expected format for ESDIS metrics.

    - + \ No newline at end of file diff --git a/docs/v13.4.0/features/replay-archived-sqs-messages/index.html b/docs/v13.4.0/features/replay-archived-sqs-messages/index.html index 6e318cdf76a..2aada192143 100644 --- a/docs/v13.4.0/features/replay-archived-sqs-messages/index.html +++ b/docs/v13.4.0/features/replay-archived-sqs-messages/index.html @@ -5,14 +5,14 @@ How to replay SQS messages archived in S3 | Cumulus Documentation - +
    Version: v13.4.0

    How to replay SQS messages archived in S3

    Context

    Cumulus archives all incoming SQS messages to S3 and removes messages once they have been processed. Unprocessed messages are archived at the path: ${stackName}/archived-incoming-messages/${queueName}/${messageId}

    Replay SQS messages endpoint

    The Cumulus API has added a new endpoint, /replays/sqs. This endpoint will allow you to start a replay operation to requeue all archived SQS messages by queueName and returns an AsyncOperationId for operation status tracking.

    Start replaying archived SQS messages

    In order to start a replay, you must perform a POST request to the replays/sqs endpoint.

    The required and optional fields that should be part of the body of this request are documented below.

    Field     | Type   | Description
    queueName | string | Any valid SQS queue name (not ARN)
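
    For example, a replay could be started with curl as sketched below, where CUMULUS_API_URL, TOKEN, and the queue name are placeholders for your deployment's values:

    # Requeue all archived messages for the given queue
    curl -X POST \
    -H "Authorization: Bearer $TOKEN" \
    -H "Content-Type: application/json" \
    -d '{"queueName": "<your-queue-name>"}' \
    "$CUMULUS_API_URL/replays/sqs"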

    Status tracking

    A successful response from the /replays/sqs endpoint will contain an asyncOperationId field. Use this ID with the /asyncOperations endpoint to track the status.
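
    For example (using the same placeholders as above), the returned ID can be checked with:

    # Check the status of the replay operation
    curl -H "Authorization: Bearer $TOKEN" \
    "$CUMULUS_API_URL/asyncOperations/<asyncOperationId>"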

    - + \ No newline at end of file diff --git a/docs/v13.4.0/features/replay-kinesis-messages/index.html b/docs/v13.4.0/features/replay-kinesis-messages/index.html index 080dc23d5aa..c392292a871 100644 --- a/docs/v13.4.0/features/replay-kinesis-messages/index.html +++ b/docs/v13.4.0/features/replay-kinesis-messages/index.html @@ -5,7 +5,7 @@ How to replay Kinesis messages after an outage | Cumulus Documentation - + @@ -13,7 +13,7 @@
    Version: v13.4.0

    How to replay Kinesis messages after an outage

    After a period of outage, it may be necessary for a Cumulus operator to reprocess or 'replay' messages that arrived on an AWS Kinesis Data Stream but did not trigger an ingest. This document serves as an outline on how to start a replay operation, and how to perform status tracking. Cumulus supports replay of all Kinesis messages on a stream (subject to the normal RetentionPeriod constraints), or all messages within a given time slice delimited by start and end timestamps.

    As Kinesis has no comparable field to e.g. the SQS ReceiveCount on its records, Cumulus cannot tell which messages within a given time slice have never been processed, and cannot guarantee only missed messages will be processed. Users will have to rely on duplicate handling or some other method of identifying messages that should not be processed within the time slice.

    NOTE: This operation flow effectively changes only the trigger mechanism for Kinesis ingest notifications. The existence of valid Kinesis-type rules and all other normal requirements for the triggering of ingest via Kinesis still apply.

    Replays endpoint

    Cumulus has added a new endpoint to its API, /replays. This endpoint will allow you to start replay operations and returns an AsyncOperationId for operation status tracking.

    Start a replay

    In order to start a replay, you must perform a POST request to the replays endpoint.

    The required and optional fields that should be part of the body of this request are documented below.

    NOTE: As the endTimestamp relies on a comparison with the Kinesis server-side ApproximateArrivalTimestamp, and given that there is no documented level of accuracy for the approximation, it is recommended that the endTimestamp include some amount of buffer to allow for slight discrepancies. If tolerable, the same is recommended for the startTimestamp although it is used differently and less vulnerable to discrepancies since a server-side arrival timestamp should never be earlier than the client-side request timestamp.

    Field                          | Type   | Required          | Description
    type                           | string | required          | Currently only accepts kinesis.
    kinesisStream                  | string | for type kinesis  | Any valid kinesis stream name (not ARN)
    kinesisStreamCreationTimestamp | *      | optional          | Any input valid for a JS Date constructor. For reasons to use this field see AWS documentation on StreamCreationTimestamp.
    endTimestamp                   | *      | optional          | Any input valid for a JS Date constructor. Messages newer than this timestamp will be skipped.
    startTimestamp                 | *      | optional          | Any input valid for a JS Date constructor. Messages will be fetched from the Kinesis stream starting at this timestamp. Ignored if it is further in the past than the stream's retention period.
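
    A sketch of starting a Kinesis replay with curl follows; CUMULUS_API_URL, TOKEN, the stream name, and the timestamp are placeholders for your deployment's values:

    # Replay messages from a Kinesis stream up to the given end timestamp
    curl -X POST \
    -H "Authorization: Bearer $TOKEN" \
    -H "Content-Type: application/json" \
    -d '{"type": "kinesis", "kinesisStream": "<your-stream-name>", "endTimestamp": "2018-10-25T00:00:00.000Z"}' \
    "$CUMULUS_API_URL/replays"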

    Status tracking

    A successful response from the /replays endpoint will contain an asyncOperationId field. Use this ID with the /asyncOperations endpoint to track the status.

    - + \ No newline at end of file diff --git a/docs/v13.4.0/features/reports/index.html b/docs/v13.4.0/features/reports/index.html index abec55b8c4b..a55f5c93473 100644 --- a/docs/v13.4.0/features/reports/index.html +++ b/docs/v13.4.0/features/reports/index.html @@ -5,7 +5,7 @@ Reconciliation Reports | Cumulus Documentation - + @@ -19,7 +19,7 @@ report generation. The data buckets will include any buckets in your Cumulus buckets configuration that have type public, protected or private.
    - + \ No newline at end of file diff --git a/docs/v13.4.0/getting-started/index.html b/docs/v13.4.0/getting-started/index.html index d0543dfffc5..90514c79734 100644 --- a/docs/v13.4.0/getting-started/index.html +++ b/docs/v13.4.0/getting-started/index.html @@ -5,13 +5,13 @@ Getting Started | Cumulus Documentation - +
    Version: v13.4.0

    Getting Started

    Overview | Quick Tutorials | Helpful Tips

    Overview

    This serves as a guide for new Cumulus users to deploy and learn how to use Cumulus. Here you will learn what you need in order to complete any prerequisites, what Cumulus is and how it works, and how to successfully navigate and deploy a Cumulus environment.

    What is Cumulus

    Cumulus is an open source set of components for creating cloud-based data ingest, archive, distribution and management designed for NASA's future Earth Science data streams.

    Who uses Cumulus

    Data integrators/developers and operators across projects not limited to NASA use Cumulus for their daily work functions.

    Cumulus Roles

    Integrator/Developer

    Cumulus integrators/developers are those who work within Cumulus and AWS for deployments and to manage workflows.

    Operator

    Cumulus operators are those who work within Cumulus to ingest/archive data and manage collections.

    Role Guides

    As a developer, integrator, or operator, you will need to set up your environments to work in Cumulus. The following docs can get you started in your role specific activities.

    What is a Cumulus Data Type

    In Cumulus, we have the following types of data that you can create and manage:

    • Collections
    • Granules
    • Providers
    • Rules
    • Workflows
    • Executions
    • Reports

    For details on how to create or manage data types go to Data Management Types.


    Quick Tutorials

    Deployment & Configuration

    Cumulus is deployed to an AWS account, so you must have access to deploy resources to an AWS account to get started.

    1. Deploy Cumulus and Cumulus Dashboard to AWS

    Follow the deployment instructions to deploy Cumulus to your AWS account.

    2. Configure and Run the HelloWorld Workflow

    If you have deployed using the cumulus-template-deploy repository, you have a HelloWorld workflow deployed to your Cumulus backend.

    You can see your deployed workflows on the Workflows page of your Cumulus dashboard.

    Configure a collection and provider using the setup guidance on the Cumulus dashboard.

    Then create a rule to trigger your HelloWorld workflow. You can select a rule type of one time.

    Navigate to the Executions page of the dashboard to check the status of your workflow execution.

    3. Configure a Custom Workflow

    See Developing a custom workflow documentation for adding a new workflow to your deployment.

    There are plenty of workflow examples using Cumulus tasks here. The Data Cookbooks provide a more in-depth look at some of these more advanced workflows and their configurations.

    There is a list of Cumulus tasks already included in your deployment here.

    After configuring your workflow and redeploying, you can configure and run your workflow using the same steps as in step 2.


    Helpful Tips

    Here are some useful tips to keep in mind when deploying or working in Cumulus.

    Integrator/Developer

    • Versioning and Releases: This documentation gives information on our global versioning approach. We suggest upgrading to the supported version for Cumulus, Cumulus dashboard, and Thin Egress App (TEA).
    • Cumulus Developer Documentation: We suggest that you read through and reference this resource for development best practices in Cumulus.
    • Cumulus Deployment: We will guide you on how to manually deploy a new instance of Cumulus. In this reference, you will learn how to install Terraform, create an AWS S3 bucket, configure a compatible database, and create a Lambda layer.
    • Terraform Best Practices: This will help guide you through your Terraform configuration and Cumulus deployment. For an introduction to Terraform, go here.
    • Integrator Common Use Cases: Scenarios to help integrators along in the Cumulus environment.

    Operator

    Troubleshooting

    Troubleshooting: Some suggestions to help you troubleshoot and solve issues you may encounter.

    Resources

    - + \ No newline at end of file diff --git a/docs/v13.4.0/glossary/index.html b/docs/v13.4.0/glossary/index.html index dc06cff02ec..e9c87eb8c64 100644 --- a/docs/v13.4.0/glossary/index.html +++ b/docs/v13.4.0/glossary/index.html @@ -5,13 +5,13 @@ Glossary | Cumulus Documentation - +
    Version: v13.4.0

    Glossary

    AWS Glossary

    For terms/items from Amazon/AWS not mentioned in this glossary, please refer to the AWS Glossary.

    Cumulus Glossary of Terms

    API Gateway

    Refers to AWS's API Gateway. Used by the Cumulus API.

    ARN

    Refers to an AWS "Amazon Resource Name".

    For more info, see the AWS documentation.

    AWS

    See: aws.amazon.com

    AWS Lambda/Lambda Function

    AWS's 'serverless' option. Allows the running of code without provisioning a service or managing server/ECS instances/etc.

    For more information, see the AWS Lambda documentation.

    AWS Access Keys

Access credentials that give you access to AWS to act as an IAM user programmatically or from the command line.

    For more information, see the AWS IAM Documentation.

    Bucket

    An Amazon S3 cloud storage resource.

    For more information, see the AWS Bucket Documentation.

    CloudFormation

    An AWS service that allows you to define and manage cloud resources as a preconfigured block.

    For more information, see the AWS CloudFormation User Guide.

    Cloudformation Template

A template that defines an AWS CloudFormation stack.

    For more information, see the AWS intro page.

    Cloudwatch

An AWS service that provides logging and metrics collection for the various cloud resources you have in AWS.

    For more information, see the AWS User Guide.

    Cloud Notification Mechanism (CNM)

    An interface mechanism to support cloud-based ingest messaging. For more information, see PO.DAAC's CNM Schema.

    Common Metadata Repository (CMR)

    "A high-performance, high-quality, continuously evolving metadata system that catalogs Earth Science data and associated service metadata records". For more information, see NASA's CMR page.

    Collection (Cumulus)

    Cumulus Collections are logical sets of data objects of the same data type and version.

    For more information, see cookbook reference page.

    Cumulus Message Adapter (CMA)

    A library designed to help task developers integrate step function tasks into a Cumulus workflow by adapting task input/output into the Cumulus Message format.

    For more information, see CMA workflow reference page.

    Distributed Active Archive Center (DAAC)

    Refers to a specific organization that's part of NASA's distributed system of archive centers. For more information see EOSDIS's DAAC page

    Dead Letter Queue (DLQ)

This refers to Amazon SQS Dead-Letter Queues - these SQS queues are specifically configured to capture failed messages from other services/SQS queues/etc. so that those failed messages can be processed.

    For more on DLQs, see the Amazon Documentation and the Cumulus DLQ feature page.

    Developer

Those who set up deployment and workflow management for Cumulus. Sometimes referred to as an integrator. See integrator.

    ECS

    Amazon's Elastic Container Service. Used in Cumulus by workflow steps that require more flexibility than Lambda can provide.

    For more information, see AWS's developer guide.

    ECS Activity

    An ECS instance run via a Step Function.

    Execution (Cumulus)

    A Cumulus execution refers to a single execution of a (Cumulus) Workflow.

    GIBS

    Global Imagery Browse Services

    Granule

    A granule is the smallest aggregation of data that can be independently managed (described, inventoried, and retrieved). Granules are always associated with a collection, which is a grouping of granules. A granule is a grouping of data files.

    IAM

    AWS Identity and Access Management.

    For more information, see AWS IAMs.

    Integrator/Developer

    Those who work within Cumulus and AWS for deployments and to manage workflows.

    Kinesis

    Amazon's platform for streaming data on AWS.

    See AWS Kinesis for more information.

    Lambda

    AWS's cloud service that lets you run code without provisioning or managing servers.

    For more information, see AWS's lambda page.

    Module (Terraform)

    Refers to a terraform module.

    Node

    See node.js.

    Npm

    Node package manager.

    For more information, see npmjs.com.

    Operator

    Those who work within Cumulus to ingest/archive data and manage collections.

    PDR

    "Polling Delivery Mechanism" used in "DAAC Ingest" workflows.

    For more information, see nasa.gov.

    Packages (NPM)

    NPM hosted node.js packages. Cumulus packages can be found on NPM's site here

    Provider

    Data source that generates and/or distributes data for Cumulus workflows to act upon.

    For more information, see the Cumulus documentation.

    Rule

    Rules are configurable scheduled events that trigger workflows based on various criteria.

    For more information, see the Cumulus Rules documentation.

    S3

    Amazon's Simple Storage Service provides data object storage in the cloud. Used in Cumulus to store configuration, data and more.

    For more information, see AWS's s3 page.

    SIPS

    Science Investigator-led Processing Systems. In the context of DAAC ingest, this refers to data producers/providers.

    For more information, see nasa.gov.

    SNS

    Amazon's Simple Notification Service provides a messaging service that allows publication of and subscription to events. Used in Cumulus to trigger workflow events, track event failures, and others.

    For more information, see AWS's SNS page.

    SQS

    Amazon's Simple Queue Service.

    For more information, see AWS's SQS page.

    Stack

    A collection of AWS resources you can manage as a single unit.

    In the context of Cumulus, this refers to a deployment of the cumulus and data-persistence modules that is managed by Terraform

    Step Function

    AWS's web service that allows you to compose complex workflows as a state machine comprised of tasks (Lambdas, activities hosted on EC2/ECS, some AWS service APIs, etc). See AWS's Step Function Documentation for more information. In the context of Cumulus these are the underlying AWS service used to create Workflows.

    Terraform

    Terraform is the tool that you will use for deployment and configuration of your Cumulus environment.

    Workflows

    Workflows are comprised of one or more AWS Lambda Functions and ECS Activities to discover, ingest, process, manage and archive data.

diff --git a/docs/v13.4.0/index.html b/docs/v13.4.0/index.html
    Version: v13.4.0

    Introduction

    This Cumulus project seeks to address the existing need for a “native” cloud-based data ingest, archive, distribution, and management system that can be used for all future Earth Observing System Data and Information System (EOSDIS) data streams via the development and implementation of Cumulus. The term “native” implies that the system will leverage all components of a cloud infrastructure provided by the vendor for efficiency (in terms of both processing time and cost). Additionally, Cumulus will operate on future data streams involving satellite missions, aircraft missions, and field campaigns.

This documentation includes guidelines, examples, and source code docs. It is accessible at https://nasa.github.io/cumulus.


    Get To Know Cumulus

    • Getting Started - here - If you are new to Cumulus we suggest that you begin with this section to help you understand and work in the environment.
    • General Cumulus Documentation - here <- you're here

    Cumulus Reference Docs

    • Cumulus API Documentation - here
    • Cumulus Developer Documentation - here - READMEs throughout the main repository.
    • Data Cookbooks - here

    Auxiliary Guides

    • Integrator Guide - here
    • Operator Docs - here

    Contributing

    Please refer to: https://github.com/nasa/cumulus/blob/master/CONTRIBUTING.md for information. We thank you in advance.

diff --git a/docs/v13.4.0/integrator-guide/about-int-guide/index.html b/docs/v13.4.0/integrator-guide/about-int-guide/index.html
    Version: v13.4.0

    About Integrator Guide

    Purpose

The Integrator Guide is intended to supplement the Cumulus documentation and Data Cookbooks. This content is for Cumulus integrators who are either new to the project or need a step-by-step resource to help them along.

    What Is A Cumulus Integrator

    Cumulus integrators are those who work within Cumulus and AWS for deployments and to manage workflows. They may perform the following functions:

    • Configure and deploy Cumulus to the AWS environment
    • Configure Cumulus workflows
    • Write custom workflow tasks
diff --git a/docs/v13.4.0/integrator-guide/int-common-use-cases/index.html b/docs/v13.4.0/integrator-guide/int-common-use-cases/index.html
diff --git a/docs/v13.4.0/integrator-guide/workflow-add-new-lambda/index.html b/docs/v13.4.0/integrator-guide/workflow-add-new-lambda/index.html
    Version: v13.4.0

    Workflow - Add New Lambda

    You can develop a workflow task in AWS Lambda or Elastic Container Service (ECS). AWS ECS requires Docker. For a list of tasks to use go to our Cumulus Tasks page.

The following steps will help you as you write a new Lambda that integrates with a Cumulus workflow, and will aid your understanding of the Cumulus Message Adapter (CMA) process.

    Steps

1. Define New Lambda in Terraform (a minimal Terraform sketch is shown after these steps)

    2. Add Task in JSON Object

      For details on how to set up a workflow via CMA go to the CMA Tasks: Message Flow.

      You will need to assign input and output for the new task and follow the CMA contract here. This contract defines how libraries should call the cumulus-message-adapter to integrate a task into an existing Cumulus Workflow.

    3. Verify New Task

      Check the updated workflow in AWS and in Cumulus.
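As an illustration of step 1, a minimal Terraform definition for a new task Lambda might look like the sketch below. The task name, file path, runtime, and role variable are all hypothetical and should be adapted to your deployment; networking and environment configuration from the Cumulus example deployment are omitted for brevity.

# Hypothetical new task Lambda (adapt names, paths, and runtime to your deployment)
resource "aws_lambda_function" "my_new_task" {
  function_name = "${var.prefix}-MyNewTask"
  filename      = "${path.module}/../../tasks/my-new-task/dist/lambda.zip"
  handler       = "index.handler"
  runtime       = "nodejs14.x"                      # use the runtime supported by your Cumulus version
  role          = var.lambda_processing_role_arn    # hypothetical variable holding an IAM role ARN
  timeout       = 300
  memory_size   = 256
}

Once defined, the Lambda's ARN can be referenced from the workflow's state machine definition in step 2.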

diff --git a/docs/v13.4.0/integrator-guide/workflow-ts-failed-step/index.html b/docs/v13.4.0/integrator-guide/workflow-ts-failed-step/index.html
    Version: v13.4.0

    Workflow - Troubleshoot Failed Step(s)

    Steps

    1. Locate Step
    • Go to Cumulus dashboard
    • Find the granule
    • Go to Executions to determine the failed step
2. Investigate in CloudWatch
• Go to CloudWatch
• Locate the lambda
• Search the CloudWatch logs (a CLI sketch follows these steps)
3. Recreate Error

      In your sandbox environment, try to recreate the error.

4. Resolution
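If you prefer the command line to the CloudWatch console for step 2 above, a log search might look like the sketch below. The log group name is an assumption; substitute the failing task's actual Lambda function name.

# Search a (hypothetical) task Lambda's log group for errors
aws logs filter-log-events \
  --log-group-name "/aws/lambda/<prefix>-DiscoverGranules" \
  --filter-pattern "ERROR"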

diff --git a/docs/v13.4.0/interfaces/index.html b/docs/v13.4.0/interfaces/index.html
    Version: v13.4.0

    Interfaces

    Cumulus has multiple interfaces that allow interaction with discrete components of the system, such as starting workflows via SNS/Kinesis/SQS, manually queueing workflow start messages, submitting SNS notifications for completed workflows, and the many operations allowed by the Cumulus API.

    The diagram below illustrates the workflow process in detail and the various interfaces that allow starting of workflows, reporting of workflow information, and database create operations that occur when a workflow reporting message is processed. For interfaces with expected input or output schemas, details are provided below.

    Architecture diagram showing the interfaces for triggering and reporting of Cumulus workflow executions

    Workflow triggers and queuing

    Kinesis stream

    As a Kinesis stream is consumed by the messageConsumer Lambda to queue workflow executions, the incoming event is validated against this consumer schema by the ajv package.

    SQS queue for executions

    The messages put into the SQS queue for executions should conform to the Cumulus message format.
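As a rough, non-authoritative illustration only, a message on this queue carries top-level cumulus_meta, meta, and payload sections along the lines of the sketch below; the Cumulus message format referenced above is the authoritative definition, and every value shown here is a placeholder.

{
  "cumulus_meta": {
    "execution_name": "example-execution-name",
    "state_machine": "arn:aws:states:us-east-1:111122223333:stateMachine:example-prefix-IngestGranule"
  },
  "meta": {
    "stack": "example-prefix",
    "provider": {},
    "collection": {}
  },
  "payload": {}
}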

    Workflow executions

    See the documentation on Cumulus workflows.

    Workflow reporting

    SNS reporting topics

    For granule and PDR reporting, the topics will only receive data if the Cumulus workflow execution message meets the following criteria:

    • Granules - workflow message contains granule data in payload.granules
    • PDRs - workflow message contains PDR data in payload.pdr

    The messages published to the SNS reporting topics for executions and PDRs and the record property in the messages published to the granules SNS topic should conform to the model schema for each data type.

    Further detail on workflow reporting and how to interact with these interfaces can be found in the workflow notifications data cookbook.

    Cumulus API

    See the Cumulus API documentation.

diff --git a/docs/v13.4.0/operator-docs/about-operator-docs/index.html b/docs/v13.4.0/operator-docs/about-operator-docs/index.html
    Version: v13.4.0

    About Operator Docs

    Purpose

    Operator Docs are an augmentation to Cumulus documentation and Data Cookbooks. These documents will walk step-by-step through common Cumulus activities (that aren't necessarily as use-case directed as what you'd see in Data Cookbooks).

    What Is A Cumulus Operator

    Cumulus operators are those who work within Cumulus to ingest/archive data and manage collections. They may perform the following functions via the operator dashboard or API:

    • Configure providers and collections
    • Configure rules and monitor workflow executions
    • Monitor granule ingestion
    • Monitor system metrics
diff --git a/docs/v13.4.0/operator-docs/bulk-operations/index.html b/docs/v13.4.0/operator-docs/bulk-operations/index.html
    Version: v13.4.0

    Bulk Operations

    Cumulus implements bulk operations through the use of AsyncOperations, which are long-running processes executed on an AWS ECS cluster.

    Submitting a bulk API request

    Bulk operations are generally submitted via the endpoint for the relevant data type, e.g. granules. For a list of supported API requests, refer to the Cumulus API documentation. Bulk operations are denoted with the keyword 'bulk'.

    Starting bulk operations from the Cumulus dashboard

    Using a Kibana query

    Note: You must have configured your dashboard build with a KIBANAROOT environment variable in order for the Kibana link to render in the bulk granules modal

    1. From the Granules dashboard page, click on the "Run Bulk Granules" button, then select what type of action you would like to perform

      • Note: the rest of the process is the same regardless of what type of bulk action you perform
    2. From the bulk granules modal, click the "Open Kibana" link:

      Screenshot of Cumulus dashboard showing modal window for triggering bulk granule operations

    3. Once you have accessed Kibana, navigate to the "Discover" page. If this is your first time using Kibana, you may see a message like this at the top of the page:

      In order to visualize and explore data in Kibana, you'll need to create an index pattern to retrieve data from Elasticsearch.

      In that case, see the docs for creating an index pattern for Kibana

      Screenshot of Kibana user interface showing the &quot;Discover&quot; page for running queries

    4. Enter a query that returns the granule records that you want to use for bulk operations:

      Screenshot of Kibana user interface showing an example Kibana query and results

    5. Once the Kibana query is returning the results you want, click the "Inspect" link near the top of the page. A slide out tab with request details will appear on the right side of the page:

      Screenshot of Kibana user interface showing details of an example request

    6. In the slide out tab that appears on the right side of the page, click the "Request" link near the top and scroll down until you see the query property:

      Screenshot of Kibana user interface showing the Elasticsearch data request made for a given Kibana query

    7. Highlight and copy the query contents from Kibana. Go back to the Cumulus dashboard and paste the query contents from Kibana inside of the query property in the bulk granules request payload. It is expected that you should have a property of query nested inside of the existing query property:

      Screenshot of Cumulus dashboard showing modal window for triggering bulk granule operations with query information populated

8. Add values for the index and workflowName to the bulk granules request payload. The value for index will vary based on your Elasticsearch setup, but it is good to target an index specifically for granule data if possible (a payload sketch follows these steps):

      Screenshot of Cumulus dashboard showing modal window for triggering bulk granule operations with query, index, and workflow information populated

    9. Click the "Run Bulk Operations" button. You should see a confirmation message, including an ID for the async operation that was started to handle your bulk action. You can track the status of this async operation on the Operations dashboard page, which can be visited by clicking the "Go To Operations" button:

      Screenshot of Cumulus dashboard showing confirmation message with async operation ID for bulk granules request
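Putting steps 7 and 8 together, the completed bulk granules request payload ends up shaped roughly like the sketch below. The index and workflow names are placeholders, the granuleId is an example value, and the inner query object is whatever you copied from Kibana; refer to the Cumulus API documentation for the authoritative request schema.

{
  "index": "example-granule-index",
  "workflowName": "ExampleWorkflow",
  "query": {
    "query": {
      "match": {
        "granuleId": "MOD09GQ.A2016358.h13v04.006.2016360104606"
      }
    }
  }
}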

    Creating an index pattern for Kibana

    1. Define the index pattern for the indices that your Kibana queries should use. A wildcard character, *, will match across multiple indices. Once you are satisfied with your index pattern, click the "Next step" button:

      Screenshot of Kibana user interface for defining an index pattern

    2. Choose whether to use a Time Filter for your data, which is not required. Then click the "Create index pattern" button:

      Screenshot of Kibana user interface for configuring the settings of an index pattern

    Status Tracking

    All bulk operations return an AsyncOperationId which can be submitted to the /asyncOperations endpoint.

    The /asyncOperations endpoint allows listing of AsyncOperation records as well as record retrieval for individual records, which will contain the status. The Cumulus API documentation shows sample requests for these actions.
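As an unofficial illustration, retrieving a single record from the command line could look something like the sketch below; the hostname, token, and ID are placeholders, and the authoritative request and response shapes are in the Cumulus API documentation.

$ curl --request GET https://example.com/asyncOperations/<async-operation-id> \
  --header 'Authorization: Bearer ReplaceWithTheToken'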

    The Cumulus Dashboard also includes an Operations monitoring page, where operations and their status are visible:

    Screenshot of Cumulus Dashboard Operations Page showing 5 operations and their status, ID, description, type and creation timestamp

diff --git a/docs/v13.4.0/operator-docs/cmr-operations/index.html b/docs/v13.4.0/operator-docs/cmr-operations/index.html

UpdateCmrAccessConstraints will update CMR metadata file contents on S3, and PostToCmr will push the updates to CMR. The rest of this section will assume you have created this workflow under the name UpdateCmrAccessConstraints.

    Once created and deployed, the workflow is available in the Cumulus dashboard's Execute workflow selector. However, note that additional configuration is required for this request, to supply an access constraint integer value and optional description to the UpdateCmrAccessConstraints workflow, by clicking the Add Custom Workflow Meta option in the Execute popup, as shown below:

    Screenshot showing granule execute popup with &#39;updateCmrAccessConstraints&#39; selected and configuration values shown in a collapsible JSON field

    An example invocation of the API to perform this action is:

    $ curl --request PUT https://example.com/granules/MOD11A1.A2017137.h19v16.006.2017138085750 \
    --header 'Authorization: Bearer ReplaceWithTheToken' \
    --header 'Content-Type: application/json' \
    --data '{
    "action": "applyWorkflow",
    "workflow": "updateCmrAccessConstraints",
    "meta": {
"accessConstraints": {
"value": 5,
"description": "sample access constraint"
    }
    }
    }'

    Supported CMR metadata formats for the above operation are Echo10XML and UMMG-JSON, which will populate the RestrictionFlag and RestrictionComment fields in Echo10XML, or the AccessConstraints values in UMMG-JSON.

    Additional Operations

    At this time Cumulus does not, out of the box, support additional operations on CMR metadata. However, given the examples shown above, we recommend working with your integrators to develop additional workflows that perform any required operations.

    Bulk CMR operations

    In order to perform the above operations in bulk, Cumulus supports the use of ApplyWorkflow in an AsyncOperation. These are accessed via the Bulk Operation button on the dashboard, or the /granules/bulk endpoint on the Cumulus API.

More information on bulk operations is in the bulk operations operator doc.

diff --git a/docs/v13.4.0/operator-docs/create-rule-in-cumulus/index.html b/docs/v13.4.0/operator-docs/create-rule-in-cumulus/index.html
    Version: v13.4.0

    Create Rule In Cumulus

    Once the above files are in place and the entries created in CMR and Cumulus, we are ready to begin ingesting data. Depending on the type of ingestion (FTP/Kinesis, etc) the values below will change, but for the most part they are all similar. Rules tell Cumulus how to associate providers and collections, and when/how to start processing a workflow.

    Steps

    1. Go To Rules Page
    • Go to the Cumulus dashboard, click on Rules in the navigation.
    • Click Add Rule.

    Screenshot of Rules page

2. Complete Form
    • Fill out the template form.

    Screenshot of a Rules template for adding a new rule

    For more details regarding the field definitions and required information go to Data Cookbooks.

    Note: If the state field is left blank, it defaults to false.
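For orientation, a one time rule filled out through the form corresponds to a record along the lines of the sketch below. All names shown are placeholders, and the authoritative field definitions are in the Cumulus rule documentation referenced from the Data Cookbooks.

{
  "name": "example_onetime_rule",
  "workflow": "ExampleWorkflow",
  "provider": "example_provider",
  "collection": {
    "name": "EXAMPLE_COLLECTION",
    "version": "001"
  },
  "rule": {
    "type": "onetime"
  },
  "state": "ENABLED"
}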

    Examples

    • A rule form with completed required fields:

    Screenshot of a completed rule form

    • A successfully added Rule:

    Screenshot of created rule

diff --git a/docs/v13.4.0/operator-docs/discovery-filtering/index.html b/docs/v13.4.0/operator-docs/discovery-filtering/index.html

directly list the provider_path. If the path contains regular expression components, this may fail.

    It is recommended that operators diagnose any failures by checking error logs and ensuring that permissions on the remote file system allow reading of the default directory and any subdirectories that match the filter.
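As a purely hypothetical illustration, a rule meta entry whose provider_path includes a regular expression component might look like the sketch below; the path and pattern are invented, and the exact matching behavior is defined by the discovery task configuration.

{
  "meta": {
    "provider_path": "daily-data/(2016|2017)/"
  }
}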

    Supported protocols

    Currently support for this feature is limited to the following protocols:

    • ftp
    • sftp
diff --git a/docs/v13.4.0/operator-docs/granule-workflows/index.html b/docs/v13.4.0/operator-docs/granule-workflows/index.html
    Version: v13.4.0

    Granule Workflows

    Failed Granule

    Delete and Ingest

    1. Delete Granule

    Note: Granules published to CMR will need to be removed from CMR via the dashboard prior to deletion

2. Ingest Granule via Ingest Rule
• Re-triggering a one-time, Kinesis, SQS, or SNS rule, or running a scheduled rule, will re-discover and re-ingest the deleted granule.

    Reingest

    1. Select Failed Granule
    • In the Cumulus dashboard, go to the Collections page.
    • Use search field to find the granule.
2. Re-ingest Granule
    • Go to the Collections page.
    • Click on Reingest and a modal will pop up for your confirmation.

    Screenshot of the Reingest modal workflow

    Delete and Ingest

    1. Bulk Delete Granules
    • Go to the Granules page.
    • Use the Bulk Delete button to bulk delete selected granules or select via a Kibana query

    Note: You can optionally force deletion from CMR

2. Ingest Granules via Ingest Rule
• Re-triggering one-time, Kinesis, SQS, or SNS rules, or running scheduled rules, will re-discover and re-ingest the deleted granules.

    Multiple Failed Granules

    1. Select Failed Granules
    • In the Cumulus dashboard, go to the Collections page.
    • Click on Failed Granules.
    • Select multiple granules.

    Screenshot of selected multiple granules

2. Bulk Re-ingest Granules
    • Click on Reingest and a modal will pop up for your confirmation.

    Screenshot of Bulk Reingest modal workflow

diff --git a/docs/v13.4.0/operator-docs/kinesis-stream-for-ingest/index.html b/docs/v13.4.0/operator-docs/kinesis-stream-for-ingest/index.html
    Version: v13.4.0

    Setup Kinesis Stream & CNM Message

Note: Keep in mind that you should only have to set this up once per ingest stream. Kinesis pricing is based on the shard value and not on the amount of Kinesis usage.

    1. Create a Kinesis Stream

      • In your AWS console, go to the Kinesis service and click Create Data Stream.
      • Assign a name to the stream.
      • Apply a shard value of 1.
      • Click on Create Kinesis Stream.
• A status page with stream details will display. Once the status is Active, the stream is ready to use. Be sure to record the streamName and StreamARN for later use.

      Screenshot of AWS console page for creating a Kinesis stream

    2. Create a Rule

    3. Send a message

• Send a message that conforms to your schema using Python or the command line (a CLI sketch follows these steps).
      • The streamName and Collection must match the kinesisArn+collection defined in the rule that you have created in Step 2.
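As one hedged example of the command-line option, you could publish a test message with the AWS CLI as sketched below. The stream name and message body are placeholders (a real message must conform to the full CNM schema), and the collection must match the one defined in your rule.

# Publish a placeholder message to the (hypothetical) stream
# With AWS CLI v2 you may also need: --cli-binary-format raw-in-base64-out
aws kinesis put-record \
  --stream-name <your-stream-name> \
  --partition-key 1 \
  --data '{"collection": "EXAMPLE_COLLECTION", "identifier": "example-identifier", "provider": "example-provider"}'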
diff --git a/docs/v13.4.0/operator-docs/locating-access-logs/index.html b/docs/v13.4.0/operator-docs/locating-access-logs/index.html
    Version: v13.4.0

    Locating S3 Access Logs

    When enabling S3 Access Logs for EMS Reporting you configured a TargetBucket and TargetPrefix. Inside the TargetBucket at the TargetPrefix is where you will find the raw S3 access logs.

    In a standard deployment, this will be your stack's <internal bucket name> and a key prefix of <stack>/ems-distribution/s3-server-access-logs/
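For example, assuming a standard deployment, you could list the raw logs with the AWS CLI as sketched below; the bucket and stack names are placeholders.

aws s3 ls s3://<internal-bucket-name>/<stack>/ems-distribution/s3-server-access-logs/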

diff --git a/docs/v13.4.0/operator-docs/naming-executions/index.html b/docs/v13.4.0/operator-docs/naming-executions/index.html

    In the following excerpt, the QueueGranules config.executionNamePrefix property is set using the value configured in the workflow's meta.executionNamePrefix.

    Please note: This meta.executionNamePrefix property should not be confused with the optional rule executionNamePrefix property from the previous section. Setting executionNamePrefix as a root property of the rule will set a prefix for the names of any workflows triggered by the rule. Setting meta.executionNamePrefix on the rule will set meta.executionNamePrefix in the workflow messages generated for this rule, allowing workflow steps like QueueGranules to read from the message meta.executionNamePrefix for their config. Then, workflows scheduled by QueueGranules would use the configured execution name prefix.

    Setting executionNamePrefix config for QueueGranules using rule.meta

    If you wanted to use a prefix of "my-prefix", you would create a rule with a meta property similar to the following Rule snippet:

    {
    ...other rule keys here...
    "meta":
    {
    "executionNamePrefix": "my-prefix"
    }
    }

    The value of meta.executionNamePrefix from the rule will be set as meta.executionNamePrefix in the workflow message.

    Then, the workflow could contain a "QueueGranules" step with the following state, which uses meta.executionNamePrefix from the message as the value for the executionNamePrefix config to the "QueueGranules" step:

    {
    "QueueGranules": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "queueUrl": "${start_sf_queue_url}",
    "provider": "{$.meta.provider}",
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "granuleIngestWorkflow": "${ingest_granule_workflow_name}",
    "executionNamePrefix": "{$.meta.executionNamePrefix}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${queue_granules_task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "End": true
    },
    }
diff --git a/docs/v13.4.0/operator-docs/ops-common-use-cases/index.html b/docs/v13.4.0/operator-docs/ops-common-use-cases/index.html
diff --git a/docs/v13.4.0/operator-docs/trigger-workflow/index.html b/docs/v13.4.0/operator-docs/trigger-workflow/index.html
    Version: v13.4.0

    Trigger a Workflow Execution

    To trigger a workflow, you need to create a rule. To trigger an ingest workflow, one that requires discovering and ingesting data, you will also need to configure the collection and provider and associate those to a rule.

    Trigger a HelloWorld Workflow

    To trigger a HelloWorld workflow that does not need to discover or archive data, you just need to create a rule.

    You can leave the provider and collection blank and do not need any additional metadata. If you create a onetime rule, the workflow execution will start momentarily and you can view its status on the Executions page.

    Trigger an Ingest Workflow

    To ingest data, you will need a provider and collection configured to tell your workflow where to discover data and where to archive the data respectively.

    Follow the instructions to create a provider and create a collection and configure their fields for your data ingest.

In the rule's additional metadata, you can specify a provider_path that tells the workflow where to find the data on the provider.

    Example: Ingest data from S3

    Setup

    Assume there are 2 files to be ingested in an S3 bucket called discovery-bucket, located in the test-data folder:

    • GRANULE.A2017025.jpg
    • GRANULE.A2017025.hdf

    Archive buckets should already be created and mapped to public / private / protected in the Cumulus deployment.

    For example:

    buckets = {
    private = {
    name = "discovery-bucket"
    type = "private"
    },
    protected = {
    name = "archive-protected"
    type = "protected"
    }
    public = {
    name = "archive-public"
    type = "public"
    }
    }

    Create a provider

    Create a new provider. Set protocol to S3 and Host to discovery-bucket.

    Screenshot of adding a sample S3 provider

    Create a collection

    Create a new collection. Configure the collection to extract the granule id from the filenames and configure where to store the granule files.

The configuration below will store hdf files in the protected bucket and jpg files in the public bucket. The bucket types map to the bucket names defined in the Cumulus deployment's buckets configuration shown above:

    {
    "name": "test-collection",
    "version": "001",
    "granuleId": "^GRANULE\\.A[\\d]{7}$",
    "granuleIdExtraction": "(GRANULE\\..*)(\\.hdf|\\.jpg)",
    "reportToEms": false,
    "sampleFileName": "GRANULE.A2017025.hdf",
    "files": [
    {
    "bucket": "protected",
    "regex": "^GRANULE\\.A[\\d]{7}\\.hdf$",
    "sampleFileName": "GRANULE.A2017025.hdf"
    },
    {
    "bucket": "public",
    "regex": "^GRANULE\\.A[\\d]{7}\\.jpg$",
    "sampleFileName": "GRANULE.A2017025.jpg"
    }
    ]
    }

    Create a rule

    Create a rule to trigger the workflow to discover your granule data and ingest your granule.

    Select the previously created provider and collection. See the Cumulus Discover Granules workflow for a workflow example of using Cumulus tasks to discover and queue data for ingest.

    In the rule meta, set the provider_path to test-data, so the test-data folder will be used to discover new granules.

    Screenshot of adding a Discover Granules rule
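Putting the pieces of this example together, the resulting rule would look roughly like the sketch below. The rule, workflow, and provider names are placeholders; the collection and provider_path values come from the configuration above.

{
  "name": "test_collection_s3_ingest",
  "workflow": "DiscoverGranules",
  "provider": "s3_provider",
  "collection": {
    "name": "test-collection",
    "version": "001"
  },
  "meta": {
    "provider_path": "test-data"
  },
  "rule": {
    "type": "onetime"
  },
  "state": "ENABLED"
}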

    A onetime rule will run your workflow on-demand and you can view it on the dashboard Executions page. The Cumulus Discover Granules workflow will trigger an ingest workflow and your ingested granules will be visible on the dashboard Granules page.

diff --git a/docs/v13.4.0/tasks/index.html b/docs/v13.4.0/tasks/index.html
    Version: v13.4.0

    Cumulus Tasks

    A list of reusable Cumulus tasks. Add your own.

    NOTE: For a detailed description of each task, visit the task's README.md. Information on the input or output of a task is specified in the task's schemas directory.

    Tasks

    @cumulus/add-missing-file-checksums

    Add checksums to files in S3 which don't have one


    @cumulus/discover-granules

    Discover Granules in FTP/HTTP/HTTPS/SFTP/S3 endpoints


    @cumulus/discover-pdrs

    Discover PDRs in FTP and HTTP endpoints


    @cumulus/files-to-granules

    Converts array-of-files input into a granules object by extracting granuleId from filename


    @cumulus/hello-world

    Example task


    @cumulus/hyrax-metadata-updates

    Update granule metadata with hooks to OPeNDAP URL


    @cumulus/lzards-backup

    Run LZARDS backup


    @cumulus/move-granules

    Move granule files from staging to final location


    @cumulus/parse-pdr

    Download and Parse a given PDR


    @cumulus/pdr-status-check

    Checks execution status of granules in a PDR


    @cumulus/post-to-cmr

    Post a given granule to CMR


    @cumulus/queue-granules

    Add discovered granules to the queue


    @cumulus/queue-pdrs

    Add discovered PDRs to a queue


    @cumulus/queue-workflow

    Add workflow to the queue


    @cumulus/sf-sqs-report

    Sends an incoming Cumulus message to SQS


    @cumulus/sync-granule

    Download a given granule


    @cumulus/test-processing

    Fake processing task used for integration tests


    @cumulus/update-cmr-access-constraints

    Updates CMR metadata to set access constraints


@cumulus/update-granules-cmr-metadata-file-links

Update CMR metadata files with correct online access urls and etags and transfer etag info to granules' CMR files

diff --git a/docs/v13.4.0/team/index.html b/docs/v13.4.0/team/index.html
    Version: v13.4.0

    Cumulus Team

    Cumulus Core Team

    Cumulus Emeritus Team

diff --git a/docs/v13.4.0/troubleshooting/index.html b/docs/v13.4.0/troubleshooting/index.html
    Version: v13.4.0

    How to Troubleshoot and Fix Issues

    While Cumulus is a complex system, there is a focus on maintaining the integrity and availability of the system and data. Should you encounter errors or issues while using this system, this section will help troubleshoot and solve those issues.

    Backup and Restore

    Cumulus has backup and restore functionality built-in to protect Cumulus data and allow recovery of a Cumulus stack. This is currently limited to Cumulus data and not full S3 archive data. Backup and restore is not enabled by default and must be enabled and configured to take advantage of this feature.

    For more information, read the Backup and Restore documentation.

    Elasticsearch reindexing

    If you run into issues with your Elasticsearch index, a reindex operation is available via the Cumulus API. See the Reindexing Guide.

    Information on how to reindex Elasticsearch is in the Cumulus API documentation.

    Troubleshooting Workflows

    Workflows are state machines comprised of tasks and services and each component logs to CloudWatch. The CloudWatch logs for all steps in the execution are displayed in the Cumulus dashboard or you can find them by going to CloudWatch and navigating to the logs for that particular task.

    Workflow Errors

    Visual representations of executed workflows can be found in the Cumulus dashboard or the AWS Step Functions console for that particular execution.

    If a workflow errors, the error will be handled according to the error handling configuration. The task that fails will have the exception field populated in the output, giving information about the error. Further information can be found in the CloudWatch logs for the task.

    Graph of AWS Step Function execution showing a failing workflow

    Workflow Did Not Start

    Generally, first check your rule configuration. If that is satisfactory, the answer will likely be in the CloudWatch logs for the schedule SF or SF starter lambda functions. See the workflow triggers page for more information on how workflows start.

    For Kinesis and SNS rules specifically, if an error occurs during the message consumer process, the fallback consumer lambda will be called and if the message continues to error, a message will be placed on the dead letter queue. Check the dead letter queue for a failure message. Errors can be traced back to the CloudWatch logs for the message consumer and the fallback consumer. Additionally, check that the name and version match those configured in your rule, as rules are filtered by the notification's collection name and version before scheduling executions.

    More information on kinesis error handling is here.

    Operator API Errors

    All operator API calls are funneled through the ApiEndpoints lambda. Each API call is logged to the ApiEndpoints CloudWatch log for your deployment.

    Lambda Errors

    KMS Exception: AccessDeniedException

    KMS Exception: AccessDeniedExceptionKMS Message: The ciphertext refers to a customer master key that does not exist, does not exist in this region, or you are not allowed to access.

The above error was thrown by a Cumulus Lambda function invocation. The KMS key is the encryption key used to encrypt lambda environment variables. The root cause of this error is unknown, but it is speculated to be caused by deleting and recreating, with the same name, the IAM role the lambda uses.

    This error can be resolved by switching the lambda's execution role to a different one and then back through the Lambda management console. Unfortunately, this approach doesn't scale well.

    The other resolution (that scales but takes some time) that was found is as follows:

    1. Comment out all lambda definitions (and dependent resources) in your Terraform configuration.
    2. terraform apply to delete the lambdas.
    3. Un-comment the definitions.
    4. terraform apply to recreate the lambdas.

If this problem occurs with Core lambdas and you are using the terraform-aws-cumulus.zip file source distributed in our release, we recommend using the non-scaling approach, as the number of lambdas we distribute is in the low teens and they are likely to be easier and faster to reconfigure one-by-one than by editing our configs.

    Error: Unable to import module 'index': Error

    This error is shown in the CloudWatch logs for a Lambda function.

    One possible cause is that the Lambda definition in the .tf file defining the lambda is not pointing to the correct packaged lambda source file. In order to resolve this issue, update the lambda definition to point directly to the packaged (e.g. .zip) lambda source file.

    resource "aws_lambda_function" "discover_granules_task" {
    function_name = "${var.prefix}-DiscoverGranules"
    filename = "${path.module}/../../tasks/discover-granules/dist/lambda.zip"
    handler = "index.handler"
    }

    If you are seeing this error when using the Lambda as a step in a Cumulus workflow, then inspect the output for this Lambda step in the AWS Step Function console. If you see the error Cannot find module 'node_modules/@cumulus/cumulus-message-adapter-js', then you need to ensure the lambda's packaged dependencies include cumulus-message-adapter-js.

diff --git a/docs/v13.4.0/troubleshooting/reindex-elasticsearch/index.html b/docs/v13.4.0/troubleshooting/reindex-elasticsearch/index.html

current index, or the mappings for an index have been updated (they do not update automatically). Any reindexing that will be required when upgrading Cumulus will be in the Migration Steps section of the changelog.

    Switch to a new index and Reindex

    There are two operations needed: reindex and change-index to switch over to the new index. A Change Index/Reindex can be done in either order, but both have their trade-offs.

If you decide to point Cumulus to a new (empty) index first (with a change index operation), and then reindex the data to the new index, data ingested while reindexing will automatically be sent to the new index. As reindexing operations can take a while, not all the data will show up on the Cumulus Dashboard right away. The advantage is you do not have to turn off any ingest operations. This approach is recommended.

    If you decide to Reindex data to a new index first, and then point Cumulus to that new index, it is not guaranteed that data that is sent to the old index while reindexing will show up in the new index. If you prefer this way, it is recommended to turn off any ingest operations. This order will keep your dashboard data from seeing any interruption.

    Change Index

    This will point Cumulus to the index in Elasticsearch that will be used when retrieving data. Performing a change index operation to an index that does not exist yet will create the index for you. The change index operation can be found here.

    Reindex from the old index to the new index

    The reindex operation will take the data from one index and copy it into another index. The reindex operation can be found here

    Reindex status

    Reindexing is a long-running operation. The reindex-status endpoint can be used to monitor the progress of the operation.
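As a hedged illustration only (the route shown is an assumption; check the Cumulus API documentation for the exact path in your version), polling the reindex status could look something like:

$ curl --request GET https://example.com/elasticsearch/reindex-status \
  --header 'Authorization: Bearer ReplaceWithTheToken'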

    Index from database

    If you want to just grab the data straight from the database you can perform an Index from Database Operation. After the data is indexed from the database, a Change Index operation will need to be performed to ensure Cumulus is pointing to the right index. It is strongly recommended to turn off workflow rules when performing this operation so any data ingested to the database is not lost.

    Validate reindex

    To validate the reindex, use the reindex-status endpoint. The doc count can be used to verify that the reindex was successful. In the below example the reindex from cumulus-2020-11-3 to cumulus-2021-3-4 was not fully successful as they show different doc counts.

    "indices": {
    "cumulus-2020-11-3": {
    "primaries": {
    "docs": {
    "count": 21096512,
    "deleted": 176895
    }
    },
    "total": {
    "docs": {
    "count": 21096512,
    "deleted": 176895
    }
    }
    },
    "cumulus-2021-3-4": {
    "primaries": {
    "docs": {
    "count": 715949,
    "deleted": 140191
    }
    },
    "total": {
    "docs": {
    "count": 715949,
    "deleted": 140191
    }
    }
    }
    }

    To further drill down into what is missing, log in to the Kibana instance (found in the Elasticsearch section of the AWS console) and run the following command replacing <index> with your index name.

    GET <index>/_search
    {
    "aggs": {
    "count_by_type": {
    "terms": {
    "field": "_type"
    }
    }
    },
    "size": 0
    }

    which will produce a result like

    "aggregations": {
    "count_by_type": {
    "doc_count_error_upper_bound": 0,
    "sum_other_doc_count": 0,
    "buckets": [
    {
    "key": "logs",
    "doc_count": 483955
    },
    {
    "key": "execution",
    "doc_count": 4966
    },
    {
    "key": "deletedgranule",
    "doc_count": 4715
    },
    {
    "key": "pdr",
    "doc_count": 1822
    },
    {
    "key": "granule",
    "doc_count": 740
    },
    {
    "key": "asyncOperation",
    "doc_count": 616
    },
    {
    "key": "provider",
    "doc_count": 108
    },
    {
    "key": "collection",
    "doc_count": 87
    },
    {
    "key": "reconciliationReport",
    "doc_count": 48
    },
    {
    "key": "rule",
    "doc_count": 7
    }
    ]
    }
    }

    Resuming a reindex

    If a reindex operation did not fully complete it can be resumed using the following command run from the Kibana instance.

    POST _reindex?wait_for_completion=false
    {
    "conflicts": "proceed",
    "source": {
    "index": "cumulus-2020-11-3"
    },
    "dest": {
    "index": "cumulus-2021-3-4",
    "op_type": "create"
    }
    }

    The Cumulus API reindex-status endpoint can be used to monitor completion of this operation.

diff --git a/docs/v13.4.0/troubleshooting/rerunning-workflow-executions/index.html b/docs/v13.4.0/troubleshooting/rerunning-workflow-executions/index.html
    Version: v13.4.0

    Re-running workflow executions

    To re-run a Cumulus workflow execution from the AWS console:

    1. Visit the page for an individual workflow execution

    2. Click the "New execution" button at the top right of the screen

      Screenshot of the AWS console for a Step Function execution highlighting the &quot;New execution&quot; button at the top right of the screen

    3. In the "New execution" modal that appears, replace the cumulus_meta.execution_name value in the default input with the value of the new execution ID as seen in the screenshot below

      Screenshot of the AWS console showing the modal window for entering input when running a new Step Function execution

    4. Click the "Start execution" button

diff --git a/docs/v13.4.0/troubleshooting/troubleshooting-deployment/index.html b/docs/v13.4.0/troubleshooting/troubleshooting-deployment/index.html

data-persistence modules, but your config is only creating one Elasticsearch instance. To fix the issue, update the elasticsearch_config variable for your data-persistence module to increase the number of instances:

    {
    domain_name = "es"
    instance_count = 2
    instance_type = "t2.small.elasticsearch"
    version = "5.3"
    volume_size = 10
    }

    Install dashboard

    Dashboard configuration

    Issues:

    • Problem clearing the cache: EACCES: permission denied, rmdir '/tmp/gulp-cache/default'", this probably means the files at that location, and/or the folder, are owned by someone else (or some other factor prevents you from writing there).

    It's possible to workaround this by editing the file cumulus-dashboard/node_modules/gulp-cache/index.js and alter the value of the line var fileCache = new Cache({cacheDirName: 'gulp-cache'}); to something like var fileCache = new Cache({cacheDirName: '<prefix>-cache'});. Now gulp-cache will be able to write to /tmp/<prefix>-cache/default, and the error should resolve.

    Dashboard deployment

    Issues:

    • If the dashboard sends you to an Earthdata Login page that has an error reading "Invalid request, please verify the client status or redirect_uri before resubmitting", this means you've either forgotten to update one or more of your EARTHDATA_CLIENT_ID, EARTHDATA_CLIENT_PASSWORD environment variables (from your app/.env file) and re-deploy Cumulus, or you haven't placed the correct values in them, or you've forgotten to add both the "redirect" and "token" URL to the Earthdata Application.
    • There is odd caching behavior associated with the dashboard and Earthdata Login at this point in time that can cause the above error to reappear on the Earthdata Login page loaded by the dashboard even after fixing the cause of the error. If you experience this, attempt to access the dashboard in a new browser window, and it should work.
diff --git a/docs/v13.4.0/upgrade-notes/cumulus_distribution_migration/index.html b/docs/v13.4.0/upgrade-notes/cumulus_distribution_migration/index.html
    Version: v13.4.0

    Migrate from TEA deployment to Cumulus Distribution

    Background

    The Cumulus Distribution API is configured to use the AWS Cognito OAuth client. This API can be used instead of the Thin Egress App, which is the default distribution API if using the Deployment Template.

    Configuring a Cumulus Distribution deployment

    See these instructions for deploying the Cumulus Distribution API.

    Important note if migrating from TEA to Cumulus Distribution

    If you already have a deployment using the TEA distribution and want to switch to Cumulus Distribution, there will be an API Gateway change. This means that there will be downtime while you update your CloudFront endpoint to use the new API gateway.

diff --git a/docs/v13.4.0/upgrade-notes/migrate_tea_standalone/index.html b/docs/v13.4.0/upgrade-notes/migrate_tea_standalone/index.html
    Version: v13.4.0

    Migrate TEA deployment to standalone module

    Background

    This document is only relevant for upgrades of Cumulus from versions < 3.x.x to versions > 3.x.x

Previous versions of Cumulus included deployment of the Thin Egress App (TEA) by default in the distribution module. As a result, Cumulus users who wanted to deploy a new version of TEA had to wait on a new release of Cumulus that incorporated that release.

    In order to give Cumulus users the flexibility to deploy newer versions of TEA whenever they want, deployment of TEA has been removed from the distribution module and Cumulus users must now add the TEA module to their deployment. Guidance on integrating the TEA module to your deployment is provided, or you can refer to Cumulus core example deployment code for the thin_egress_app module.

    By default, when upgrading Cumulus and moving from TEA deployed via the distribution module to deployed as a separate module, your API gateway for TEA would be destroyed and re-created, which could cause outages for any Cloudfront endpoints pointing at that API gateway.

    These instructions outline how to modify your state to preserve your existing Thin Egress App (TEA) API gateway when upgrading Cumulus and moving deployment of TEA to a standalone module. If you do not care about preserving your API gateway for TEA when upgrading your Cumulus deployment, you can skip these instructions.

    Prerequisites

    Notes about state management

    These instructions will involve manipulating your Terraform state via terraform state mv commands. These operations are extremely dangerous, since a mistake in editing your Terraform state can leave your stack in a corrupted state where deployment may be impossible or may result in unanticipated resource deletion.

    Since bucket versioning preserves a separate version of your state file each time it is written, and the Terraform state modification commands overwrite the state file, we can mitigate the risk of these operations by downloading the most recent state file before starting the upgrade process. Then, if anything goes wrong during the upgrade, we can restore that previous state version. Guidance on how to perform both operations is provided below.

    Download your most recent state version

    Run this command to download the most recent cumulus deployment state file, replacing BUCKET and KEY with the correct values from cumulus-tf/terraform.tf:

     aws s3 cp s3://BUCKET/KEY /path/to/terraform.tfstate

    Restore a previous state version

    Upload the state file that was previously downloaded to the bucket/key for your state file, replacing BUCKET and KEY with the correct values from cumulus-tf/terraform.tf:

     aws s3 cp /path/to/terraform.tfstate s3://BUCKET/KEY

    Then run terraform plan, which will give an error because we manually overwrote the state file and it is now out of sync with the lock table Terraform uses to track your state file:

    Error: Error loading state: state data in S3 does not have the expected content.

    This may be caused by unusually long delays in S3 processing a previous state
    update. Please wait for a minute or two and try again. If this problem
    persists, and neither S3 nor DynamoDB are experiencing an outage, you may need
    to manually verify the remote state and update the Digest value stored in the
    DynamoDB table to the following value: <some-digest-value>

    To resolve this error, run this command and replace DYNAMO_LOCK_TABLE, BUCKET and KEY with the correct values from cumulus-tf/terraform.tf, and use the digest value from the previous error output:

     aws dynamodb put-item \
    --table-name DYNAMO_LOCK_TABLE \
    --item '{
    "LockID": {"S": "BUCKET/KEY-md5"},
    "Digest": {"S": "some-digest-value"}
    }'

    Now, if you re-run terraform plan, it should work as expected.

    Migration instructions

Please note: These instructions assume that you are deploying the thin_egress_app module as shown in the Cumulus core example deployment code.

    1. Ensure that you have downloaded the latest version of your state file for your cumulus deployment

    2. Find the URL for your <prefix>-thin-egress-app-EgressGateway API gateway. Confirm that you can access it in the browser and that it is functional.

    3. Run terraform plan. You should see output like (edited for readability):

      # module.thin_egress_app.aws_cloudformation_stack.thin_egress_app will be created
      + resource "aws_cloudformation_stack" "thin_egress_app" {

      # module.thin_egress_app.aws_s3_bucket.lambda_source will be created
      + resource "aws_s3_bucket" "lambda_source" {

      # module.thin_egress_app.aws_s3_bucket_object.cloudformation_template will be created
      + resource "aws_s3_bucket_object" "cloudformation_template" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive will be created
      + resource "aws_s3_bucket_object" "lambda_code_dependency_archive" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_source will be created
      + resource "aws_s3_bucket_object" "lambda_source" {

      # module.thin_egress_app.aws_security_group.egress_lambda[0] will be created
      + resource "aws_security_group" "egress_lambda" {

      ...

      # module.cumulus.module.distribution.module.thin_egress_app.aws_cloudformation_stack.thin_egress_app will be destroyed
      - resource "aws_cloudformation_stack" "thin_egress_app" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket.lambda_source will be destroyed
      - resource "aws_s3_bucket" "lambda_source" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.cloudformation_template will be destroyed
      - resource "aws_s3_bucket_object" "cloudformation_template" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive will be destroyed
      - resource "aws_s3_bucket_object" "lambda_code_dependency_archive" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_source will be destroyed
      - resource "aws_s3_bucket_object" "lambda_source" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_security_group.egress_lambda[0] will be destroyed
      - resource "aws_security_group" "egress_lambda" {
    4. Run the state modification commands. The commands must be run in exactly this order:

       # Move security group
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_security_group.egress_lambda module.thin_egress_app.aws_security_group.egress_lambda

      # Move TEA storage bucket
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket.lambda_source module.thin_egress_app.aws_s3_bucket.lambda_source

      # Move TEA lambda source code
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_source module.thin_egress_app.aws_s3_bucket_object.lambda_source

      # Move TEA lambda dependency code
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive

      # Move TEA Cloudformation template
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.cloudformation_template module.thin_egress_app.aws_s3_bucket_object.cloudformation_template

      # Move URS creds secret version
      terraform state mv module.cumulus.module.distribution.aws_secretsmanager_secret_version.thin_egress_urs_creds aws_secretsmanager_secret_version.thin_egress_urs_creds

      # Move URS creds secret
      terraform state mv module.cumulus.module.distribution.aws_secretsmanager_secret.thin_egress_urs_creds aws_secretsmanager_secret.thin_egress_urs_creds

      # Move TEA Cloudformation stack
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_cloudformation_stack.thin_egress_app module.thin_egress_app.aws_cloudformation_stack.thin_egress_app

      Depending on how you were supplying a bucket map to TEA, there may be an additional step. If you were specifying the bucket_map_key variable to the cumulus module to use a custom bucket map, then you can ignore this step and just ensure that the bucket_map_file variable to the TEA module uses that same S3 key. Otherwise, if you were letting Cumulus generate a bucket map for you, then you need to take this step to migrate that bucket map:

      # Move bucket map
      terraform state mv module.cumulus.module.distribution.aws_s3_bucket_object.bucket_map_yaml[0] aws_s3_bucket_object.bucket_map_yaml
    5. Run terraform plan again. You may still see a few additions/modifications pending like below, but you should not see any deletion of Thin Egress App resources pending:

      # module.thin_egress_app.aws_cloudformation_stack.thin_egress_app will be updated in-place
      ~ resource "aws_cloudformation_stack" "thin_egress_app" {

      # module.thin_egress_app.aws_s3_bucket_object.cloudformation_template will be updated in-place
      ~ resource "aws_s3_bucket_object" "cloudformation_template" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive will be updated in-place
      ~ resource "aws_s3_bucket_object" "lambda_code_dependency_archive" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_source will be updated in-place
      ~ resource "aws_s3_bucket_object" "lambda_source" {

      If you still see deletion of module.cumulus.module.distribution.module.thin_egress_app.aws_cloudformation_stack.thin_egress_app pending, then something went wrong and you should restore the previously downloaded state file version and start over from step 1. Otherwise, proceed to step 6.

    6. Once you have confirmed that everything looks as expected, run terraform apply.

7. Visit the same API gateway from step 2 and confirm that it still works.

    Your TEA deployment has now been migrated to a standalone module, which gives you the ability to upgrade the deployed version of TEA independently of Cumulus releases.

    - + \ No newline at end of file diff --git a/docs/v13.4.0/upgrade-notes/update-cma-2.0.2/index.html b/docs/v13.4.0/upgrade-notes/update-cma-2.0.2/index.html index cf75224ec68..67e7be701eb 100644 --- a/docs/v13.4.0/upgrade-notes/update-cma-2.0.2/index.html +++ b/docs/v13.4.0/upgrade-notes/update-cma-2.0.2/index.html @@ -5,13 +5,13 @@ Upgrade to CMA 2.0.2 | Cumulus Documentation - +
    Version: v13.4.0

    Upgrade to CMA 2.0.2

    Updating a Cumulus Deployment to CMA 2.0.2

    Background

The Cumulus Message Adapter has been updated in release 2.0.2 to no longer utilize the AWS Step Functions API to look up the defined name of a step function task for population in meta.workflow_tasks, but to instead use an incrementing integer key.

Additionally, a bugfix was released in the form of v2.0.1/v2.0.2 following the initial 2.0.0 release, so all users should update to release 2.0.2.

The update is not tied to a particular version of Core; however, the update should be done across all task components in order to ensure consistent execution records.

    Changes

    Execution Record Update

This update functionally means that Cumulus tasks/activities using the CMA will now record entries that look like the following in meta.workflow_tasks, and more importantly in the tasks column for an execution record:

    Original

          "DiscoverGranules": {
    "name": "jk-tf-DiscoverGranules",
    "version": "$LATEST",
    "arn": "arn:aws:lambda:us-east-1:xxxxx:function:jk-tf-DiscoverGranules"
    },
    "QueueGranules": {
    "name": "jk-tf-QueueGranules",
    "version": "$LATEST",
    "arn": "arn:aws:lambda:us-east-1:xxxx:function:jk-tf-QueueGranules"
    }

    New

          "0": {
    "name": "jk-tf-DiscoverGranules",
    "version": "$LATEST",
    "arn": "arn:aws:lambda:us-east-1:xxxxx:function:jk-tf-DiscoverGranules"
    },
    "1": {
    "name": "jk-tf-QueueGranules",
    "version": "$LATEST",
    "arn": "arn:aws:lambda:us-east-1:xxxx:function:jk-tf-QueueGranules"
    }

    Actions Required

The following should be done as part of a Cumulus stack update to utilize Cumulus Message Adapter > 2.0.2:

• Python tasks that utilize cumulus-message-adapter-python should be updated to use > 2.0.0, their Lambdas rebuilt, and Cumulus workflows reconfigured to use the updated version.

    • Python activities that utilize cumulus-process-py should be rebuilt using > 1.0.0 with updated dependencies, and have their images deployed/Cumulus configured to use the new version.

• The cumulus-message-adapter v2.0.2 lambda layer should be made available in the deployment account, and the Cumulus deployment should be reconfigured to use it (via the cumulus_message_adapter_lambda_layer_version_arn variable in the cumulus module). This should address all Core Node.js tasks that utilize the CMA, and many contributed Node.js/Java components.

Once the above have been done, redeploy Cumulus to apply the configuration, and the updates should be live.

    - + \ No newline at end of file diff --git a/docs/v13.4.0/upgrade-notes/update-task-file-schemas/index.html b/docs/v13.4.0/upgrade-notes/update-task-file-schemas/index.html index 785eb1f78dc..a129ff4048b 100644 --- a/docs/v13.4.0/upgrade-notes/update-task-file-schemas/index.html +++ b/docs/v13.4.0/upgrade-notes/update-task-file-schemas/index.html @@ -5,13 +5,13 @@ Updates to task granule file schemas | Cumulus Documentation - +
    Version: v13.4.0

    Updates to task granule file schemas

    Background

    Most Cumulus workflow tasks expect as input a payload of granule(s) which contain the files for each granule. Most tasks also return this same granule structure as output.

    However, up to this point, there was inconsistency in the schemas for the granule files objects expected by each task. Furthermore, there was no guarantee of consistency between granule files objects as stored in the database and the expectations of any given workflow task.

    Thus, when performing bulk granule operations which pass granules from the database into a Cumulus workflow, it was possible for there to be schema validation failures depending on which task was used to start the workflow and its particular schema.

    In order to rectify this situation, CUMULUS-2388 was filed and addressed to create a common granule files schema between nearly all of the Cumulus tasks (exceptions discussed below) and the Cumulus database. The following documentation explains the manual changes you need to make to your deployment in order to be compatible with the updated files schema.

    Updated files schema

    The updated granule files schema can be found here.

These former properties were deprecated (with notes about how to derive the same information from the updated schema, if possible); a before/after sketch of a file object follows this list:

    • filename - concatenate the bucket and key values with a directory separator (/)
    • name - use fileName property
    • etag - ETags are no longer provided as an individual file property. Instead, a separate etags object mapping S3 URIs to ETag values is provided as output from the following workflow tasks (guidance on how to integrate this output with your workflows is provided in the Upgrading your workflows section below):
      • update-granules-cmr-metadata-file-links
      • hyrax-metadata-updates
    • fileStagingDir - no longer supported
    • url_path - no longer supported
• duplicate_found - This property is no longer supported; however, sync-granule and move-granules now produce a separate granuleDuplicates object as part of their output. The granuleDuplicates object is a map of granules by granule ID which includes the files that encountered duplicates during processing. Guidance on how to integrate granuleDuplicates information into your workflow configuration is provided below.
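To make the shape change concrete, here is a minimal before/after sketch of a single file object. All values (and the checksumType property) are illustrative assumptions; the updated schema linked above remains the authoritative reference.

Before (deprecated properties):

{
  "name": "MOD09GQ.A2017025.h21v00.006.hdf",
  "filename": "my-protected-bucket/MOD09GQ___006/MOD09GQ.A2017025.h21v00.006.hdf",
  "etag": "8d1ec5c0463e59d26adee87cdbbee816",
  "fileStagingDir": "file-staging/my-stack/MOD09GQ___006",
  "duplicate_found": true
}

After (updated schema):

{
  "fileName": "MOD09GQ.A2017025.h21v00.006.hdf",
  "bucket": "my-protected-bucket",
  "key": "MOD09GQ___006/MOD09GQ.A2017025.h21v00.006.hdf",
  "size": 1908635,
  "checksumType": "md5",
  "checksum": "8d1ec5c0463e59d26adee87cdbbee816"
}

Note that ETags no longer appear on the file object itself; they are delivered via the separate etags output described above, and duplicate information is delivered via the granuleDuplicates output.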

    Exceptions

    These workflow tasks did not have their schema for granule files updated:

    • discover-granules - no updates
    • queue-granules - no updates
    • parse-pdr - no updates
    • sync-granule - input schema not updated, output schema was updated

The reason that these task schemas were not updated is that all of these tasks start before the files have been ingested to S3; thus, much of the information that is required in the updated files schema, such as bucket, key, or checksum, is not yet known.

    Bulk granule operations

Since the input schema for the above tasks was not updated, you cannot run bulk granule operations against workflows that start with any of those tasks. Bulk granule operations work by loading the specified granules from the database and sending them as input to a specified workflow, so if the specified workflow begins with a task whose input schema does not conform to what is coming out of the database, there will be schema errors.

    Upgrading your deployment

    Upgrading your workflows

    For any workflows using the update-granules-cmr-metadata-file-links task before the hyrax-metadata-updates and/or post-to-cmr tasks, update the step definition for update-granules-cmr-metadata-file-links as follows:

        "UpdateGranulesCmrMetadataFileLinksStep": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "buckets": "{$.meta.buckets}",
    "distribution_endpoint": "{$.meta.distribution_endpoint}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.etags}",
    "destination": "{$.meta.file_etags}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    ...more configuration...

    hyrax-metadata-updates

    For any workflows using the hyrax-metadata-updates task before a post-to-cmr task, update the definition of the hyrax-metadata-updates step as follows:

        "HyraxMetadataUpdatesTask": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "bucket": "{$.meta.buckets.internal.name}",
    "stack": "{$.meta.stack}",
    "cmr": "{$.meta.cmr}",
    "launchpad": "{$.meta.launchpad}",
    "etags": "{$.meta.file_etags}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.etags}",
    "destination": "{$.meta.file_etags}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    ...more configuration...

    post-to-cmr

For any workflows using the post-to-cmr task after the update-granules-cmr-metadata-file-links or hyrax-metadata-updates tasks, update the post-to-cmr step definition as follows:

        "CmrStep": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "bucket": "{$.meta.buckets.internal.name}",
    "stack": "{$.meta.stack}",
    "cmr": "{$.meta.cmr}",
    "launchpad": "{$.meta.launchpad}",
    "etags": "{$.meta.file_etags}"
    }
    }
    },
    ...more configuration...

    Example workflow

    For an example workflow integrating all of these changes, please see our example ingest and publish workflow.

    Optional - Integrate granuleDuplicates information

    Please note that the granuleDuplicates output is purely informational and does not have any bearing on the separate configuration for how duplicates should be handled.

    You can include granuleDuplicates output from the sync-granule or move-granules tasks in your workflow messages like so:

        "SyncGranule": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    ...other config...
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.granuleDuplicates}",
    "destination": "{$.meta.sync_granule.granule_duplicates}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    }
    ...more configuration...

The result of this configuration is that the granuleDuplicates output from sync-granule would be placed in meta.sync_granule.granule_duplicates on the workflow message and remain there throughout the rest of the workflow. The same configuration could be replicated for the move-granules task, but be sure to use a different destination in the workflow message for the granuleDuplicates output, as sketched below.
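For the move-granules case, a minimal sketch (following the same pattern as the SyncGranule example above, with meta.move_granules.granule_duplicates chosen purely for illustration) might look like:

"MoveGranules": {
  "Parameters": {
    "cma": {
      "event.$": "$",
      "task_config": {
        ...other config...
        "cumulus_message": {
          "outputs": [
            {
              "source": "{$.granuleDuplicates}",
              "destination": "{$.meta.move_granules.granule_duplicates}"
            },
            {
              "source": "{$}",
              "destination": "{$.payload}"
            }
          ]
        }
      }
    }
  }
}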

    Updating collection URL path templates

    Collections can specify url_path templates to dynamically generate the final location of files. As part of url_path templates, file object properties can be interpolated to generate the file path. Thus, these url_path templates need to be updated to ensure that they are compatible with the updated files schema and the properties that will actually be available on file objects.

    See the notes on the updated files schema to know which properties are available and which previously existing properties were deprecated.

    As an example, you will want to update any url_path properties in your collections to remove references to file.name and replace them with references to file.fileName like so:

    - "url_path": "{cmrMetadata.CollectionReference.ShortName}___{cmrMetadata.CollectionReference.Version}/{substring(file.name, 0, 3)}",
    + "url_path": "{cmrMetadata.CollectionReference.ShortName}___{cmrMetadata.CollectionReference.Version}/{substring(file.fileName, 0, 3)}",
- + \ No newline at end of file diff --git a/docs/v13.4.0/upgrade-notes/upgrade-rds/index.html index a50633eb876..2ffd01ed3ea 100644 --- a/docs/v13.4.0/upgrade-notes/upgrade-rds/index.html +++ b/docs/v13.4.0/upgrade-notes/upgrade-rds/index.html @@ -5,7 +5,7 @@ Upgrade to RDS release | Cumulus Documentation - + @@ -21,7 +21,7 @@
| cutoffSeconds | number | Number of seconds prior to this execution to 'cutoff' reconciliation queries. This allows in-progress/other in-flight operations time to complete and propagate to Elasticsearch/Dynamo/postgres. | 3600 |
| dbConcurrency | number | Sets max number of parallel collections reports the script will run at a time. | 20 |
| dbMaxPool | number | Sets the maximum number of connections the database pool has available. Modifying this may result in unexpected failures. | 20 |

    - + \ No newline at end of file diff --git a/docs/v13.4.0/upgrade-notes/upgrade_tf_version_0.13.6/index.html b/docs/v13.4.0/upgrade-notes/upgrade_tf_version_0.13.6/index.html index 8f35766ab8c..3b3a3dc66c3 100644 --- a/docs/v13.4.0/upgrade-notes/upgrade_tf_version_0.13.6/index.html +++ b/docs/v13.4.0/upgrade-notes/upgrade_tf_version_0.13.6/index.html @@ -5,13 +5,13 @@ Upgrade to TF version 0.13.6 | Cumulus Documentation - +
    Version: v13.4.0

    Upgrade to TF version 0.13.6

    Background

Cumulus pins its support to a specific version of Terraform (see the deployment documentation). The reason for only supporting one specific Terraform version at a time is to avoid deployment errors that can be caused by deploying to the same target with different Terraform versions.

    Cumulus is upgrading its supported version of Terraform from 0.12.12 to 0.13.6. This document contains instructions on how to perform the upgrade for your deployments.

    Prerequisites

    • Follow the Terraform guidance for what to do before upgrading, notably ensuring that you have no pending changes to your Cumulus deployments before proceeding.
      • You should do a terraform plan to see if you have any pending changes for your deployment (for both the data-persistence-tf and cumulus-tf modules), and if so, run a terraform apply before doing the upgrade to Terraform 0.13.6
    • Review the Terraform v0.13 release notes to prepare for any breaking changes that may affect your custom deployment code. Cumulus' deployment code has already been updated for compatibility with version 0.13.
• Install Terraform version 0.13.6. We recommend using Terraform Version Manager tfenv to manage your installed versions of Terraform, but this is not required.

    Upgrade your deployment code

    Terraform 0.13 does not support some of the syntax from previous Terraform versions, so you need to upgrade your deployment code for compatibility.

    Terraform provides a 0.13upgrade command as part of version 0.13 to handle automatically upgrading your code. Make sure to check out the documentation on batch usage of 0.13upgrade, which will allow you to upgrade all of your Terraform code with one command.

    Run the 0.13upgrade command until you have no more necessary updates to your deployment code.

    Upgrade your deployment

    1. Ensure that you are running Terraform 0.13.6 by running terraform --version. If you are using tfenv, you can switch versions by running tfenv use 0.13.6.

    2. For the data-persistence-tf and cumulus-tf directories, take the following steps:

      1. Run terraform init --reconfigure. The --reconfigure flag is required, otherwise you might see an error like:

        Error: Failed to decode current backend config

        The backend configuration created by the most recent run of "terraform init"
        could not be decoded: unsupported attribute "lock_table". The configuration
        may have been initialized by an earlier version that used an incompatible
        configuration structure. Run "terraform init -reconfigure" to force
        re-initialization of the backend.
      2. Run terraform apply to perform a deployment.

        WARNING: Even if Terraform says that no resource changes are pending, running the apply using Terraform version 0.13.6 will modify your backend state from version 0.12.12 to version 0.13.6 without requiring approval. Updating the backend state is a necessary part of the version 0.13.6 upgrade, but it is not completely transparent.

    - + \ No newline at end of file diff --git a/docs/v13.4.0/workflow_tasks/discover_granules/index.html b/docs/v13.4.0/workflow_tasks/discover_granules/index.html index 25c6d409be9..3e8ed1ef009 100644 --- a/docs/v13.4.0/workflow_tasks/discover_granules/index.html +++ b/docs/v13.4.0/workflow_tasks/discover_granules/index.html @@ -5,7 +5,7 @@ Discover Granules | Cumulus Documentation - + @@ -21,7 +21,7 @@ included in a granule's file list. That is, no such filtering based on filename occurs as described above.

    When set on the task configuration, the value applies to all collections during discovery. Otherwise, this property may be set on individual collections.

    Concurrency

    A number property that determines the level of concurrency with which granule duplicate checks are performed when duplicateGranuleHandling is skip or error.

    Limiting concurrency helps to avoid throttling by the AWS Lambda API and helps to avoid encountering account Lambda concurrency limitations.

    We do not recommend increasing this value unless you are seeing Lambda.Timeout errors when discover-granules discovers a large number of granules with skip or error duplicate handling. However, as increasing the concurrency may lead to Lambda API or Lambda concurrency throttling errors, you may wish to consider converting the discover-granules task to an ECS activity, which does not face similar runtime constraints.

    The default value is 3.
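As a sketch of where this value lives, the concurrency property can be set in the discover-granules step's task_config; every key below other than concurrency and duplicateGranuleHandling is an illustrative placeholder, and the task's config.json schema remains the source of truth.

"DiscoverGranules": {
  "Parameters": {
    "cma": {
      "event.$": "$",
      "task_config": {
        "provider": "{$.meta.provider}",
        "collection": "{$.meta.collection}",
        "buckets": "{$.meta.buckets}",
        "duplicateGranuleHandling": "skip",
        "concurrency": 3
      }
    }
  }
}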

    Task Outputs

This task outputs an assembled array of Cumulus granule objects as the payload for the next task, returning only that expected payload.

    - + \ No newline at end of file diff --git a/docs/v13.4.0/workflow_tasks/files_to_granules/index.html b/docs/v13.4.0/workflow_tasks/files_to_granules/index.html index 398683c045d..c0a81575a49 100644 --- a/docs/v13.4.0/workflow_tasks/files_to_granules/index.html +++ b/docs/v13.4.0/workflow_tasks/files_to_granules/index.html @@ -5,13 +5,13 @@ Files To Granules | Cumulus Documentation - +
    Version: v13.4.0

    Files To Granules

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions and more can be found on the auto-generated Cumulus Tasks page.

    Summary

This task utilizes the incoming config.inputGranules and the task input list of S3 URIs, along with the rest of the configuration, to sort the list of incoming files into a list of granule objects.

Please note: Files passed in without metadata previously defined in config.inputGranules will have the following keys added:

    • size
    • bucket
    • key
    • fileName

    It is primarily intended to support compatibility with the standard output of a processing task, and convert that output into a granule object accepted as input by the majority of other Cumulus tasks.
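As a conceptual sketch (not a literal message format; bucket, key, and granule ID values are hypothetical), an incoming staged S3 URI is matched against config.inputGranules and emitted as a granule file object carrying the keys listed above:

{
  "input": [
    "s3://my-internal-bucket/file-staging/my-stack/MOD09GQ___006/MOD09GQ.A2017025.h21v00.006.hdf"
  ],
  "config": {
    "inputGranules": [
      {
        "granuleId": "MOD09GQ.A2017025.h21v00.006",
        "files": []
      }
    ]
  },
  "output": {
    "granules": [
      {
        "granuleId": "MOD09GQ.A2017025.h21v00.006",
        "files": [
          {
            "bucket": "my-internal-bucket",
            "key": "file-staging/my-stack/MOD09GQ___006/MOD09GQ.A2017025.h21v00.006.hdf",
            "fileName": "MOD09GQ.A2017025.h21v00.006.hdf",
            "size": 1908635
          }
        ]
      }
    ]
  }
}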

    Task Inputs

    Input

    This task expects an incoming input that contains an array of 'staged' S3 URIs to move to their final archive location.

    For the specifics, see the Cumulus Tasks page entry for the schema.

    Configuration

    This task does expect values to be set in the workflow_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Below are expanded descriptions of selected config keys:

    inputGranules

    An array of Cumulus granule objects.

    This object will be used to define metadata values for the move granules task, and is the basis for the updated object that will be added to the output.

    Task Outputs

This task outputs an assembled array of Cumulus granule objects as the payload for the next task, returning only that expected payload.

    - + \ No newline at end of file diff --git a/docs/v13.4.0/workflow_tasks/lzards_backup/index.html b/docs/v13.4.0/workflow_tasks/lzards_backup/index.html index 746abc26b1d..138ba84e0c2 100644 --- a/docs/v13.4.0/workflow_tasks/lzards_backup/index.html +++ b/docs/v13.4.0/workflow_tasks/lzards_backup/index.html @@ -5,13 +5,13 @@ LZARDS Backup | Cumulus Documentation - +
    Version: v13.4.0

    LZARDS Backup

    The LZARDS backup task takes an array of granules and initiates backup requests to the LZARDS API, which will be handled asynchronously by LZARDS.

    Deployment

    The LZARDS backup task is not automatically deployed with Cumulus. To deploy the task through the Cumulus module, first you must specify a lzards_launchpad_passphrase in your terraform variables (e.g. variables.tf) like so:

    variable "lzards_launchpad_passphrase" {
    type = string
    default = ""
    }

    Then you can specify a value for your lzards_launchpad_passphrase in terraform.tfvars like so:

lzards_launchpad_passphrase = "your-passphrase"

    Lastly, you need to make sure that the lzards_launchpad_passphrase is passed into the Cumulus module (in main.tf) like so:

    lzards_launchpad_passphrase  = var.lzards_launchpad_passphrase

    In short, deploying the LZARDS task requires configuring a passphrase variable and ensuring that your TF configuration passes that variable into the Cumulus module.

Additional terraform configuration for the LZARDS task can be found in the cumulus module's variables.tf file, where the relevant variables are prefixed with lzards_. You can add these variables to your deployment using the same process outlined above for lzards_launchpad_passphrase.

    Task Inputs

    Input

    This task expects an array of granules as input.

    For the specifics, see the Cumulus Tasks page entry for the schema.

    Configuration

    This task does expect values to be set in the workflow_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Task Outputs

    Output

    The LZARDS task outputs a composite object containing:

    • the input granules array, and
    • a backupResults object that describes the results of LZARDS backup attempts.

    For the specifics, see the Cumulus Tasks page entry for the schema.
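A rough sketch of that composite output is shown below. The fields inside each backupResults entry (statusCode, status, granuleId, filename) are assumptions made for illustration; the linked output schema defines the actual shape.

{
  "granules": [
    {
      "granuleId": "MOD09GQ.A2017025.h21v00.006",
      "files": []
    }
  ],
  "backupResults": [
    {
      "granuleId": "MOD09GQ.A2017025.h21v00.006",
      "filename": "s3://my-protected-bucket/MOD09GQ.A2017025.h21v00.006.hdf",
      "statusCode": 201,
      "status": "COMPLETED"
    }
  ]
}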

    - + \ No newline at end of file diff --git a/docs/v13.4.0/workflow_tasks/move_granules/index.html b/docs/v13.4.0/workflow_tasks/move_granules/index.html index da65a5b63c2..df5a158fd86 100644 --- a/docs/v13.4.0/workflow_tasks/move_granules/index.html +++ b/docs/v13.4.0/workflow_tasks/move_granules/index.html @@ -5,13 +5,13 @@ Move Granules | Cumulus Documentation - +
    Version: v13.4.0

    Move Granules

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions and more can be found on the auto-generated Cumulus Tasks page.

    Summary

    This task utilizes the incoming event.input array of Cumulus granule objects to do the following:

    • Move granules from their 'staging' location to the final location (as configured in the Sync Granules task)

    • Update the event.input object with the new file locations.

• If the granule has an ECHO10/UMM CMR file (.cmr.xml or .cmr.json) included in the event.input:

      • Update that file's access locations

      • Add it to the appropriate access URL category for the CMR filetype as defined by granule CNM filetype.

      • Set the CMR file to 'metadata' in the output granules object and add it to the granule files if it's not already present.

Please note: Granules without a valid CNM type set in the granule file type field in event.input will be treated as "data" in the updated CMR metadata file.

• The task then outputs an updated list of granule objects.

    Task Inputs

    Input

    This task expects an incoming input that contains a list of 'staged' S3 URIs to move to their final archive location. If CMR metadata is to be updated for a granule, it must also be included in the input.

    For the specifics, see the Cumulus Tasks page entry for the schema.

    Configuration

    This task does expect values to be set in the workflow_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Input

    This task expects event.input to provide an array of Cumulus granule objects. The files listed for each granule represent the files to be acted upon as described in summary.
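For illustration, a single entry in that event.input array might look like the sketch below (bucket, key, granule ID, and type values are hypothetical), including a staged CMR metadata file so that its access locations are updated as described in the summary:

{
  "granuleId": "MOD09GQ.A2017025.h21v00.006",
  "files": [
    {
      "bucket": "my-internal-bucket",
      "key": "file-staging/my-stack/MOD09GQ___006/MOD09GQ.A2017025.h21v00.006.hdf",
      "fileName": "MOD09GQ.A2017025.h21v00.006.hdf",
      "type": "data"
    },
    {
      "bucket": "my-internal-bucket",
      "key": "file-staging/my-stack/MOD09GQ___006/MOD09GQ.A2017025.h21v00.006.cmr.xml",
      "fileName": "MOD09GQ.A2017025.h21v00.006.cmr.xml",
      "type": "metadata"
    }
  ]
}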

    Task Outputs

This task outputs an assembled array of Cumulus granule objects with post-move file locations as the payload for the next task, returning only that expected payload. If a CMR file has been specified for a granule object, the CMR resources related to the granule files will be updated according to the updated granule file metadata.

    Examples

See the SIPS workflow cookbook for an example of this task in a workflow.

    - + \ No newline at end of file diff --git a/docs/v13.4.0/workflow_tasks/parse_pdr/index.html b/docs/v13.4.0/workflow_tasks/parse_pdr/index.html index 4e8d9b91f75..5829e2c027b 100644 --- a/docs/v13.4.0/workflow_tasks/parse_pdr/index.html +++ b/docs/v13.4.0/workflow_tasks/parse_pdr/index.html @@ -5,13 +5,13 @@ Parse PDR | Cumulus Documentation - +
    Version: v13.4.0

    Parse PDR

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions and more can be found on the auto-generated Cumulus Tasks page.

    Summary

    The purpose of this task is to do the following with the incoming PDR object:

    • Stage it to an internal S3 bucket

    • Parse the PDR

    • Archive the PDR and remove the staged file if successful

• Output a payload object containing metadata about the parsed PDR (e.g. total size of all files, file counts, etc.) and a granules object

The constructed granules object is created using PDR metadata to determine values like data type and version, and collection definitions to determine a file storage location based on the extracted data type and version number.

    Granule file types are converted from the PDR spec types to CNM types according to the following translation table:

      HDF: 'data',
    HDF-EOS: 'data',
    SCIENCE: 'data',
    BROWSE: 'browse',
    METADATA: 'metadata',
    BROWSE_METADATA: 'metadata',
    QA_METADATA: 'metadata',
    PRODHIST: 'qa',
    QA: 'metadata',
    TGZ: 'data',
    LINKAGE: 'data'

Files missing file types will have none assigned; files with invalid types will result in a PDR parse failure.

    Task Inputs

    Input

    This task expects an incoming input that contains name and path information about the PDR to be parsed. For the specifics, see the Cumulus Tasks page entry for the schema.

    Configuration

    This task does expect values to be set in the workflow_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Below are expanded descriptions of selected config keys:

    Provider

    A Cumulus provider object. Used to define connection information for retrieving the PDR.

    Bucket

    Defines the bucket where the 'pdrs' folder for parsed PDRs will be stored.

    Collection

    A Cumulus collection object. Used to define granule file groupings and granule metadata for discovered files.

    Task Outputs

This task outputs a single payload output object containing metadata about the parsed PDR (e.g. filesCount, totalSize), a pdr object with information for later steps, and the generated array of granule objects.
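A rough sketch of that output payload follows; every value, along with the granulesCount key, is an illustrative assumption, so defer to the task's output schema for the definitive set of fields.

{
  "pdr": {
    "name": "MY_SAMPLE.PDR",
    "path": "/incoming/pdrs"
  },
  "granules": [
    {
      "granuleId": "MOD09GQ.A2017025.h21v00.006",
      "dataType": "MOD09GQ",
      "version": "006",
      "files": []
    }
  ],
  "granulesCount": 1,
  "filesCount": 2,
  "totalSize": 1908635
}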

    Examples

See the SIPS workflow cookbook for an example of this task in a workflow.

    - + \ No newline at end of file diff --git a/docs/v13.4.0/workflow_tasks/queue_granules/index.html b/docs/v13.4.0/workflow_tasks/queue_granules/index.html index 08e2969e49d..6f811634087 100644 --- a/docs/v13.4.0/workflow_tasks/queue_granules/index.html +++ b/docs/v13.4.0/workflow_tasks/queue_granules/index.html @@ -5,14 +5,14 @@ Queue Granules | Cumulus Documentation - +
    Version: v13.4.0

    Queue Granules

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions, and more can be found on the auto-generated Cumulus Tasks page.

    Summary

    The purpose of this task is to schedule ingest of granules that were discovered on a remote host, whether via the DiscoverGranules task or the ParsePDR task.

The task utilizes a defined collection in concert with a defined provider (either set on each granule or passed in via config) to queue up ingest executions for each granule or for batches of granules.

The constructed granules object is defined by the collection passed in the configuration, and has impacts on other provided core Cumulus Tasks.

    Users of this task in a workflow are encouraged to carefully consider their configuration in context of downstream tasks and workflows.

    Task Inputs

Each of the following sections is a high-level discussion of the intent of the various input/output/config values.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Input

    This task expects an incoming input that contains granules and information about them and their files. For the specifics, see the Cumulus Tasks page entry for the schema.

    This input is most commonly the output from a preceding DiscoverGranules or ParsePDR task.

    Cumulus Configuration

    This task does expect values to be set in the task_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Below are expanded descriptions of selected config keys:

    provider

    A Cumulus provider object for the originating provider. Will be passed along to the ingest workflow. This will be overruled by more specific provider information that may exist on a granule.

    internalBucket

    The Cumulus internal system bucket.

    granuleIngestWorkflow

    A string property that denotes the name of the ingest workflow into which granules should be queued.

    queueUrl

    A string property that denotes the URL of the queue to which scheduled execution messages are sent.

    preferredQueueBatchSize

    A number property that sets an upper bound on the size of each batch of granules queued into the payload of an ingest execution. Setting this property to a value higher than 1 allows queueing of multiple granules per ingest workflow.

    As ingest executions typically expect granules in the payload to have a common collection and common provider, this property only sets an upper bound within which batches will be created based on common collection and provider information.

    This means batches may be smaller than the preferred size if collection or provider information diverge, but never larger.

    The default value if none is specified is 1, which will queue one ingest execution per granule.

    concurrency

    A number property that determines the level of concurrency with which ingest executions are scheduled. Granules or batches of granules will be queued up into executions at this level of concurrency.

    This property is also used to limit concurrency when updating granule status to queued.

    Limiting concurrency helps to avoid throttling by the AWS Lambda API and helps to avoid encountering account Lambda concurrency limitations.

    We do not recommend increasing this value unless you are seeing Lambda.Timeout errors when queue-granules receives a large number of granules as input. However, as increasing the concurrency may lead to Lambda API or Lambda concurrency throttling errors, you may wish to consider converting the queue-granules task to an ECS activity, which does not face similar runtime constraints.

    The default value is 3.

    executionNamePrefix

    A string property that will prefix the names of scheduled executions.

    childWorkflowMeta

    An object property that will be merged into the scheduled execution input's meta field.
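Putting the keys described above together, a queue-granules step's task_config might look like the following sketch. The template paths, the IngestGranule workflow name, and the queue reference are illustrative assumptions; the config.json schema remains the source of truth.

"QueueGranules": {
  "Parameters": {
    "cma": {
      "event.$": "$",
      "task_config": {
        "provider": "{$.meta.provider}",
        "internalBucket": "{$.meta.buckets.internal.name}",
        "granuleIngestWorkflow": "IngestGranule",
        "queueUrl": "{$.meta.queues.startSF}",
        "preferredQueueBatchSize": 1,
        "concurrency": 3,
        "executionNamePrefix": "my-prefix",
        "childWorkflowMeta": {
          "staticValue": "will-appear-in-child-meta"
        }
      }
    }
  }
}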

    Task Outputs

    This task outputs an assembled array of workflow execution ARNs for all scheduled workflow executions within the payload's running object.

    - + \ No newline at end of file diff --git a/docs/v13.4.0/workflows/cumulus-task-message-flow/index.html b/docs/v13.4.0/workflows/cumulus-task-message-flow/index.html index 0a298a222ec..deda4bba95d 100644 --- a/docs/v13.4.0/workflows/cumulus-task-message-flow/index.html +++ b/docs/v13.4.0/workflows/cumulus-task-message-flow/index.html @@ -5,14 +5,14 @@ Cumulus Tasks: Message Flow | Cumulus Documentation - +
    Version: v13.4.0

    Cumulus Tasks: Message Flow

    Cumulus Tasks comprise Cumulus Workflows and are either AWS Lambda tasks or AWS Elastic Container Service (ECS) activities. Cumulus Tasks permit a payload as input to the main task application code. The task payload is additionally wrapped by the Cumulus Message Adapter. The Cumulus Message Adapter supplies additional information supporting message templating and metadata management of these workflows.

    Diagram showing how incoming and outgoing Cumulus messages for workflow steps are handled by the Cumulus Message Adapter

    The steps in this flow are detailed in sections below.

    Cumulus Message Format

    A full Cumulus Message has the following keys:

    • cumulus_meta: System runtime information that should generally not be touched outside of Cumulus library code or the Cumulus Message Adapter. Stores meta information about the workflow such as the state machine name and the current workflow execution's name. This information is used to look up the current active task. The name of the current active task is used to look up the corresponding task's config in task_config.
    • meta: Runtime information captured by the workflow operators. Stores execution-agnostic variables.
    • payload: Payload is runtime information for the tasks.

    In addition to the above keys, it may contain the following keys:

• replace: A key generated in conjunction with the Cumulus Message Adapter. It contains the location on S3 for a message payload and a target JSON path in the message to extract it to.
• exception: A key used to track workflow exceptions; it should not be modified outside of Cumulus library code.

    Here's a simple example of a Cumulus Message:

    {
    "task_config": {
    "inlinestr": "prefix{meta.foo}suffix",
    "array": "{[$.meta.foo]}",
    "object": "{$.meta}"
    },
    "cumulus_meta": {
    "message_source": "sfn",
    "state_machine": "arn:aws:states:us-east-1:1234:stateMachine:MySfn",
    "execution_name": "MyExecution__id-1234",
    "id": "id-1234"
    },
    "meta": {
    "foo": "bar"
    },
    "payload": {
    "anykey": "anyvalue"
    }
    }

    A message utilizing the Cumulus Remote message functionality must have at least the keys replace and cumulus_meta. Depending on configuration other portions of the message may be present, however the cumulus_meta, meta, and payload keys must be present once extraction is complete.

    {
    "replace": {
    "Bucket": "cumulus-bucket",
    "Key": "my-large-event.json",
    "TargetPath": "$"
    },
    "cumulus_meta": {}
    }

    Cumulus Message Preparation

    The event coming into a Cumulus Task is assumed to be a Cumulus Message and should first be handled by the functions described below before being passed to the task application code.

    Preparation Step 1: Fetch remote event

    Fetch remote event will fetch the full event from S3 if the cumulus message includes a replace key.

    Once "my-large-event.json" is fetched from S3, it's returned from the fetch remote event function. If no "replace" key is present, the event passed to the fetch remote event function is assumed to be a complete Cumulus Message and returned as-is.

    Preparation Step 2: Parse step function config from CMA configuration parameters

This step determines which task is currently being executed. Note this is different from which Lambda or activity is being executed, because the same Lambda or activity can be used for different tasks. The current task name is used to load the appropriate configuration from the Cumulus Message's 'task_config' configuration parameter.

    Preparation Step 3: Load nested event

    Using the config returned from the previous step, load nested event resolves templates for the final config and input to send to the task's application code.

    Task Application Code

    After message prep, the message passed to the task application code is of the form:

    {
    "input": {},
    "config": {}
    }

    Create Next Message functions

    Whatever comes out of the task application code is used to construct an outgoing Cumulus Message.

    Create Next Message Step 1: Assign outputs

The config loaded from the Parse step function config step may have a cumulus_message key. This can be used to "dispatch" fields from the task's application output to a destination in the final event output (via URL templating). Here's an example where the value of input.anykey would be dispatched as the value of payload.out in the final cumulus message:

    {
    "task_config": {
    "bar": "baz",
    "cumulus_message": {
    "input": "{$.payload.input}",
    "outputs": [
    {
    "source": "{$.input.anykey}",
    "destination": "{$.payload.out}"
    }
    ]
    }
    },
    "cumulus_meta": {
    "task": "Example",
    "message_source": "local",
    "id": "id-1234"
    },
    "meta": {
    "foo": "bar"
    },
    "payload": {
    "input": {
    "anykey": "anyvalue"
    }
    }
    }

    Create Next Message Step 2: Store remote event

If the ReplaceConfig parameter is set, the configured key's value will be stored in S3, and the final output of the task will include a replace key that contains configuration for a future step to extract the payload on S3 back into the Cumulus Message. The replace key identifies where the large event node has been stored in S3.
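For example (a sketch assuming ReplaceConfig was set with FullMessage: true and using hypothetical bucket and key names), the outgoing message returned by the step would shrink to something like the following, while the full event is stored at the referenced S3 location:

{
  "cumulus_meta": {
    "message_source": "sfn",
    "state_machine": "arn:aws:states:us-east-1:1234:stateMachine:MySfn",
    "execution_name": "MyExecution__id-1234",
    "id": "id-1234"
  },
  "replace": {
    "Bucket": "my-internal-bucket",
    "Key": "events/MyExecution__id-1234.json",
    "TargetPath": "$"
  }
}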

    - + \ No newline at end of file diff --git a/docs/v13.4.0/workflows/developing-a-cumulus-workflow/index.html b/docs/v13.4.0/workflows/developing-a-cumulus-workflow/index.html index 15a1e06406e..619431a59eb 100644 --- a/docs/v13.4.0/workflows/developing-a-cumulus-workflow/index.html +++ b/docs/v13.4.0/workflows/developing-a-cumulus-workflow/index.html @@ -5,13 +5,13 @@ Creating a Cumulus Workflow | Cumulus Documentation - +
    Version: v13.4.0

    Creating a Cumulus Workflow

    The Cumulus workflow module

To facilitate adding workflows to your deployment, Cumulus provides a workflow module.

In combination with the Cumulus message, the workflow module provides a way to easily turn a Step Function definition into a Cumulus workflow, complete with the supporting resources it needs.

    Using the module also ensures that your workflows will continue to be compatible with future versions of Cumulus.

    For more on the full set of current available options for the module, please consult the module README.

    Adding a new Cumulus workflow to your deployment

    To add a new Cumulus workflow to your deployment that is using the cumulus module, add a new workflow resource to your deployment directory, either in a new .tf file, or to an existing file.

    The workflow should follow a syntax similar to:

    module "my_workflow" {
    source = "https://github.com/nasa/cumulus/releases/download/vx.x.x/terraform-aws-cumulus-workflow.zip"

    prefix = "my-prefix"
    name = "MyWorkflowName"
    system_bucket = "my-internal-bucket"

    workflow_config = module.cumulus.workflow_config

    tags = { Deployment = var.prefix }

    state_machine_definition = <<JSON
    {}
    JSON
    }

    In the above example, you would add your state_machine_definition using the Amazon States Language, using tasks you've developed and Cumulus core tasks that are made available as part of the cumulus terraform module.
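As a minimal sketch of what such a definition might contain, the heredoc could hold a single-task state machine like the one below. The HelloWorld state name and the module.cumulus.hello_world_task.task_arn output reference are illustrative assumptions; substitute the tasks and module outputs that exist in your deployment.

{
  "Comment": "Single-step example workflow",
  "StartAt": "HelloWorld",
  "States": {
    "HelloWorld": {
      "Type": "Task",
      "Resource": "${module.cumulus.hello_world_task.task_arn}",
      "Parameters": {
        "cma": {
          "event.$": "$",
          "task_config": {}
        }
      },
      "End": true
    }
  }
}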

    Please note: Cumulus follows the convention of tagging resources with the prefix variable { Deployment = var.prefix } that you pass to the cumulus module. For resources defined outside of Core, it's recommended that you adopt this convention as it makes resources and/or deployment recovery scenarios much easier to manage.

    Examples

    For a functional example of a basic workflow, please take a look at the hello_world_workflow.

    For more complete/advanced examples, please read the following cookbook entries/topics:

    - + \ No newline at end of file diff --git a/docs/v13.4.0/workflows/developing-workflow-tasks/index.html b/docs/v13.4.0/workflows/developing-workflow-tasks/index.html index 39d46189b5b..31a1ed82e0c 100644 --- a/docs/v13.4.0/workflows/developing-workflow-tasks/index.html +++ b/docs/v13.4.0/workflows/developing-workflow-tasks/index.html @@ -5,13 +5,13 @@ Developing Workflow Tasks | Cumulus Documentation - +
    Version: v13.4.0

    Developing Workflow Tasks

    Workflow tasks can be either AWS Lambda Functions or ECS Activities.

    Lambda functions

    The full set of available core Lambda functions can be found in the deployed cumulus module zipfile at /tasks, as well as reference documentation here. These Lambdas can be referenced in workflows via the outputs from that module (see the cumulus-template-deploy repo for an example).

    The tasks source is located in the Cumulus repository at cumulus/tasks.

    You can also develop your own Lambda function. See the Lambda Functions page to learn more.

    ECS Activities

    ECS activities are supported via the cumulus_ecs_module available from the Cumulus release page.

    Please read the module README for configuration details.

    For assistance in creating a task definition within the module read the AWS Task Definition Docs.

    For a step-by-step example of using the cumulus_ecs_module, please see the related cookbook entry.

    Cumulus Docker Image

ECS activities require a Docker image. Cumulus provides a Docker image (source) for Node.js 12.x+ Lambdas on Docker Hub: cumuluss/cumulus-ecs-task.

    Alternate Docker Images

    Custom docker images/runtimes are supported as are private registries. For details on configuring a private registry/image see the AWS documentation on Private Registry Authentication for Tasks.

    - + \ No newline at end of file diff --git a/docs/v13.4.0/workflows/docker/index.html b/docs/v13.4.0/workflows/docker/index.html index b6dc90a3b9b..b7dd929fe3e 100644 --- a/docs/v13.4.0/workflows/docker/index.html +++ b/docs/v13.4.0/workflows/docker/index.html @@ -5,7 +5,7 @@ Dockerizing Data Processing | Cumulus Documentation - + @@ -14,7 +14,7 @@ 2) validate the output (in this case just check for existence) 3) use 'ncatted' to update the resulting file to be CF-compliant 4) write out metadata generated for this file

    Process Testing

It is important to have tests for data processing; however, in many cases data files can be large, so it is not practical to store the test data in the repository. Instead, test data is currently stored on AWS S3, and can be retrieved using the AWS CLI.

    aws s3 sync s3://cumulus-ghrc-logs/sample-data/collection-name data

    Where collection-name is the name of the data collection, such as 'avaps', or 'cpl'. For example, an abridged version of the data for CPL includes:

    ├── cpl
    │   ├── input
    │   │   ├── HS3_CPL_ATB_12203a_20120906.hdf5
    │   │   ├── HS3_CPL_OP_12203a_20120906.hdf5
    │   └── output
    │   ├── HS3_CPL_ATB_12203a_20120906.nc
    │   ├── HS3_CPL_ATB_12203a_20120906.nc.meta.xml
    │   ├── HS3_CPL_OP_12203a_20120906.nc
    │   ├── HS3_CPL_OP_12203a_20120906.nc.meta.xml

    Contained in the input directory are all possible sets of data files, while the output directory is the expected result of processing. In this case the hdf5 files are converted to NetCDF files and XML metadata files are generated.

    The docker image for a process can be used on the retrieved test data. First create a test-output directory in the newly created data directory.

    mkdir data/test-output

    Then run the docker image using docker-compose.

    docker-compose run test

This will process the data in the data/input directory and put the output into data/test-output. Repositories also include Python-based tests which will validate this newly created output against the contents of data/output. Use Python's Nose tool to run the included tests.

    nosetests

If the data/test-output directory validates against the contents of data/output, the tests will be successful; otherwise, an error will be reported.

    - + \ No newline at end of file diff --git a/docs/v13.4.0/workflows/index.html b/docs/v13.4.0/workflows/index.html index fb1ef3501c6..26bf37c24c5 100644 --- a/docs/v13.4.0/workflows/index.html +++ b/docs/v13.4.0/workflows/index.html @@ -5,13 +5,13 @@ Workflows | Cumulus Documentation - +
    Version: v13.4.0

    Workflows

Workflows are composed of one or more AWS Lambda Functions and ECS Activities to discover, ingest, process, manage, and archive data.

    Provider data ingest and GIBS have a set of common needs in getting data from a source system and into the cloud where they can be distributed to end users. These common needs are:

    • Data Discovery - Crawling, polling, or detecting changes from a variety of sources.
    • Data Transformation - Taking data files in their original format and extracting and transforming them into another desired format such as visible browse images.
    • Archival - Storage of the files in a location that's accessible to end users.

The high-level view of the architecture and many of the individual steps are the same, but the details of ingesting each type of collection differ. Different collection types and different providers have different needs. Not only are the individual boxes of a workflow different; the branching, error handling, and multiplicity of the arrows connecting the boxes also differ. Some need visible images rendered from component data files from multiple collections. Some need to contact the CMR with updated metadata. Some will have different retry strategies to handle availability issues with source data systems.

AWS and other cloud vendors provide an ideal solution for parts of these problems, but there needs to be a higher-level solution to allow the composition of AWS components into a full-featured system. The Ingest Workflow Architecture is designed to meet the needs for Earth Science data ingest and transformation.

    Goals

    Flexibility and Composability

The steps to ingest and process data are different for each collection within a provider. Ingest should be as flexible as possible in the rearranging of steps and configuration.

    We want to use lego-like individual steps that can be composed by an operator.

    Individual steps should ...

    • Be as ignorant as possible of the overall flow. They should not be aware of previous steps.
    • Be runnable on their own.
    • Define their input and output in simple data structures.
    • Be domain agnostic.
• Not make assumptions about specifics, such as what goes into a granule.

    Scalable

The ingest architecture needs to be scalable, both to handle ingesting hundreds of millions of granules and to interpret dozens of different workflows.

    Data Provenance

    • We should have traceability for how data was produced and where it comes from.
    • Use immutable representations of data. Data once received is not overwritten. Data can be removed for cleanup.
    • All software is versioned. We can trace transformation of data by tracking the immutable source data and the versioned software applied to it.

    Operator Visibility and Control

    • Operators should be able to see and understand everything that is happening in the system.
    • It should be obvious why things are happening and straightforward to diagnose problems.
• We generally assume that the operators know best in terms of the limits on a provider's infrastructure, how often things need to be done, and details of a collection. The architecture should defer to their decisions and knowledge while providing safety nets to prevent problems.

    A Reconfigurable Workflow Architecture

    The Ingest Workflow Architecture is defined by two entity types, Workflows and Tasks. A Workflow is a set of composed Tasks to complete an objective such as ingesting a granule. Tasks are the individual steps of a Workflow that perform one job. The workflow is responsible for executing the right task based on the current state and response from the last task executed. Tasks are completely decoupled in that they don't call each other or even need to know about the presence of other tasks.

    Workflows and tasks are configured as Terraform resources, which are triggered via configured rules within Cumulus.

    Diagram showing the Step Function execution path through workflow tasks for a collection ingest

    See the Example GIBS Ingest Architecture showing how workflows and tasks are used to define the GIBS Ingest Architecture.

    Workflows

    A workflow is a provider-configured set of steps that describe the process to ingest data. Workflows are defined using AWS Step Functions.

    Benefits of AWS Step Functions

AWS Step Functions are described in detail in the AWS documentation, but they provide several benefits which are applicable to this architecture.

    • Prebuilt solution
    • Operations Visibility
      • Visual diagram
      • Every execution is recorded with both inputs and output for every step.
    • Composability
      • Allows composing AWS Lambdas and code running in other environments as steps. Code can run in EC2, or even on premise if desired, and interface with the workflow.
      • Step functions allow specifying when steps run in parallel or choices between steps based on data from the previous step.
    • Flexibility
      • Step Functions are designed to make it easy to build new applications and to reconfigure them. We're exposing that flexibility directly to the provider.
    • Reliability and Error Handling
      • Step functions allow configuration of retries and adding handling of error conditions.
    • Described via data
      • This makes it easy to save the step function in configuration management solutions.
      • We can build simple interfaces on top of the flexibility provided.

    Workflow Scheduler

    The scheduler is responsible for initiating a step function and passing in the relevant data for a collection. This is currently configured as an interval for each collection. The scheduler service creates the initial event by combining the collection configuration with the AWS execution context defined via the cumulus terraform module.
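
As an illustration of the data the scheduler works from, a scheduled rule might look roughly like the following sketch. The field names follow the Cumulus rule schema, while the collection, provider, workflow, and schedule values are hypothetical.

{
  "name": "mod09gq_scheduled_ingest",
  "workflow": "DiscoverAndQueueGranules",
  "provider": "my_hypothetical_provider",
  "collection": { "name": "MOD09GQ", "version": "006" },
  "rule": { "type": "scheduled", "value": "rate(1 hour)" },
  "state": "ENABLED"
}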

    Tasks

    A workflow is composed of tasks. Each task is responsible for performing a discrete step of the ingest process. These can be activities like:

    • Crawling a provider website for new data.
    • Uploading data from a provider to S3.
    • Executing a process to transform data.

    AWS Step Functions permit tasks to be code running anywhere, even on premise. We expect most tasks will be written as Lambda functions in order to take advantage of the easy deployment, scalability, and cost benefits provided by AWS Lambda.

    • Leverages Existing Work
      • The design leverages the existing work of Amazon by defining workflows using the AWS Step Function State Language. This is the language that was created for describing the state machines used in AWS Step Functions.
    • Open for Extension
      • Both meta and task_config, which are used for configuration at the collection and task levels respectively, do not dictate the fields and structure of the configuration. Additional task-specific JSON schemas can be used to extend the validation of individual steps.
    • Data-centric Configuration
      • The use of a single JSON configuration file allows this configuration to be added to a workflow as data. We can build additional support on top of the configuration file for simpler domain-specific configuration or interactive GUIs.

    For more details on Task Messages and Configuration, visit Cumulus configuration and message protocol documentation.

    Ingest Deploy

    To view deployment documentation, please see the Cumulus deployment documentation.

    Tradeoffs, and Benefits

    This section documents various tradeoffs and benefits of the Ingest Workflow Architecture.

    Tradeoffs

    Workflow execution is handled completely by AWS

This means we can't add our own code into the orchestration of the workflow. We can't add new features not supported by Step Functions. We can't do things like enforce that the responses from tasks always conform to a schema or extract the configuration for a task ahead of its execution.

If we implemented our own orchestration we'd be able to add all of these. In exchange for this tradeoff, we save significant amounts of development effort and gain all the features of Step Functions. One workaround is to provide a library of common task capabilities. These would optionally be available to tasks that are implemented in Node.js and are able to include the library.

    Workflow Configuration is specified in AWS Step Function States Language

The current design combines the states language defined by AWS with Ingest-specific configuration. This means our representation is tightly coupled to their standard. If they make backwards-incompatible changes in the future, we will have to deal with existing projects written against the older version.

We avoid having to develop our own standard and the code to process it. The design can support new features in AWS Step Functions without requiring changes to the Ingest library code. It is unlikely AWS will make a backwards-incompatible change at this point. If that were to happen, one mitigation would be to write data transformations that migrate existing configurations to the new format.

    Collection Configuration Flexibility vs Complexity

The Collections Configuration File is very flexible, but it requires more knowledge of AWS Step Functions to configure. A person modifying this file directly would need to be comfortable editing a JSON file and configuring AWS Step Functions state transitions that address AWS resources.

The configuration file itself is not necessarily meant to be edited by a human directly. Since we are developing a reconfigurable, composable architecture that is specified entirely in data, additional tools can be developed on top of it. The existing recipes.json files can be mapped to this format. Operational tools like a GUI can be built to provide a usable interface for customizing workflows, but it will take time to develop these tools.

    Benefits

    This section describes benefits of the Ingest Workflow Architecture.

    Simplicity

    The concepts of Workflows and Tasks are simple ones that should make sense to providers. Additionally, the implementation will only consist of a few components because the design leverages existing services and capabilities of AWS. The Ingest implementation will only consist of some reusable task code to make task implementation easier, Ingest deployment, and the Workflow Scheduler.

    Composability

The design aims to satisfy providers' ingest needs by integrating different workflows. It's flexible in its ability to arrange tasks to meet the needs of a collection. Providers have developed and incorporated open source tools over the years, and all of these can easily be integrated into workflows as tasks.

    There is low coupling between task steps. Failures of one component don't bring the whole system down. Individual tasks can be deployed separately.

    Scalability

AWS Step Functions scale up as needed and aren't limited by a fixed number of servers. They also make it easy to leverage the inherent scalability of serverless functions.

    Monitoring and Auditing

    • Every execution is captured.
    • Every task run has captured input and outputs.
    • CloudWatch Metrics can be used for monitoring many of the events within the Step Functions. It can also generate alarms for the whole process.
    • Visual report of the entire configuration.
      • Errors and success states are highlighted visually in the flow.

    Data Provenance

    • Monitoring and auditing ensures we know the data that was given to a task.
    • Workflows are versioned and the state machines stored in AWS Step Functions are immutable. Once created they cannot change.
    • Versioning of data in S3 or using immutable records in S3 will mean we always know what data was created as the result of a step or fed into a step.

    Appendix

    Example GIBS Ingest Architecture

    This shows the GIBS Ingest Architecture as an example of the use of the Ingest Workflow Architecture.

    • The GIBS Ingest Architecture consists of two workflows per collection type. There is one for discovery and one for ingest. The final stage of discovery triggers multiple ingest workflows for each MRF granule that needs to be generated.
    • It demonstrates both lambdas as tasks and a container used for MRF generation.

    GIBS Ingest Workflows

    Diagram showing the AWS Step Function execution path for a GIBS ingest workflow

    GIBS Ingest Granules Workflow

This shows a visualization of an execution of the ingest granules workflow in Step Functions. The steps highlighted in green are the ones that executed and completed successfully.

    Diagram showing the AWS Step Function execution path for a GIBS ingest granules workflow

    - + \ No newline at end of file diff --git a/docs/v13.4.0/workflows/input_output/index.html b/docs/v13.4.0/workflows/input_output/index.html index 022560bfd14..01647a63f51 100644 --- a/docs/v13.4.0/workflows/input_output/index.html +++ b/docs/v13.4.0/workflows/input_output/index.html @@ -5,14 +5,14 @@ Workflow Inputs & Outputs | Cumulus Documentation - +
    Version: v13.4.0

    Workflow Inputs & Outputs

    General Structure

    Cumulus uses a common format for all inputs and outputs to workflows. The same format is used for input and output from workflow steps. The common format consists of a JSON object which holds all necessary information about the task execution and AWS environment. Tasks return objects identical in format to their input with the exception of a task-specific payload field. Tasks may also augment their execution metadata.
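
A skeletal sketch of that common envelope is shown below; the key names are taken from the examples further down this page, and the values here are placeholders only.

{
  "cumulus_meta": {
    "execution_name": "MyExecution__id-1234"
  },
  "meta": {
    "foo": "bar"
  },
  "exception": {},
  "payload": {
    "anykey": "anyvalue"
  }
}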

    Cumulus Message Adapter

    The Cumulus Message Adapter and Cumulus Message Adapter libraries help task developers integrate their tasks into a Cumulus workflow. These libraries adapt input and outputs from tasks into the Cumulus Message format. The Scheduler service creates the initial event message by combining the collection configuration, external resource configuration, workflow configuration, and deployment environment settings. The subsequent workflow messages between tasks must conform to the message schema. By using the Cumulus Message Adapter, individual task Lambda functions only receive the input and output specifically configured for the task, and not non-task-related message fields.

    The Cumulus Message Adapter libraries are called by the tasks with a callback function containing the business logic of the task as a parameter. They first adapt the incoming message to a format more easily consumable by Cumulus tasks, then invoke the task, and then adapt the task response back to the Cumulus message protocol to be sent to the next task.

    A task's Lambda function can be configured to include a Cumulus Message Adapter library which constructs input/output messages and resolves task configurations. The CMA can then be included in one of several ways:

    Lambda Layer

In order to make use of this configuration, a Lambda layer must be uploaded to your account. Due to platform restrictions, Core cannot currently support sharable public layers; however, you can deploy the appropriate version from the release page in two ways:

    Once you've deployed the layer, integrate the CMA layer with your Lambdas:

    • If using the cumulus module, set the cumulus_message_adapter_lambda_layer_version_arn in your .tfvars file to integrate the CMA layer with all core Cumulus lambdas.
    • If including your own Lambda or ECS task Terraform modules, specify the CMA layer ARN in the Terraform resource definitions. Also, make sure to set the CUMULUS_MESSAGE_ADAPTER_DIR environment variable for the task to /opt for the CMA integration to work properly.

    In the future if you wish to update/change the CMA version you will need to update the deployed CMA, and update the layer configuration for the impacted Lambdas as needed.

    Please Note: Updating/removing a layer does not change a deployed Lambda, so to update the CMA you should deploy a new version of the CMA layer, update the associated Lambda configuration to reference the new CMA version, and re-deploy your Lambdas.

    Manual Addition

You can include the CMA package in the Lambda code, in the cumulus-message-adapter sub-directory of your lambda .zip, for any Lambda runtime that includes a Python runtime. Python 2 is included in Lambda runtimes that use Amazon Linux; however, Amazon Linux 2 will not support this directly.

    Please note: It is expected that upcoming Cumulus releases will update the CMA layer to include a python runtime.

    If you are manually adding the message adapter to your source and utilizing the CMA, you should set the Lambda's CUMULUS_MESSAGE_ADAPTER_DIR environment variable to target the installation path for the CMA.

    CMA Input/Output

Input to the task application code is a JSON object with the following keys:

    • input: By default, the incoming payload is the payload output from the previous task, or it can be a portion of the payload as configured for the task in the corresponding .tf workflow definition file.
    • config: Task-specific configuration object with URL templates resolved.

Output from the task application code is placed in the payload key by default, but the task configuration can also be used to return just a portion of the task output.
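
For illustration, the object handed to the task application code might therefore look like the following sketch; the granule ID is the sample filename used elsewhere in these docs, and the config values are placeholders.

{
  "input": {
    "granules": [
      { "granuleId": "MOD09GQ.A2017025.h21v00.006.2017034065104" }
    ]
  },
  "config": {
    "stack": "my-prefix",
    "bucket": "my-internal-bucket"
  }
}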

    CMA configuration

    As of Cumulus > 1.15 and CMA > v1.1.1, configuration of the CMA is expected to be driven by AWS Step Function Parameters.

    Using the CMA package with the Lambda by any of the above mentioned methods (Lambda Layers, manual) requires configuration for its various features via a specific Step Function Parameters configuration format (see sample workflows in the examples cumulus-tf source for more examples):

{
  "cma": {
    "event.$": "$",
    "ReplaceConfig": "{some config}",
    "task_config": "{some config}"
  }
}

The "event.$": "$" parameter is required, as it passes the entire incoming message to the CMA client library for parsing and allows the CMA itself to convert the incoming message into a Cumulus message for use in the function.

    The following are the CMA's current configuration settings:

    ReplaceConfig (Cumulus Remote Message)

Because of the potential size of a Cumulus message, mainly the payload field, a task can be configured to store a portion of its output on S3, leaving in its place an empty JSON object {} and a remote message key that defines how to retrieve it. If the portion of the message targeted exceeds the configured MaxSize (which defaults to 0 bytes), it will be written to S3.

    The CMA remote message functionality can be configured using parameters in several ways:

    Partial Message

Setting the Path/TargetPath in the ReplaceConfig parameter (and optionally a non-default MaxSize):

{
  "DiscoverGranules": {
    "Parameters": {
      "cma": {
        "event.$": "$",
        "ReplaceConfig": {
          "MaxSize": 1,
          "Path": "$.payload",
          "TargetPath": "$.payload"
        }
      }
    }
  }
}

will result in any payload output larger than the MaxSize (in bytes) being written to S3. The CMA will then mark that the key has been replaced via a replace key on the event. When the CMA picks up the replace key in future steps, it will attempt to retrieve the output from S3 and write it back to payload.

    Note that you can optionally use a different TargetPath than Path, however as the target is a JSON path there must be a key to target for replacement in the output of that step. Also note that the JSON path specified must target one node, otherwise the CMA will error, as it does not support multiple replacement targets.

    If TargetPath is omitted, it will default to the value for Path.
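
To make the effect concrete, after such a replacement the message leaving the step might look roughly like the sketch below; the bucket and key values are placeholders.

{
  "cumulus_meta": {
    "execution_name": "MyExecution__id-1234"
  },
  "payload": {},
  "replace": {
    "Bucket": "my-internal-bucket",
    "Key": "events/some-event-id",
    "TargetPath": "$.payload"
  }
}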

    Full Message

    Setting the following parameters for a lambda:

DiscoverGranules:
  Parameters:
    cma:
      event.$: '$'
      ReplaceConfig:
        FullMessage: true

    will result in the CMA assuming the entire inbound message should be stored to S3 if it exceeds the default max size.

    This is effectively the same as doing:

{
  "DiscoverGranules": {
    "Parameters": {
      "cma": {
        "event.$": "$",
        "ReplaceConfig": {
          "MaxSize": 0,
          "Path": "$",
          "TargetPath": "$"
        }
      }
    }
  }
}

    Cumulus Message example

{
  "task_config": {
    "inlinestr": "prefix{meta.foo}suffix",
    "array": "{[$.meta.foo]}",
    "object": "{$.meta}"
  },
  "cumulus_meta": {
    "message_source": "sfn",
    "state_machine": "arn:aws:states:us-east-1:1234:stateMachine:MySfn",
    "execution_name": "MyExecution__id-1234",
    "id": "id-1234"
  },
  "meta": {
    "foo": "bar"
  },
  "payload": {
    "anykey": "anyvalue"
  }
}

    Cumulus Remote Message example

    The message may contain a reference to an S3 Bucket, Key and TargetPath as follows:

{
  "replace": {
    "Bucket": "cumulus-bucket",
    "Key": "my-large-event.json",
    "TargetPath": "$"
  },
  "cumulus_meta": {}
}

    task_config

This configuration key contains the input/output configuration values that define task inputs/outputs via URL paths. Important: these values are all relative to the JSON object configured for event.$.

    This configuration's behavior is outlined in the CMA step description below.

    The configuration should follow the format:

{
  "FunctionName": {
    "Parameters": {
      "cma": {
        "event.$": "$",
        "other_cma_configuration": "<config object>",
        "task_config": "<task config>"
      }
    }
  }
}

    Example:

{
  "StepFunction": {
    "Parameters": {
      "cma": {
        "event.$": "$",
        "task_config": {
          "sfnEnd": true,
          "stack": "{$.meta.stack}",
          "bucket": "{$.meta.buckets.internal.name}",
          "stateMachine": "{$.cumulus_meta.state_machine}",
          "executionName": "{$.cumulus_meta.execution_name}",
          "cumulus_message": {
            "input": "{$}"
          }
        }
      }
    }
  }
}

    Cumulus Message Adapter Steps

    1. Reformat AWS Step Function message into Cumulus Message

    Due to the way AWS handles Parameterized messages, when Parameters are used the CMA takes an inbound message:

{
  "resource": "arn:aws:lambda:us-east-1:<lambda arn values>",
  "input": {
    "Other Parameter": {},
    "cma": {
      "ConfigKey": {
        "config values": "some config values"
      },
      "event": {
        "cumulus_meta": {},
        "payload": {},
        "meta": {},
        "exception": {}
      }
    }
  }
}

    and takes the following actions:

    • Takes the object at input.cma.event and makes it the full input
    • Merges all of the keys except event under input.cma into the parent input object

This results in the incoming message (presumably a Cumulus message), with any cma configuration parameters merged in, being passed on to the CMA. All other parameterized values defined outside of the cma key are ignored.
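
Given the inbound message above, after this step the CMA would be working with a message roughly like the following sketch (the event keys promoted to the top level, with ConfigKey merged in):

{
  "ConfigKey": {
    "config values": "some config values"
  },
  "cumulus_meta": {},
  "payload": {},
  "meta": {},
  "exception": {}
}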

    2. Resolve Remote Messages

If the incoming Cumulus message has a replace key value, the CMA will attempt to pull the referenced payload from S3.

For example, if the incoming message contains the following:

"meta": {
  "foo": {}
},
"replace": {
  "TargetPath": "$.meta.foo",
  "Bucket": "some_bucket",
  "Key": "events/some-event-id"
}

    The CMA will attempt to pull the file stored at Bucket/Key and replace the value at TargetPath, then remove the replace object entirely and continue.
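
For example, assuming the object stored at some_bucket/events/some-event-id contained {"bar": "baz"}, the message after this step would contain the retrieved value at the target path and no replace key:

"meta": {
  "foo": {
    "bar": "baz"
  }
}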

    3. Resolve URL templates in the task configuration

In the workflow configuration (defined under the task_config key), each task has its own configuration, and it can use URL templates as values to achieve simplicity or to reference values only available at execution time. The Cumulus Message Adapter resolves the URL templates (relative to the event configuration key) and then passes the message to the next task. For example, given a task which has the following configuration:

{
  "Parameters": {
    "cma": {
      "event.$": "$",
      "task_config": {
        "provider": "{$.meta.provider}",
        "inlinestr": "prefix{meta.foo}suffix",
        "array": "{[$.meta.foo]}",
        "object": "{$.meta}"
      }
    }
  }
}

and an incoming message that contains:

{
  "meta": {
    "foo": "bar",
    "provider": {
      "id": "FOO_DAAC",
      "anykey": "anyvalue"
    }
  }
}

    The corresponding Cumulus Message would contain:

"meta": {
  "foo": "bar",
  "provider": {
    "id": "FOO_DAAC",
    "anykey": "anyvalue"
  }
},
"task_config": {
  "provider": "{$.meta.provider}",
  "inlinestr": "prefix{meta.foo}suffix",
  "array": "{[$.meta.foo]}",
  "object": "{$.meta}"
}

    The message sent to the task would be:

"config": {
  "provider": {
    "id": "FOO_DAAC",
    "anykey": "anyvalue"
  },
  "inlinestr": "prefixbarsuffix",
  "array": ["bar"],
  "object": {
    "foo": "bar",
    "provider": {
      "id": "FOO_DAAC",
      "anykey": "anyvalue"
    }
  }
},
"input": "{...}"
    URL template variables replace dotted paths inside curly brackets with their corresponding value. If the Cumulus Message Adapter cannot resolve a value, it will ignore the template, leaving it verbatim in the string. While seemingly complex, this allows significant decoupling of Tasks from one another and the data that drives them. Tasks are able to easily receive runtime configuration produced by previously run tasks and domain data.

    4. Resolve task input

By default, the incoming payload is the payload from the previous task. The task can also be configured to use a portion of the payload as its input message. For example, given a task that specifies cma.task_config.cumulus_message.input:

ExampleTask:
  Parameters:
    cma:
      event.$: '$'
      task_config:
        cumulus_message:
          input: '{$.payload.foo}'

    The task configuration in the message would be:

{
  "task_config": {
    "cumulus_message": {
      "input": "{$.payload.foo}"
    }
  },
  "payload": {
    "foo": {
      "anykey": "anyvalue"
    }
  }
}

The Cumulus Message Adapter will resolve the task input; instead of sending the whole payload as task input, the task input would be:

{
  "input": {
    "anykey": "anyvalue"
  },
  "config": {...}
}

    5. Resolve task output

By default, the task's return value becomes the next payload. However, the workflow task configuration can specify a portion of the return value as the next payload, and can also write values to other fields. Based on the task configuration under cma.task_config.cumulus_message.outputs, the Message Adapter uses a task's return value to output a message as configured by the task-specific config defined under cma.task_config. The Message Adapter dispatches a "source" to a "destination" as defined by URL templates stored in the task-specific cumulus_message.outputs. The value of the task's return value at the "source" URL is used to create or replace the value at the "destination" URL. For example, given a task that specifies cumulus_message.outputs in its workflow configuration as follows:

{
  "ExampleTask": {
    "Parameters": {
      "cma": {
        "event.$": "$",
        "task_config": {
          "cumulus_message": {
            "outputs": [
              {
                "source": "{$}",
                "destination": "{$.payload}"
              },
              {
                "source": "{$.output.anykey}",
                "destination": "{$.meta.baz}"
              }
            ]
          }
        }
      }
    }
  }
}

    The corresponding Cumulus Message would be:

{
  "task_config": {
    "cumulus_message": {
      "outputs": [
        {
          "source": "{$}",
          "destination": "{$.payload}"
        },
        {
          "source": "{$.output.anykey}",
          "destination": "{$.meta.baz}"
        }
      ]
    }
  },
  "meta": {
    "foo": "bar"
  },
  "payload": {
    "anykey": "anyvalue"
  }
}

    Given the response from the task is:

{
  "output": {
    "anykey": "boo"
  }
}

    The Cumulus Message Adapter would output the following Cumulus Message:

{
  "task_config": {
    "cumulus_message": {
      "outputs": [
        {
          "source": "{$}",
          "destination": "{$.payload}"
        },
        {
          "source": "{$.output.anykey}",
          "destination": "{$.meta.baz}"
        }
      ]
    }
  },
  "meta": {
    "foo": "bar",
    "baz": "boo"
  },
  "payload": {
    "output": {
      "anykey": "boo"
    }
  }
}

    6. Apply Remote Message Configuration

    If the ReplaceConfig configuration parameter is defined, the CMA will evaluate the configuration options provided, and if required write a portion of the Cumulus Message to S3, and add a replace key to the message for future steps to utilize.

Please note: the non-user-modifiable field cumulus_meta will always be retained, regardless of the configuration.

For example, if the output message (post output configuration) from a step looks like:

{
  "cumulus_meta": {
    "some_key": "some_value"
  },
  "ReplaceConfig": {
    "FullMessage": true
  },
  "task_config": {
    "cumulus_message": {
      "outputs": [
        {
          "source": "{$}",
          "destination": "{$.payload}"
        },
        {
          "source": "{$.output.anykey}",
          "destination": "{$.meta.baz}"
        }
      ]
    }
  },
  "meta": {
    "foo": "bar",
    "baz": "boo"
  },
  "payload": {
    "output": {
      "anykey": "boo"
    }
  }
}

    the resultant output would look like:

{
  "cumulus_meta": {
    "some_key": "some_value"
  },
  "replace": {
    "TargetPath": "$",
    "Bucket": "some-internal-bucket",
    "Key": "events/some-event-id"
  }
}

    Additional features

    Validate task input, output and configuration messages against the schemas provided

    The Cumulus Message Adapter has the capability to validate task input, output and configuration messages against their schemas. The default location of the schemas is the schemas folder in the top level of the task and the default filenames are input.json, output.json, and config.json. The task can also configure a different schema location. If no schema can be found, the Cumulus Message Adapter will not validate the messages.
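
As a sketch of what such a schema might look like, a minimal schemas/config.json for a task expecting bucket and stack configuration values could be written as follows; the exact fields are task-specific and the ones shown here are hypothetical.

{
  "title": "MyTaskConfig",
  "description": "Configuration expected by a hypothetical task",
  "type": "object",
  "required": ["bucket", "stack"],
  "properties": {
    "bucket": { "type": "string" },
    "stack": { "type": "string" }
  }
}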

    - + \ No newline at end of file diff --git a/docs/v13.4.0/workflows/lambda/index.html b/docs/v13.4.0/workflows/lambda/index.html index 1fead5d6218..5d4fb30c39b 100644 --- a/docs/v13.4.0/workflows/lambda/index.html +++ b/docs/v13.4.0/workflows/lambda/index.html @@ -5,13 +5,13 @@ Develop Lambda Functions | Cumulus Documentation - +
    Version: v13.4.0

    Develop Lambda Functions

    Develop a new Cumulus Lambda

AWS provides a great getting started guide for building Lambdas in the developer guide.

Cumulus currently supports the following environments for Cumulus Message Adapter enabled functions: Node.js, Java, and Python (see the deployment notes for each runtime below).

Additionally, you may choose to include any of the other languages AWS supports as a resource, with reduced feature support.

    Deploy a Lambda

    Node.js Lambda

For a new Node.js Lambda, create a new function and add an aws_lambda_function resource to your Cumulus deployment (for examples, see example/lambdas.tf and ingest/lambda-functions.tf in the source), either as a new .tf file or added to an existing .tf file:

resource "aws_lambda_function" "myfunction" {
  function_name    = "${var.prefix}-function"
  filename         = "/path/to/zip/lambda.zip"
  source_code_hash = filebase64sha256("/path/to/zip/lambda.zip")
  handler          = "index.handler"
  role             = module.cumulus.lambda_processing_role_arn
  runtime          = "nodejs10.x"

  vpc_config {
    subnet_ids         = var.subnet_ids
    security_group_ids = var.security_group_ids
  }
}

    Please note: This example contains the minimum set of required configuration.

    Make sure to include a vpc_config that matches the information you've provided the cumulus module if intending to integrate the lambda with a Cumulus deployment.

    Java Lambda

    Java Lambdas are created in much the same way as the Node.js example above.

    The source points to a folder with the compiled .class files and dependency libraries in the Lambda Java zip folder structure (details here), not an uber-jar.

The deploy folder referenced in such a configuration would contain a folder 'test_task/task/', which contains Task.class and TaskLogic.class, as well as a lib folder containing dependency jars.

    Python Lambda

    Python Lambdas are created the same way as the Node.js example above.

    Cumulus Message Adapter

For Lambdas wishing to utilize the Cumulus Message Adapter (CMA), you should define a layers key on your Lambda resource with the CMA version you wish to include. See the input_output docs for more on how to create/use the CMA.

    Other Lambda Options

    Cumulus supports all of the options available to you via the aws_lambda_function Terraform resource. For more information on what's available, check out the Terraform resource docs.

    Cloudwatch log groups

If you want to enable CloudWatch logging for your Lambda resource, you'll need to add an aws_cloudwatch_log_group resource to your Lambda definition:

resource "aws_cloudwatch_log_group" "myfunction_log_group" {
  name              = "/aws/lambda/${aws_lambda_function.myfunction.function_name}"
  retention_in_days = 30
  tags              = { Deployment = var.prefix }
}
    - + \ No newline at end of file diff --git a/docs/v13.4.0/workflows/protocol/index.html b/docs/v13.4.0/workflows/protocol/index.html index 405371382a4..6acbc7beebb 100644 --- a/docs/v13.4.0/workflows/protocol/index.html +++ b/docs/v13.4.0/workflows/protocol/index.html @@ -5,13 +5,13 @@ Workflow Protocol | Cumulus Documentation - +
    Version: v13.4.0

    Workflow Protocol

    Configuration and Message Use Diagram

    A diagram showing at which point in a workflow the Cumulus message is checked for conformity with the message schema and where the configuration is checked for conformity with the configuration schema

    • Configuration - The Cumulus workflow configuration defines everything needed to describe an instance of Cumulus.
    • Scheduler - This starts ingest of a collection on configured intervals.
    • Input to Step Functions - The Scheduler uses the Configuration as source data to construct the input to the Workflow.
    • AWS Step Functions - Run the workflows as kicked off by the scheduler or other processes.
    • Input to Task - The input for each task is a JSON document that conforms to the message schema.
    • Output from Task - The output of each task must conform to the message schemas as well and is used as the input for the subsequent task.
    - + \ No newline at end of file diff --git a/docs/v13.4.0/workflows/workflow-configuration-how-to/index.html b/docs/v13.4.0/workflows/workflow-configuration-how-to/index.html index 2801f6bcb71..66f79747937 100644 --- a/docs/v13.4.0/workflows/workflow-configuration-how-to/index.html +++ b/docs/v13.4.0/workflows/workflow-configuration-how-to/index.html @@ -5,7 +5,7 @@ Workflow Configuration How To's | Cumulus Documentation - + @@ -24,7 +24,7 @@ To take a subset of any given metadata, use the option substring.

    "url_path": "{cmrMetadata.Granule.Collection.ShortName}/{substring(file.fileName, 0, 3)}"

    This example will populate to "MOD09GQ/MOD"

    In addition to substring, several datetime-specific functions are available, which can parse a datetime string in the metadata and extract a certain part of it:

    "url_path": "{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}"

    or

     "url_path": "{dateFormat(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime, YYYY-MM-DD[T]HH[:]mm[:]ss)}"

    The following functions are implemented:

    • extractYear - returns the year, formatted as YYYY
    • extractMonth - returns the month, formatted as MM
    • extractDate - returns the day of the month, formatted as DD
    • extractHour - returns the hour in 24-hour format, with no leading zero
    • dateFormat - takes a second argument describing how to format the date, and passes the metadata date string and the format argument to moment().format()

    Note: the move-granules step needs to be in the workflow for this template to be populated and the file moved. This cmrMetadata or CMR granule XML needs to have been generated and stored on S3. From there any field could be retrieved and used for a url_path.

    Adding Metadata dates and times to the URL Path

    There are a number of options to pull dates from the CMR file metadata. With this metadata:

<Granule>
  <Temporal>
    <RangeDateTime>
      <BeginningDateTime>2003-02-19T00:00:00Z</BeginningDateTime>
      <EndingDateTime>2003-02-19T23:59:59Z</EndingDateTime>
    </RangeDateTime>
  </Temporal>
</Granule>

    The following examples of url_path could be used.

    {extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the year from the full date: 2003.

    {extractMonth(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the month: 2.

    {extractDate(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the day: 19.

    {extractHour(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the hour: 0.

    Different values can be combined to create the url_path. For example

{
  "bucket": "sample-protected-bucket",
  "name": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
  "url_path": "{cmrMetadata.Granule.Collection.ShortName}/{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}/{extractDate(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}"
}

    The final file location for the above would be s3://sample-protected-bucket/MOD09GQ/2003/19/MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.

    - + \ No newline at end of file diff --git a/docs/v13.4.0/workflows/workflow-triggers/index.html b/docs/v13.4.0/workflows/workflow-triggers/index.html index 53defb4d2f3..6933fa0728c 100644 --- a/docs/v13.4.0/workflows/workflow-triggers/index.html +++ b/docs/v13.4.0/workflows/workflow-triggers/index.html @@ -5,13 +5,13 @@ Workflow Triggers | Cumulus Documentation - +
    Version: v13.4.0

    Workflow Triggers

    For a workflow to run, it needs to be associated with a rule (see rule configuration). The rule configuration determines how and when a workflow execution is triggered. Rules can be triggered one time, on a schedule, or by new data written to a kinesis stream.

    There are three lambda functions in the API package responsible for scheduling and starting workflows: SF scheduler, message consumer, and SF starter. Each Cumulus instance comes with a Start SF SQS queue.

The SF scheduler lambda puts a message onto the Start SF queue. This message is picked up by the Start SF lambda, and an execution is started with the body of the message as the input.

    When a one time rule is created, the schedule SF lambda is triggered. Rules that are not one time are associated with a CloudWatch event which will manage the trigger of the lambdas that trigger the workflows.

    For a scheduled rule, the Cloudwatch event is triggered on the given schedule which calls directly to the schedule SF lambda.

    For a kinesis rule, when data is added to the kinesis stream, the Cloudwatch event is triggered, which calls the message consumer lambda. The message consumer lambda parses the kinesis message and finds all of the rules associated with that message. For each rule (which corresponds to one workflow), the schedule SF lambda is triggered to queue a message to start the workflow.
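
For illustration only, a kinesis rule record might look roughly like the following sketch; the field names follow the Cumulus rule schema, while the stream ARN, names, and workflow shown here are hypothetical.

{
  "name": "mod09gq_kinesis_ingest",
  "workflow": "IngestGranule",
  "provider": "my_hypothetical_provider",
  "collection": { "name": "MOD09GQ", "version": "006" },
  "rule": {
    "type": "kinesis",
    "value": "arn:aws:kinesis:us-east-1:111111111111:stream/my-ingest-stream"
  },
  "state": "ENABLED"
}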

For an SNS rule, when a message is published to the SNS topic, the message consumer receives the SNS message (JSON expected), parses it into an object, starts a new execution of the workflow associated with the rule, and passes the object in the payload field of the Cumulus message.

    Diagram showing how workflows are scheduled via rules

    - + \ No newline at end of file diff --git a/docs/v14.1.0/adding-a-task/index.html b/docs/v14.1.0/adding-a-task/index.html index 92abd316031..120c32c5ba6 100644 --- a/docs/v14.1.0/adding-a-task/index.html +++ b/docs/v14.1.0/adding-a-task/index.html @@ -5,13 +5,13 @@ Contributing a Task | Cumulus Documentation - +
    Version: v14.1.0

    Contributing a Task

    We're tracking reusable Cumulus tasks in this list and, if you've got one you'd like to share with others, you can add it!

    Right now we're focused on tasks distributed via npm, but are open to including others. For now the script that pulls all the data for each package only supports npm.

    The tasks.md file is generated in the build process

    The tasks list in docs/tasks.md is generated from the list of task package names from the tasks folder.

    Do not edit the docs/tasks.md file directly.

    - + \ No newline at end of file diff --git a/docs/v14.1.0/api/index.html b/docs/v14.1.0/api/index.html index d9593724d1f..862c5e520d3 100644 --- a/docs/v14.1.0/api/index.html +++ b/docs/v14.1.0/api/index.html @@ -5,13 +5,13 @@ Cumulus API | Cumulus Documentation - + - + \ No newline at end of file diff --git a/docs/v14.1.0/architecture/index.html b/docs/v14.1.0/architecture/index.html index dcdcf8c535b..22f8b1fe37a 100644 --- a/docs/v14.1.0/architecture/index.html +++ b/docs/v14.1.0/architecture/index.html @@ -5,14 +5,14 @@ Architecture | Cumulus Documentation - +
    Version: v14.1.0

    Architecture

    Architecture

    Below, find a diagram with the components that comprise an instance of Cumulus.

    Architecture diagram of a Cumulus deployment

    This diagram details all of the major architectural components of a Cumulus deployment.

    While the diagram can feel complex, it can easily be digested in several major components:

    Data Distribution

End users can access data via Cumulus's distribution submodule, which includes ASF's Thin Egress Application; this provides authenticated data egress, temporary S3 links, and other statistics features.

    End user exposure of Cumulus's holdings is expected to be provided by an external service.

    For NASA use, this is assumed to be CMR in this diagram.

    Data ingest

    Workflows

The core of the ingest and processing capabilities in Cumulus is built into the deployed AWS Step Function workflows. Cumulus rules trigger workflows via CloudWatch rules, Kinesis streams, SNS topics, or SQS queues. The workflows then run with a configured Cumulus message, utilizing built-in processes to report the status of granules, PDRs, executions, etc. to the Data Persistence components.

    Workflows can optionally report granule metadata to CMR, and workflow steps can report metrics information to a shared SNS topic, which could be subscribed to for near real time granule, execution, and PDR status. This could be used for metrics reporting using an external ELK stack, for example.

    Data persistence

Cumulus entity state data is stored in a set of PostgreSQL-compatible databases and is exported to an Elasticsearch instance, which provides non-authoritative querying/state data for the API and other applications that require more complex queries. Currently the entity state data is also replicated in DynamoDB; this will be removed in a future release.

    Data discovery

    Discovering data for ingest is handled via workflow step components using Cumulus provider and collection configurations and various triggers. Data can be ingested from AWS S3, FTP, HTTPS and more.

    Database

    Cumulus utilizes a user-provided PostgreSQL database backend. For improved API search query efficiency Cumulus provides data replication to an Elasticsearch instance. For legacy reasons, Cumulus is currently also deploying a DynamoDB datastore, and writes are replicated in parallel with the PostgreSQL database writes. The DynamoDB replicated tables and parallel writes will be removed in future releases.

    PostgreSQL Database Schema Diagram

    ERD of the Cumulus Database

    Maintenance

    System maintenance personnel have access to manage ingest and various portions of Cumulus via an AWS API gateway, as well as the operator dashboard.

    Deployment Structure

    Cumulus is deployed via Terraform and is organized internally into two separate top-level modules, as well as several external modules.

    Cumulus

    The Cumulus module, which contains multiple internal submodules, deploys all of the Cumulus components that are not part of the Data Persistence portion of this diagram.

    Data persistence

    The data persistence module provides the Data Persistence portion of the diagram.

    Other modules

Other modules are provided as artifacts on the release page for use by users configuring their own deployment, and contain extracted subcomponents of the cumulus module. For more on these components see the components documentation.

For more on the specific structure, examples of use, how to deploy, and more, please see the deployment docs as well as the cumulus-template-deploy repo.

    - + \ No newline at end of file diff --git a/docs/v14.1.0/configuration/cloudwatch-retention/index.html b/docs/v14.1.0/configuration/cloudwatch-retention/index.html index 0f385a7007c..423ee0470ca 100644 --- a/docs/v14.1.0/configuration/cloudwatch-retention/index.html +++ b/docs/v14.1.0/configuration/cloudwatch-retention/index.html @@ -5,13 +5,13 @@ Cloudwatch Retention | Cumulus Documentation - +
    Version: v14.1.0

    Cloudwatch Retention

    Our lambdas dump logs to AWS CloudWatch. By default, these logs exist indefinitely. However, there are ways to specify a duration for log retention.

    aws-cli

    In addition to getting your aws-cli set-up, there are two values you'll need to acquire.

    1. log-group-name: the name of the log group whose retention policy (retention time) you'd like to change. We'll use /aws/lambda/KinesisInboundLogger in our examples.
    2. retention-in-days: the number of days for which you'd like to retain the logs in the specified log group. There is a list of possible values available in the aws logs documentation.

    For example, if we wanted to set log retention to 30 days on our KinesisInboundLogger lambda, we would write:

    aws logs put-retention-policy --log-group-name "/aws/lambda/KinesisInboundLogger" --retention-in-days 30

    Note: The aws-cli log command that we're using is explained in detail here.

    AWS Management Console

    Changing the log retention policy in the AWS Management Console is a fairly simple process:

    1. Navigate to the CloudWatch service in the AWS Management Console.
    2. Click on the Logs entry on the sidebar.
    3. Find the Log Group whose retention policy you're interested in changing.
    4. Click on the value in the Expire Events After column.
    5. Enter/Select the number of days you'd like to retain logs in that log group for.

    Screenshot of AWS console showing how to configure the retention period for Cloudwatch logs

    - + \ No newline at end of file diff --git a/docs/v14.1.0/configuration/collection-storage-best-practices/index.html b/docs/v14.1.0/configuration/collection-storage-best-practices/index.html index a8574fab634..13e4e08c6d7 100644 --- a/docs/v14.1.0/configuration/collection-storage-best-practices/index.html +++ b/docs/v14.1.0/configuration/collection-storage-best-practices/index.html @@ -5,13 +5,13 @@ Collection Cost Tracking and Storage Best Practices | Cumulus Documentation - +
    Version: v14.1.0

    Collection Cost Tracking and Storage Best Practices

    Organizing your data is important for metrics you may want to collect. AWS S3 storage and cost metrics are calculated at the bucket level, so it is easy to get metrics by bucket. You can get storage metrics at the key prefix level, but that is done through the CLI, which can be very slow for large buckets. It is very difficult to estimate costs at the prefix level.

    Calculating Storage By Collection

    By bucket

    Usage by bucket can be obtained in your AWS Billing Dashboard via an S3 Usage Report. You can download your usage report for a period of time and review your storage and requests at the bucket level.

    Bucket metrics can also be found in the AWS CloudWatch Metrics Console (also see Using Amazon CloudWatch Metrics).

    Navigate to Storage Metrics and select the BucketName for all buckets you are interested in. The available metrics are BucketSizeInBytes and NumberOfObjects.

    In the Graphed metrics tab, you can select the type of statistic (i.e. average, minimum, maximum) and the period for the stats. At the top, it's useful to select from the dropdown to view the metrics as a number. You can also select the time period for which you want to see stats.

    Alternatively you can query CloudWatch using the CLI.

    This command will return the average number of bytes in the bucket test-bucket for 7/31/2019:

    aws cloudwatch get-metric-statistics --namespace AWS/S3 --start-time 2019-07-31T00:00:00 --end-time 2019-08-01T00:00:00 --period 86400 --statistics Average --region us-east-1 --metric-name BucketSizeBytes --dimensions Name=BucketName,Value=test-bucket Name=StorageType,Value=StandardStorage

    The result looks like:

{
  "Datapoints": [
    {
      "Timestamp": "2019-07-31T00:00:00Z",
      "Average": 150996467959.0,
      "Unit": "Bytes"
    }
  ],
  "Label": "BucketSizeBytes"
}

    By key prefix

    AWS does not offer storage and usage statistics at a key prefix level. Via the AWS CLI, you can get the total storage for a bucket or folder. The following command would get the storage for folder example-folder in bucket sample-bucket:

    aws s3 ls --summarize --human-readable --recursive s3://sample-bucket/example-folder | grep 'Total'

    Note that this can be a long-running operation for large buckets.

    Calculating Cost By Collection

    NASA NGAP Environment

    If using an NGAP account, the cost per bucket can be found in your CloudTamer console, in the Financials section of your account information. This is calculated on a monthly basis.

    There is no easy way to get the cost by folder in the buckets. You could calculate an estimate using the storage per prefix vs. the storage of the bucket.

    Outside of NGAP

You can enable S3 Cost Allocation Tags and tag your buckets. From there, you can view the cost breakdown in your AWS Billing Dashboard via the Cost Explorer. Cost Allocation Tagging is available at the bucket level.

    There is no easy way to get the cost by folder in the buckets. You could calculate an estimate using the storage per prefix vs. the storage of the bucket.

    Storage Configuration

    Cumulus allows for the configuration of many buckets for your files. Buckets are created and added to your deployment as part of the deployment process.

    In your Cumulus collection configuration, you specify where you want the files to be stored post-processing. This is done by matching a regular expression on the file with the configured bucket.

    Note that in the collection configuration, the bucket field is the key to the buckets variable in the deployment's .tfvars file.

    Organizing By Bucket

    You can specify separate groups of buckets for each collection, which could look like the example below.

{
  "name": "MOD09GQ",
  "version": "006",
  "granuleId": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$",
  "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
  "files": [
    {
      "bucket": "MOD09GQ-006-protected",
      "regex": "^.*\\.hdf$",
      "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf"
    },
    {
      "bucket": "MOD09GQ-006-private",
      "regex": "^.*\\.hdf\\.met$",
      "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.met"
    },
    {
      "bucket": "MOD09GQ-006-protected",
      "regex": "^.*\\.cmr\\.xml$",
      "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.cmr.xml"
    },
    {
      "bucket": "MOD09GQ-006-public",
      "regex": "^*\\.jpg$",
      "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_ndvi.jpg"
    }
  ]
}

    Additional collections would go to different buckets.

    Organizing by Key Prefix

    Different collections can be organized into different folders in the same bucket, using the key prefix, which is specified as the url_path in the collection configuration. In this simplified collection configuration example, the url_path field is set at the top level so that all files go to a path prefixed with the collection name and version.

{
  "name": "MOD09GQ",
  "version": "006",
  "granuleId": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$",
  "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}",
  "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
  "files": [
    {
      "bucket": "protected",
      "regex": "^.*\\.hdf$",
      "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf"
    },
    {
      "bucket": "private",
      "regex": "^.*\\.hdf\\.met$",
      "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.met"
    },
    {
      "bucket": "protected",
      "regex": "^.*\\.cmr\\.xml$",
      "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.cmr.xml"
    },
    {
      "bucket": "public",
      "regex": "^*\\.jpg$",
      "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_ndvi.jpg"
    }
  ]
}

    In this case, the path to all the files would be: MOD09GQ___006/<filename> in their respective buckets.

The url_path can be overridden directly in the file configuration. The example below produces the same result.

{
  "name": "MOD09GQ",
  "version": "006",
  "granuleId": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$",
  "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
  "files": [
    {
      "bucket": "protected",
      "regex": "^.*\\.hdf$",
      "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
      "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
    },
    {
      "bucket": "private",
      "regex": "^.*\\.hdf\\.met$",
      "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.met",
      "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
    },
    {
      "bucket": "protected-2",
      "regex": "^.*\\.cmr\\.xml$",
      "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.cmr.xml",
      "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
    },
    {
      "bucket": "public",
      "regex": "^*\\.jpg$",
      "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_ndvi.jpg",
      "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
    }
  ]
}
    - + \ No newline at end of file diff --git a/docs/v14.1.0/configuration/data-management-types/index.html b/docs/v14.1.0/configuration/data-management-types/index.html index 7c923ec31a2..a3f18149118 100644 --- a/docs/v14.1.0/configuration/data-management-types/index.html +++ b/docs/v14.1.0/configuration/data-management-types/index.html @@ -5,13 +5,13 @@ Cumulus Data Management Types | Cumulus Documentation - +
    Version: v14.1.0

    Cumulus Data Management Types

    What Are The Cumulus Data Management Types

    • Collections: Collections are logical sets of data objects of the same data type and version. They provide contextual information used by Cumulus ingest.
    • Granules: Granules are the smallest aggregation of data that can be independently managed. They are always associated with a collection, which is a grouping of granules.
    • Providers: Providers generate and distribute input data that Cumulus obtains and sends to workflows.
    • Rules: Rules tell Cumulus how to associate providers and collections and when/how to start processing a workflow.
    • Workflows: Workflows are composed of one or more AWS Lambda Functions and ECS Activities to discover, ingest, process, manage, and archive data.
    • Executions: Executions are records of a workflow.
    • Reconciliation Reports: Reports are a comparison of data sets to check to see if they are in agreement and to help Cumulus users detect conflicts.

    Interaction

    • Providers tell Cumulus where to get new data - i.e. S3, HTTPS
    • Collections tell Cumulus where to store the data files
    • Rules tell Cumulus when to trigger a workflow execution and tie providers and collections together

    Managing Data Management Types

    The following are created via the dashboard or API:

    • Providers
    • Collections
    • Rules
    • Reconciliation reports

    Granules are created by workflow executions and then can be managed via the dashboard or API.

    An execution record is created for each workflow execution triggered and can be viewed in the dashboard or data can be retrieved via the API.

    Workflows are created and managed via the Cumulus deployment.

    Configuration Fields

    Schemas

Looking at our API schema definitions can provide us with some insight into collections, providers, rules, and their attributes (and whether those are required or not). The schemas for the different concepts will be referenced throughout this document.

    The schemas are extremely useful for understanding which attributes are configurable and which of those are required. Cumulus uses these schemas for validation.

    Providers

    Please note:

    • While connection configuration is defined here, things that are more specific to a specific ingest setup (e.g. 'What target directory should we be pulling from' or 'How is duplicate handling configured?') are generally defined in a Rule or Collection, not the Provider.
    • There is some provider behavior which is controlled by task-specific configuration and not the provider definition. This configuration has to be set on a per-workflow basis. For example, see the httpListTimeout configuration on the discover-granules task

    Provider Configuration

    The Provider configuration is defined by a JSON object that takes different configuration keys depending on the provider type. The following are definitions of typical configuration values relevant for the various providers:

    Configuration by provider type
    S3
    KeyTypeRequiredDescription
    idstringYesUnique identifier for the provider
    globalConnectionLimitintegerNoInteger specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited
    protocolstringYesThe protocol for this provider. Must be s3 for this provider type.
    hoststringYesS3 Bucket to pull data from
    http
    KeyTypeRequiredDescription
    idstringYesUnique identifier for the provider
    globalConnectionLimitintegerNoInteger specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited
    protocolstringYesThe protocol for this provider. Must be http for this provider type
    hoststringYesThe host to pull data from (e.g. nasa.gov)
    usernamestringNoConfigured username for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication
    passwordstringOnly if username is specifiedConfigured password for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication
    portintegerNoPort to connect to the provider on. Defaults to 80
    allowedRedirectsstring[]NoOnly hosts in this list will have the provider username/password forwarded for authentication. Entries should be specified as host.com or host.com:7000 if redirect port is different than the provider port.
    certificateUristringNoSSL Certificate S3 URI for custom or self-signed SSL (TLS) certificate
    https
    KeyTypeRequiredDescription
    idstringYesUnique identifier for the provider
    globalConnectionLimitintegerNoInteger specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited
    protocolstringYesThe protocol for this provider. Must be https for this provider type
    hoststringYesThe host to pull data from (e.g. nasa.gov)
    usernamestringNoConfigured username for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication
    passwordstringOnly if username is specifiedConfigured password for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication
    portintegerNoPort to connect to the provider on. Defaults to 443
    allowedRedirectsstring[]NoOnly hosts in this list will have the provider username/password forwarded for authentication. Entries should be specified as host.com or host.com:7000 if redirect port is different than the provider port.
    certiciateUristringNoSSL Certificate S3 URI for custom or self-signed SSL (TLS) certificate
    ftp
Key | Type | Required | Description
id | string | Yes | Unique identifier for the provider
globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited
protocol | string | Yes | The protocol for this provider. Must be ftp for this provider type
host | string | Yes | The ftp host to pull data from (e.g. nasa.gov)
username | string | No | Username to use to connect to the ftp server. Cumulus encrypts this using KMS. Defaults to anonymous if not defined
password | string | No | Password to use to connect to the ftp server. Cumulus encrypts this using KMS. Defaults to password if not defined
port | integer | No | Port to connect to the provider on. Defaults to 21
    sftp
Key | Type | Required | Description
id | string | Yes | Unique identifier for the provider
globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited
protocol | string | Yes | The protocol for this provider. Must be sftp for this provider type
host | string | Yes | The sftp host to pull data from (e.g. nasa.gov)
username | string | No | Username to use to connect to the sftp server.
password | string | No | Password to use to connect to the sftp server.
port | integer | No | Port to connect to the provider on. Defaults to 22
privateKey | string | No | filename assumed to be in s3://bucketInternal/stackName/crypto
cmKeyId | string | No | AWS KMS Customer Master Key arn or alias
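
To make the tables above concrete, a minimal S3 provider and a minimal HTTPS provider definition might look like the following sketches (the id, host, username, and password values are hypothetical placeholders, not values from any example deployment):

{
  "id": "MY_S3_PROVIDER",
  "protocol": "s3",
  "host": "my-staging-bucket",
  "globalConnectionLimit": 10
}

{
  "id": "MY_HTTPS_PROVIDER",
  "protocol": "https",
  "host": "data.example.gov",
  "port": 443,
  "username": "my-username",
  "password": "my-password",
  "allowedRedirects": ["data.example.gov"]
}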

    Collections

    Break down of [s3_MOD09GQ_006.json](https://github.com/nasa/cumulus/blob/master/example/data/collections/s3_MOD09GQ_006/s3_MOD09GQ_006.json)
Key | Value | Required | Description
name | "MOD09GQ" | Yes | The name attribute designates the name of the collection. This is the name under which the collection will be displayed on the dashboard
version | "006" | Yes | A version tag for the collection
granuleId | "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$" | Yes | The regular expression used to validate the granule ID extracted from filenames according to the granuleIdExtraction
granuleIdExtraction | "(MOD09GQ\..*)(\.hdf|\.cmr|_ndvi\.jpg)" | Yes | The regular expression used to extract the granule ID from filenames. The first capturing group extracted from the filename by the regex will be used as the granule ID.
sampleFileName | "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf" | Yes | An example filename belonging to this collection
files | <JSON Object> of files defined here | Yes | Describe the individual files that will exist for each granule in this collection (size, browse, meta, etc.)
dataType | "MOD09GQ" | No | Can be specified, but this value will default to the collection name if not provided
duplicateHandling | "replace" | No | ("replace"|"version"|"skip") determines granule duplicate handling scheme
ignoreFilesConfigForDiscovery | false (default) | No | By default, during discovery only files that match one of the regular expressions in this collection's files attribute (see above) are ingested. Setting this to true will ignore the files attribute during discovery, meaning that all files for a granule (i.e., all files with filenames matching granuleIdExtraction) will be ingested even when they don't match a regular expression in the files attribute at discovery time. (NOTE: this attribute does not appear in the example file, but is listed here for completeness.)
process | "modis" | No | Example options for this are found in the ChooseProcess step definition in the IngestAndPublish workflow definition
meta | <JSON Object> of MetaData for the collection | No | MetaData for the collection. This metadata will be available to workflows for this collection via the Cumulus Message Adapter.
url_path | "{cmrMetadata.Granule.Collection.ShortName}/{substring(file.fileName, 0, 3)}" | No | Filename without extension
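
Assembled from the table above, the top level of the example collection definition looks roughly like the sketch below (the files array is described in the next section, regex values are JSON-escaped, and the linked s3_MOD09GQ_006.json remains the authoritative version):

{
  "name": "MOD09GQ",
  "version": "006",
  "granuleId": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$",
  "granuleIdExtraction": "(MOD09GQ\\..*)(\\.hdf|\\.cmr|_ndvi\\.jpg)",
  "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
  "dataType": "MOD09GQ",
  "process": "modis",
  "duplicateHandling": "replace",
  "url_path": "{cmrMetadata.Granule.Collection.ShortName}/{substring(file.fileName, 0, 3)}",
  "files": []
}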

    files-object

Key | Value | Required | Description
regex | "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\.hdf$" | Yes | Regular expression used to identify the file
sampleFileName | "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf" | Yes | Filename used to validate the provided regex
type | "data" | No | Value to be assigned to the Granule File Type. CNM types are used by Cumulus CMR steps, non-CNM values will be treated as 'data' type. Currently only utilized in DiscoverGranules task
bucket | "internal" | Yes | Name of the bucket where the file will be stored
url_path | "${collectionShortName}/{substring(file.fileName, 0, 3)}" | No | Folder used to save the granule in the bucket. Defaults to the collection url_path
checksumFor | "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\.hdf$" | No | If this is a checksum file, set checksumFor to the regex of the target file.
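
For example, a single entry in the collection's files array matching the table above could be written as:

{
  "regex": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\.hdf$",
  "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
  "type": "data",
  "bucket": "internal",
  "url_path": "${collectionShortName}/{substring(file.fileName, 0, 3)}"
}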

    Rules

Rules are used to start processing workflows and the transformation process. Rules can be invoked manually, run on a schedule, or be triggered by events in Kinesis, SNS messages, or SQS messages.

    Rule configuration
Key | Value | Required | Description
name | "L2_HR_PIXC_kinesisRule" | Yes | Name of the rule. This is the name under which the rule will be listed on the dashboard
workflow | "CNMExampleWorkflow" | Yes | Name of the workflow to be run. A list of available workflows can be found on the Workflows page
provider | "PODAAC_SWOT" | No | Configured provider's ID. This can be found on the Providers dashboard page
collection | <JSON Object> collection object shown below | Yes | Name and version of the collection this rule will moderate. Relates to a collection configured and found in the Collections page
payload | <JSON Object or Array> | No | The payload to be passed to the workflow
meta | <JSON Object> of MetaData for the rule | No | MetaData for the rule. This metadata will be available to workflows for this rule via the Cumulus Message Adapter.
rule | <JSON Object> rule type and associated values - discussed below | Yes | Object defining the type and subsequent attributes of the rule
state | "ENABLED" | No | ("ENABLED"|"DISABLED") whether or not the rule will be active. Defaults to "ENABLED".
queueUrl | https://sqs.us-east-1.amazonaws.com/1234567890/queue-name | No | URL for SQS queue that will be used to schedule workflows for this rule
tags | ["kinesis", "podaac"] | No | An array of strings that can be used to simplify search

    collection-object

Key | Value | Required | Description
name | "L2_HR_PIXC" | Yes | Name of a collection defined/configured in the Collections dashboard page
version | "000" | Yes | Version number of a collection defined/configured in the Collections dashboard page

    meta-object

Key | Value | Required | Description
retries | 3 | No | Number of retries on errors, for sqs-type rule only. Defaults to 3.
visibilityTimeout | 900 | No | VisibilityTimeout in seconds for the inflight messages, for sqs-type rule only. Defaults to the visibility timeout of the SQS queue when the rule is created.

    rule-object

Key | Value | Required | Description
type | "kinesis" | Yes | ("onetime"|"scheduled"|"kinesis"|"sns"|"sqs") type of scheduling/workflow kick-off desired
value | <String> Object | Depends | Discussion of valid values is below

    rule-value

The rule value entry depends on the type of rule:

    • If this is a onetime rule this can be left blank. Example
    • If this is a scheduled rule this field must hold a valid cron-type expression or rate expression.
    • If this is a kinesis rule, this must be a configured ${Kinesis_stream_ARN}. Example
    • If this is an sns rule, this must be an existing ${SNS_Topic_Arn}. Example
    • If this is an sqs rule, this must be an existing ${SQS_QueueUrl} that your account has permissions to access, and also you must configure a dead-letter queue for this SQS queue. Example

    sqs-type rule features

    • When an SQS rule is triggered, the SQS message remains on the queue.
    • The SQS message is not processed multiple times in parallel when visibility timeout is properly set. You should set the visibility timeout to the maximum expected length of the workflow with padding. Longer is better to avoid parallel processing.
    • The SQS message visibility timeout can be overridden by the rule.
    • Upon successful workflow execution, the SQS message is removed from the queue.
• Upon failed execution(s), the workflow is re-run 3 times by default, or the number of times configured via the rule's meta.retries.
    • Upon failed execution(s), the visibility timeout will be set to 5s to allow retries.
    • After configured number of failed retries, the SQS message is moved to the dead-letter queue configured for the SQS queue.
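
Putting the rule, meta, and rule value pieces together, a complete sqs-type rule might look like the following sketch (the queue URL, workflow, provider, and collection values here are hypothetical and must match resources configured in your own deployment, and the referenced queue must have a dead-letter queue configured as noted above):

{
  "name": "my_sqs_rule",
  "workflow": "CNMExampleWorkflow",
  "provider": "PODAAC_SWOT",
  "collection": {
    "name": "L2_HR_PIXC",
    "version": "000"
  },
  "meta": {
    "retries": 3,
    "visibilityTimeout": 900
  },
  "rule": {
    "type": "sqs",
    "value": "https://sqs.us-east-1.amazonaws.com/1234567890/queue-name"
  },
  "state": "ENABLED"
}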

    Configuration Via Cumulus Dashboard

    Create A Provider

    • In the Cumulus dashboard, go to the Provider page.

    Screenshot of Create Provider form

    • Click on Add Provider.
    • Fill in the form and then submit it.

    Screenshot of Create Provider form

    Create A Collection

    • Go to the Collections page.

    Screenshot of the Collections page

    • Click on Add Collection.
    • Copy and paste or fill in the collection JSON object form.

    Screenshot of Add Collection form

    • Once you submit the form, you should be able to verify that your new collection is in the list.

    Create A Rule

    1. Go To Rules Page
    • Go to the Cumulus dashboard, click on Rules in the navigation.
    • Click Add Rule.

    Screenshot of Rules page

2. Complete Form
    • Fill out the template form.

    Screenshot of a Rules template for adding a new rule

    For more details regarding the field definitions and required information go to Data Cookbooks.

    Note: If the state field is left blank, it defaults to false.

    Rule Examples

    • A rule form with completed required fields:

    Screenshot of a completed rule form

    • A successfully added Rule:

    Screenshot of created rule

    - + \ No newline at end of file diff --git a/docs/v14.1.0/configuration/lifecycle-policies/index.html b/docs/v14.1.0/configuration/lifecycle-policies/index.html index d7777cc6be0..24945b04add 100644 --- a/docs/v14.1.0/configuration/lifecycle-policies/index.html +++ b/docs/v14.1.0/configuration/lifecycle-policies/index.html @@ -5,13 +5,13 @@ Setting S3 Lifecycle Policies | Cumulus Documentation - +
    Version: v14.1.0

    Setting S3 Lifecycle Policies

    This document will outline, in brief, how to set data lifecycle policies so that you are more easily able to control data storage costs while keeping your data accessible. For more information on why you might want to do this, see the 'Additional Information' section at the end of the document.

    Requirements

    • The AWS CLI installed and configured (if you wish to run the CLI example). See AWS's guide to setting up the AWS CLI for more on this. Please ensure the AWS CLI is in your shell path.
    • You will need a S3 bucket on AWS. You are strongly encouraged to use a bucket without voluminous amounts of data in it for experimenting/learning.
    • An AWS user with the appropriate roles to access the target bucket as well as modify bucket policies.

    Examples

    Walk-through on setting time-based S3 Infrequent Access (S3IA) bucket policy

    This example will give step-by-step instructions on updating a bucket's lifecycle policy to move all objects in the bucket from the default storage to S3 Infrequent Access (S3IA) after a period of 90 days. Below are instructions for walking through configuration via the command line and the management console.

    Command Line

    Please ensure you have the AWS CLI installed and configured for access prior to attempting this example.

    Create policy

From any directory you choose, open an editor and add the following to a file named exampleRule.json:

{
  "Rules": [
    {
      "Status": "Enabled",
      "Filter": {
        "Prefix": ""
      },
      "Transitions": [
        {
          "Days": 90,
          "StorageClass": "STANDARD_IA"
        }
      ],
      "NoncurrentVersionTransitions": [
        {
          "NoncurrentDays": 90,
          "StorageClass": "STANDARD_IA"
        }
      ],
      "ID": "90DayS3IAExample"
    }
  ]
}

    Set policy

    On the command line run the following command (with the bucket you're working with substituted in place of yourBucketNameHere).

    aws s3api put-bucket-lifecycle-configuration --bucket yourBucketNameHere --lifecycle-configuration file://exampleRule.json

    Verify policy has been set

    To obtain all of the existing policies for a bucket, run the following command (again substituting the correct bucket name):

$ aws s3api get-bucket-lifecycle-configuration --bucket yourBucketNameHere
{
  "Rules": [
    {
      "Status": "Enabled",
      "Filter": {
        "Prefix": ""
      },
      "Transitions": [
        {
          "Days": 90,
          "StorageClass": "STANDARD_IA"
        }
      ],
      "NoncurrentVersionTransitions": [
        {
          "NoncurrentDays": 90,
          "StorageClass": "STANDARD_IA"
        }
      ],
      "ID": "90DayS3IAExample"
    }
  ]
}

    You have set a policy that transitions any version of an object in the bucket to S3IA after each object version has not been modified for 90 days.

    Management Console

    Create Policy

    To create the example policy on a bucket via the management console, go to the following URL (replacing 'yourBucketHere' with the bucket you intend to update):

    https://s3.console.aws.amazon.com/s3/buckets/yourBucketHere/?tab=overview

    You should see a screen similar to:

    Screenshot of AWS console for an S3 bucket

    Click the "Management" Tab, then lifecycle button and press + Add lifecycle rule:

    Screenshot of &quot;Management&quot; tab of AWS console for an S3 bucket

    Give the rule a name (e.g. '90DayRule'), leaving the filter blank:

    Screenshot of window for configuring the name and scope of a lifecycle rule on an S3 bucket in the AWS console

    Click next, and mark Current Version and Previous Versions.

Then for each, click + Add transition and select Transition to Standard-IA after for the Object creation field, and set 90 for the Days after creation/Days after objects become noncurrent field. Your screen should look similar to:

    Screenshot of window for configuring the storage class transitions of a lifecycle rule on an S3 bucket in the AWS console

    Click next, then next past the Configure expiration screen (we won't be setting this), and on the fourth page, click Save:

    Screenshot of window for reviewing the configuration of a lifecycle rule on an S3 bucket in the AWS console

    You should now see you have a rule configured for your bucket:

Screenshot of lifecycle rule appearing in the "Management" tab of AWS console for an S3 bucket

    You have now set a policy that transitions any version of an object in the bucket to S3IA after each object has not been modified for 90 days.

    Additional Information

    This section lists information you may want prior to enacting lifecycle policies. It is not required content for working through the examples.

    Strategy Overview

    For a discussion of overall recommended strategy, please review the Methodology for Data Lifecycle Management on the EarthData wiki.

    AWS Documentation

    The examples shown in this document are obviously fairly basic cases. By using object tags, filters and other configuration options you can enact far more complicated policies for various scenarios. For more reading on the topics presented on this page see:

    - + \ No newline at end of file diff --git a/docs/v14.1.0/configuration/monitoring-readme/index.html b/docs/v14.1.0/configuration/monitoring-readme/index.html index 4ac8585e278..bbff0b4fd8a 100644 --- a/docs/v14.1.0/configuration/monitoring-readme/index.html +++ b/docs/v14.1.0/configuration/monitoring-readme/index.html @@ -5,14 +5,14 @@ Monitoring Best Practices | Cumulus Documentation - +
    Version: v14.1.0

    Monitoring Best Practices

    This document intends to provide a set of recommendations and best practices for monitoring the state of a deployed Cumulus and diagnosing any issues.

    Cumulus-provided resources and integrations for monitoring

Cumulus provides a number of resources that are useful for monitoring the system and its operation.

    Cumulus Dashboard

    The primary tool for monitoring the Cumulus system is the Cumulus Dashboard. The dashboard is hosted on Github and includes instructions on how to deploy and link it into your core Cumulus deployment.

    The dashboard displays workflow executions, their status, inputs, outputs, and some diagnostic information such as logs. For further information on the dashboard, its usage, and the information it provides, see the documentation.

    Cumulus-provided AWS resources

    Cumulus sets up CloudWatch log groups for all Core-provided tasks.

    Monitoring Lambda Functions

    Logging for each Lambda Function is available in Lambda-specific CloudWatch log groups.

    Monitoring ECS services

    Each deployed cumulus_ecs_service module also includes a CloudWatch log group for the processes running on ECS.

    Monitoring workflows

    For advanced debugging, we also configure dead letter queues on critical system functions. These will allow you to monitor and debug invalid inputs to the functions we use to start workflows, which can be helpful if you find that you are not seeing workflows being started as expected. More information on these can be found in the dead letter queue documentation

    AWS recommendations

    AWS has a number of recommendations on system monitoring. Rather than reproduce those here and risk providing outdated guidance, we've documented the following links which will take you to available AWS docs on monitoring recommendations and best practices for the services used in Cumulus:

    Example: Setting up email notifications for CloudWatch logs

    Cumulus does not provide out-of-the-box support for email notifications at this time. However, setting up email notifications on AWS is fairly straightforward in that the operative components are an AWS SNS topic and a subscribed email address.

    In terms of Cumulus integration, forwarding CloudWatch logs requires creating a mechanism, most likely a Lambda Function subscribed to the log group that will receive, filter and forward these messages to the SNS topic.

    As a very simple example, we could create a function that filters CloudWatch logs created by the @cumulus/logger package and sends email notifications for error and fatal log levels, adapting the example linked above:

const zlib = require('zlib');
const aws = require('aws-sdk');
const { promisify } = require('util');

const gunzip = promisify(zlib.gunzip);
const sns = new aws.SNS();

exports.handler = async (event) => {
  // CloudWatch Logs delivers the subscribed log batch as base64-encoded, gzipped JSON
  const payload = Buffer.from(event.awslogs.data, 'base64');
  const decompressedData = await gunzip(payload);
  const logData = JSON.parse(decompressedData.toString('ascii'));
  return await Promise.all(logData.logEvents.map(async (logEvent) => {
    const logMessage = JSON.parse(logEvent.message);
    // Only forward @cumulus/logger messages at error/fatal level to the SNS topic
    if (['error', 'fatal'].includes(logMessage.level)) {
      return sns.publish({
        TopicArn: process.env.EmailReportingTopicArn,
        Message: logEvent.message
      }).promise();
    }
    return Promise.resolve();
  }));
};

After creating the SNS topic, we can deploy this code as a Lambda function, following the setup steps from Amazon. Make sure to include your SNS topic ARN as an environment variable on the Lambda function by using the --environment option on aws lambda create-function.

    You will need to create subscription filters for each log group you want to receive emails for. We recommend automating this as much as possible, and you could very well handle this via Terraform, such as using a module to deploy filters alongside log groups, or exporting the log group names to an all-in-one email notification module.

    - + \ No newline at end of file diff --git a/docs/v14.1.0/configuration/server_access_logging/index.html b/docs/v14.1.0/configuration/server_access_logging/index.html index 4e6fa68039d..e6782ffb004 100644 --- a/docs/v14.1.0/configuration/server_access_logging/index.html +++ b/docs/v14.1.0/configuration/server_access_logging/index.html @@ -5,13 +5,13 @@ S3 Server Access Logging | Cumulus Documentation - +
    Version: v14.1.0

    S3 Server Access Logging

    Via AWS Console

    Enable server access logging for an S3 bucket

    Via AWS Command Line Interface

    1. Create a logging.json file with these contents, replacing <stack-internal-bucket> with your stack's internal bucket name, and <stack> with the name of your cumulus stack.

{
  "LoggingEnabled": {
    "TargetBucket": "<stack-internal-bucket>",
    "TargetPrefix": "<stack>/ems-distribution/s3-server-access-logs/"
  }
}
    2. Add the logging policy to each of your protected and public buckets by calling this command on each bucket.

      aws s3api put-bucket-logging --bucket <protected/public-bucket-name> --bucket-logging-status file://logging.json
    3. Verify the logging policy exists on your buckets.

      aws s3api get-bucket-logging --bucket <protected/public-bucket-name>
    - + \ No newline at end of file diff --git a/docs/v14.1.0/configuration/task-configuration/index.html b/docs/v14.1.0/configuration/task-configuration/index.html index 680cbefc8fa..6b1b46d5415 100644 --- a/docs/v14.1.0/configuration/task-configuration/index.html +++ b/docs/v14.1.0/configuration/task-configuration/index.html @@ -5,13 +5,13 @@ Configuration of Tasks | Cumulus Documentation - +
    Version: v14.1.0

    Configuration of Tasks

    The cumulus module exposes values for configuration for some of the provided archive and ingest tasks. Currently the following are available as configurable variables:

    cmr_search_client_config

    Configuration parameters for CMR search client for cumulus archive module tasks in the form:

<lambda_identifier>_report_cmr_limit = <maximum number of records that can be returned from a cmr-client search; this should be greater than cmr_page_size>
    <lambda_identifier>_report_cmr_page_size = <number of records for each page returned from CMR>
    type = map(string)

More information about the CMR limit and CMR page_size can be found in @cumulus/cmr-client and the CMR Search API documentation.

    Currently the following values are supported:

    • create_reconciliation_report_cmr_limit
    • create_reconciliation_report_cmr_page_size

    Example

cmr_search_client_config = {
  create_reconciliation_report_cmr_limit = 2500
  create_reconciliation_report_cmr_page_size = 250
}

    elasticsearch_client_config

    Configuration parameters for Elasticsearch client for cumulus archive module tasks in the form:

    <lambda_identifier>_es_scroll_duration = <duration>
    <lambda_identifier>_es_scroll_size = <size>
    type = map(string)

    Currently the following values are supported:

    • create_reconciliation_report_es_scroll_duration
    • create_reconciliation_report_es_scroll_size

    Example

elasticsearch_client_config = {
  create_reconciliation_report_es_scroll_duration = "15m"
  create_reconciliation_report_es_scroll_size = 2000
}

    lambda_timeouts

    A configurable map of timeouts (in seconds) for cumulus ingest module task lambdas in the form:

    <lambda_identifier>_timeout: <timeout>
    type = map(string)

    Currently the following values are supported:

    • add_missing_file_checksums_task_timeout
    • discover_granules_task_timeout
    • discover_pdrs_task_timeout
    • fake_processing_task_timeout
    • files_to_granules_task_timeout
    • hello_world_task_timeout
    • hyrax_metadata_update_tasks_timeout
    • lzards_backup_task_timeout
    • move_granules_task_timeout
    • parse_pdr_task_timeout
    • pdr_status_check_task_timeout
    • post_to_cmr_task_timeout
    • queue_granules_task_timeout
    • queue_pdrs_task_timeout
    • queue_workflow_task_timeout
    • sf_sqs_report_task_timeout
    • sync_granule_task_timeout
    • update_granules_cmr_metadata_file_links_task_timeout

    Example

lambda_timeouts = {
  discover_granules_task_timeout = 300
}

    lambda_memory_sizes

    A configurable map of memory sizes (in MBs) for cumulus ingest module task lambdas in the form:

    <lambda_identifier>_memory_size: <memory_size>
    type = map(string)

    Currently the following values are supported:

    • add_missing_file_checksums_task_memory_size
    • discover_granules_task_memory_size
    • discover_pdrs_task_memory_size
    • fake_processing_task_memory_size
    • hyrax_metadata_updates_task_memory_size
    • lzards_backup_task_memory_size
    • move_granules_task_memory_size
    • parse_pdr_task_memory_size
    • pdr_status_check_task_memory_size
    • post_to_cmr_task_memory_size
    • queue_granules_task_memory_size
    • queue_pdrs_task_memory_size
    • queue_workflow_task_memory_size
    • sf_sqs_report_task_memory_size
    • sync_granule_task_memory_size
    • update_cmr_acess_constraints_task_memory_size
    • update_granules_cmr_metadata_file_links_task_memory_size

    Example

lambda_memory_sizes = {
  queue_granules_task_memory_size = 1036
}
    - + \ No newline at end of file diff --git a/docs/v14.1.0/data-cookbooks/about-cookbooks/index.html b/docs/v14.1.0/data-cookbooks/about-cookbooks/index.html index df9ceb0c2f6..3d9417d5ac2 100644 --- a/docs/v14.1.0/data-cookbooks/about-cookbooks/index.html +++ b/docs/v14.1.0/data-cookbooks/about-cookbooks/index.html @@ -5,13 +5,13 @@ About Cookbooks | Cumulus Documentation - +
    Version: v14.1.0

    About Cookbooks

    Introduction

The data cookbooks are documents containing examples and explanations of workflows in the Cumulus framework. Additionally, they should serve to help unify an institution/user group on a set of terms.

    Setup

    The data cookbooks assume you can configure providers, collections, and rules to run workflows. Visit Cumulus data management types for information on how to configure Cumulus data management types.

    Adding a page

    As shown in detail in the "Add a New Page and Sidebars" section in Cumulus Docs: How To's, you can add a new page to the data cookbook by creating a markdown (.md) file in the docs/data-cookbooks directory. The new page can then be linked to the sidebar by adding it to the Data-Cookbooks object in the website/sidebar.json file as data-cookbooks/${id}.
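
As a rough sketch (the exact sidebar file structure can vary between Docusaurus versions, and my-new-cookbook is a hypothetical page id), adding a page at docs/data-cookbooks/my-new-cookbook.md would mean extending the Data-Cookbooks entry with something like:

{
  "Data-Cookbooks": [
    "data-cookbooks/about-cookbooks",
    "data-cookbooks/my-new-cookbook"
  ]
}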

    More about workflows

    Workflow general information

    Input & Output

    Developing Workflow Tasks

    Workflow Configuration How-to's

    - + \ No newline at end of file diff --git a/docs/v14.1.0/data-cookbooks/browse-generation/index.html b/docs/v14.1.0/data-cookbooks/browse-generation/index.html index a9fda9ce320..ed8a9146aad 100644 --- a/docs/v14.1.0/data-cookbooks/browse-generation/index.html +++ b/docs/v14.1.0/data-cookbooks/browse-generation/index.html @@ -5,7 +5,7 @@ Ingest Browse Generation | Cumulus Documentation - + @@ -15,7 +15,7 @@ provider keys with the previously entered values) Note that you need to set the "provider_path" to the path on your bucket (e.g. "/data") that you've staged your mock/test data.:

{
  "name": "TestBrowseGeneration",
  "workflow": "DiscoverGranulesBrowseExample",
  "provider": "{{provider_from_previous_step}}",
  "collection": {
    "name": "MOD09GQ",
    "version": "006"
  },
  "meta": {
    "provider_path": "{{path_to_data}}"
  },
  "rule": {
    "type": "onetime"
  },
  "state": "ENABLED",
  "updatedAt": 1553053438767
}

    Run Workflows

    Once you've configured the Collection and Provider and added a onetime rule, you're ready to trigger your rule, and watch the ingest workflows process.

    Go to the Rules tab, click the rule you just created:

    Screenshot of the Rules overview page with a list of rules in the Cumulus dashboard

    Then click the gear in the upper right corner and click "Rerun":

    Screenshot of clicking the button to rerun a workflow rule from the rule edit page in the Cumulus dashboard

    Tab over to executions and you should see the DiscoverGranulesBrowseExample workflow run, succeed, and then moments later the CookbookBrowseExample should run and succeed.

    Screenshot of page listing executions in the Cumulus dashboard

    Results

    You can verify your data has ingested by clicking the successful workflow entry:

    Screenshot of individual entry from table listing executions in the Cumulus dashboard

    Select "Show Output" on the next page

    Screenshot of &quot;Show output&quot; button from individual execution page in the Cumulus dashboard

    and you should see in the payload from the workflow something similar to:

    "payload": {
    "process": "modis",
    "granules": [
    {
    "files": [
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "key": "MOD09GQ___006/2017/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "type": "data",
    "bucket": "cumulus-test-sandbox-protected",
    "path": "data",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}/{substring(file.fileName, 0, 3)}",
    "size": 1908635
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "key": "MOD09GQ___006/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "type": "metadata",
    "bucket": "cumulus-test-sandbox-private",
    "path": "data",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{substring(file.fileName, 0, 3)}",
    "size": 21708
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
    "key": "MOD09GQ___006/2017/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
    "type": "browse",
    "bucket": "cumulus-test-sandbox-protected",
    "path": "data",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}/{substring(file.fileName, 0, 3)}",
    "size": 1908635
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.cmr.xml",
    "key": "MOD09GQ___006/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.cmr.xml",
    "type": "metadata",
    "bucket": "cumulus-test-sandbox-protected-2",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{substring(file.fileName, 0, 3)}"
    }
    ],
    "cmrLink": "https://cmr.uat.earthdata.nasa.gov/search/granules.json?concept_id=G1222231611-CUMULUS",
    "cmrConceptId": "G1222231611-CUMULUS",
    "granuleId": "MOD09GQ.A2016358.h13v04.006.2016360104606",
    "cmrMetadataFormat": "echo10",
    "dataType": "MOD09GQ",
    "version": "006",
    "published": true
    }
    ]
    }

You can verify the granules exist within your Cumulus instance (search using the Granules interface, check the S3 buckets, etc.) and validate that the above CMR entry is present (e.g. by following the cmrLink).


    Build Processing Lambda

    This section discusses the construction of a custom processing lambda to replace the contrived example from this entry for a real dataset processing task.

    To ingest your own data using this example, you will need to construct your own lambda to replace the source in ProcessingStep that will generate browse imagery and provide or update a CMR metadata export file.

You will then need to add the lambda to your Cumulus deployment as an aws_lambda_function Terraform resource.

    The discussion below outlines requirements for this lambda.

    Inputs

    The incoming message to the task defined in the ProcessingStep as configured will have the following configuration values (accessible inside event.config courtesy of the message adapter):

    Configuration

    • event.config.bucket -- the name of the bucket configured in terraform.tfvars as your internal bucket.

    • event.config.collection -- The full collection object we will configure in the Configure Ingest section. You can view the expected collection schema in the docs here or in the source code on github. You need this as available input and output so you can update as needed.

    event.config.additionalUrls, generateFakeBrowse and event.config.cmrMetadataFormat from the example can be ignored as they're configuration flags for the provided example script.
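
For orientation, the relevant part of event.config received by this task can be sketched as follows (the bucket name is illustrative, and the collection object is elided down to a few fields; in practice it is the full collection record configured above):

"config": {
  "bucket": "cumulus-test-sandbox-internal",
  "collection": {
    "name": "MOD09GQ",
    "version": "006",
    "files": []
  }
}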

    Payload

    The 'payload' from the previous task is accessible via event.input. The expected payload output schema from SyncGranules can be viewed here.

    In our example, the payload would look like the following. Note: The types are set per-file based on what we configured in our collection, and were initially added as part of the DiscoverGranules step in the DiscoverGranulesBrowseExample workflow.

     "payload": {
    "process": "modis",
    "granules": [
    {
    "granuleId": "MOD09GQ.A2016358.h13v04.006.2016360104606",
    "dataType": "MOD09GQ",
    "version": "006",
    "files": [
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "bucket": "cumulus-test-sandbox-internal",
    "key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "size": 1908635
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "bucket": "cumulus-test-sandbox-internal",
    "key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "size": 21708
    }
    ]
    }
    ]
    }

    Generating Browse Imagery

The example script provided goes through all granules and adds a 'fake' .jpg browse file to the same staging location as the data staged by prior ingest tasks.

    The processing lambda you construct will need to do the following:

• Create a browse image file based on the input data, and stage it to a location accessible to both this task and the FilesToGranules and MoveGranules tasks in an S3 bucket.
    • Add the browse file to the input granule files, making sure to set the granule file's type to browse.
    • Update meta.input_granules with the updated granules list, as well as provide the files to be integrated by FilesToGranules as output from the task.

    Generating/updating CMR metadata

    If you do not already have a CMR file in the granules list, you will need to generate one for valid export. This example's processing script generates and adds it to the FilesToGranules file list via the payload but it can be present in the InputGranules from the DiscoverGranules task as well if you'd prefer to pre-generate it.

The downstream tasks MoveGranules, UpdateGranulesCmrMetadataFileLinks, and PostToCmr all expect a valid CMR file to be available if you want to export to CMR.

    Expected Outputs for processing task/tasks

    In the above example, the critical portion of the output to FilesToGranules is the payload and meta.input_granules.

In the example provided, the processing task is set up to return an object with the keys "files" and "granules". In the cumulus_message configuration, files is mapped to the payload and granules to meta.input_granules:

              "task_config": {
    "inputGranules": "{$.meta.input_granules}",
    "granuleIdExtraction": "{$.meta.collection.granuleIdExtraction}"
    }

    Their expected values from the example above may be useful in constructing a processing task:

    payload

    The payload includes a full list of files to be 'moved' into the cumulus archive. The FilesToGranules task will take this list, merge it with the information from InputGranules, then pass that list to the MoveGranules task. The MoveGranules task will then move the files to their targets. The UpdateGranulesCmrMetadataFileLinks task will update the CMR metadata file if it exists with the updated granule locations and update the CMR file etags.

    In the provided example, a payload being passed to the FilesToGranules task should be expected to look like:

      "payload": [
    "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
    "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.cmr.xml"
    ]

This is the list of files that FilesToGranules will act upon to add/merge with the input_granules object.

    The pathing is generated from sync-granules, but in principle the files can be staged wherever you like so long as the processing/MoveGranules task's roles have access and the filename matches the collection configuration.

    input_granules

The FilesToGranules task utilizes the incoming payload to choose which files to move, but pulls all other metadata from meta.input_granules. As such, the output payload in the example would look like:

    "input_granules": [
    {
    "granuleId": "MOD09GQ.A2016358.h13v04.006.2016360104606",
    "dataType": "MOD09GQ",
    "version": "006",
    "files": [
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "bucket": "cumulus-test-sandbox-internal",
    "key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "size": 1908635
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "bucket": "cumulus-test-sandbox-internal",
    "key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "size": 21708
    },
    {
    "fileName": "MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
    "bucket": "cumulus-test-sandbox-internal",
    "key": "file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.jpg"
    }
    ]
    }
    ],
    - + \ No newline at end of file diff --git a/docs/v14.1.0/data-cookbooks/choice-states/index.html b/docs/v14.1.0/data-cookbooks/choice-states/index.html index 5bab1512d01..64236681d9d 100644 --- a/docs/v14.1.0/data-cookbooks/choice-states/index.html +++ b/docs/v14.1.0/data-cookbooks/choice-states/index.html @@ -5,13 +5,13 @@ Choice States | Cumulus Documentation - +
    Version: v14.1.0

    Choice States

    Cumulus supports AWS Step Function Choice states. A Choice state enables branching logic in Cumulus workflows.

    Choice state definitions include a list of Choice Rules. Each Choice Rule defines a logical operation which compares an input value against a value using a comparison operator. For available comparison operators, review the AWS docs.

    If the comparison evaluates to true, the Next state is followed.

    Example

    In examples/cumulus-tf/parse_pdr_workflow.tf the ParsePdr workflow uses a Choice state, CheckAgainChoice, to terminate the workflow once meta.isPdrFinished: true is returned by the CheckStatus state.

    The CheckAgainChoice state definition requires an input object of the following structure:

{
  "meta": {
    "isPdrFinished": false
  }
}

    Given the above input to the CheckAgainChoice state, the workflow would transition to the PdrStatusReport state.

    "CheckAgainChoice": {
    "Type": "Choice",
    "Choices": [
    {
    "Variable": "$.meta.isPdrFinished",
    "BooleanEquals": false,
    "Next": "PdrStatusReport"
    },
    {
    "Variable": "$.meta.isPdrFinished",
    "BooleanEquals": true,
    "Next": "WorkflowSucceeded"
    }
    ],
    "Default": "WorkflowSucceeded"
    }

    Advanced: Loops in Cumulus Workflows

    Understanding the complete ParsePdr workflow is not necessary to understanding how Choice states work, but ParsePdr provides an example of how Choice states can be used to create a loop in a Cumulus workflow.

In the complete ParsePdr workflow definition, the state QueueGranules is followed by CheckStatus. From CheckStatus a loop starts: given that CheckStatus returns meta.isPdrFinished: false, CheckStatus is followed by CheckAgainChoice, which is followed by PdrStatusReport, which is followed by WaitForSomeTime, which returns to CheckStatus. Once CheckStatus returns meta.isPdrFinished: true, CheckAgainChoice proceeds to WorkflowSucceeded.

    Execution graph of SIPS ParsePdr workflow in AWS Step Functions console

    Further documentation

    For complete details on Choice state configuration options, see the Choice state documentation.

    - + \ No newline at end of file diff --git a/docs/v14.1.0/data-cookbooks/cnm-workflow/index.html b/docs/v14.1.0/data-cookbooks/cnm-workflow/index.html index 7de26551346..8019aa8c476 100644 --- a/docs/v14.1.0/data-cookbooks/cnm-workflow/index.html +++ b/docs/v14.1.0/data-cookbooks/cnm-workflow/index.html @@ -5,7 +5,7 @@ CNM Workflow | Cumulus Documentation - + @@ -13,7 +13,7 @@
    Version: v14.1.0

    CNM Workflow

    This entry documents how to setup a workflow that utilizes the built-in CNM/Kinesis functionality in Cumulus.

    Prior to working through this entry you should be familiar with the Cloud Notification Mechanism.

    Sections


    Prerequisites

    Cumulus

    This entry assumes you have a deployed instance of Cumulus (version >= 1.16.0). The entry assumes you are deploying Cumulus via the cumulus terraform module sourced from the release page.

    AWS CLI

    This entry assumes you have the AWS CLI installed and configured. If you do not, please take a moment to review the documentation - particularly the examples relevant to Kinesis - and install it now.

    Kinesis

This entry assumes you already have two Kinesis data streams created for use as the CNM notification and response data streams.

If you do not have two streams set up, please take a moment to review the Kinesis documentation and set up two basic single-shard streams for this example:

Using the "Create Data Stream" button on the Kinesis Dashboard, work through the dialogue. You should be able to quickly set up streams similar to the following example:

    Screenshot of AWS console page for creating a Kinesis stream

    Please bear in mind that your {{prefix}}-lambda-processing IAM role will need permissions to write to the response stream for this workflow to succeed if you create the Kinesis stream with a dashboard user. If you are using the cumulus top-level module for your deployment this should be set properly.

If not, the most straightforward approach is to attach the AmazonKinesisFullAccess policy for the stream resource to whatever role your Lambdas are using, however your environment/security policies may require an approach specific to your deployment environment.

In operational environments, science data providers would typically be responsible for providing a Kinesis stream with the appropriate permissions.

    For more information on how this process works and how to develop a process that will add records to a stream, read the Kinesis documentation and the developer guide.

    Source Data

    This entry will run the SyncGranule task against a single target data file. To that end it will require a single data file to be present in an S3 bucket matching the Provider configured in the next section.

    Collection and Provider

    Cumulus will need to be configured with a Collection and Provider entry of your choosing. The provider should match the location of the source data from the Ingest Source Data section.

This can be done via the Cumulus Dashboard (if installed) or the API. It is strongly recommended to use the dashboard if possible.


    Configure the Workflow

    Provided the prerequisites have been fulfilled, you can begin adding the needed values to your Cumulus configuration to configure the example workflow.

    The following are steps that are required to set up your Cumulus instance to run the example workflow:

    Example CNM Workflow

    In this example, we're going to trigger a workflow by creating a Kinesis rule and sending a record to a Kinesis stream.

    The following workflow definition should be added to a new .tf workflow resource (e.g. cnm_workflow.tf) in your deployment directory. For the complete CNM workflow example, see examples/cumulus-tf/cnm_workflow.tf.

    Add the following to the new terraform file in your deployment directory, updating the following:

    • Set the response-endpoint key in the CnmResponse task in the workflow JSON to match the name of the Kinesis response stream you configured in the prerequisites section
    • Update the source key to the workflow module to match the Cumulus release associated with your deployment.
    module "cnm_workflow" {
    source = "https://github.com/nasa/cumulus/releases/download/{version}/terraform-aws-cumulus-workflow.zip"

    prefix = var.prefix
    name = "CNMExampleWorkflow"
    workflow_config = module.cumulus.workflow_config
    system_bucket = var.system_bucket

state_machine_definition = <<JSON
{
"CNMExampleWorkflow": {
    "Comment": "CNMExampleWorkflow",
    "StartAt": "TranslateMessage",
    "States": {
    "TranslateMessage": {
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_to_cma_task.arn}",
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "collection": "{$.meta.collection}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.cnm}",
    "destination": "{$.meta.cnm}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "CnmResponse"
    }
    ],
    "Next": "SyncGranule"
    },
    "SyncGranule": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "provider": "{$.meta.provider}",
    "buckets": "{$.meta.buckets}",
    "collection": "{$.meta.collection}",
    "downloadBucket": "{$.meta.buckets.private.name}",
    "stack": "{$.meta.stack}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.granules}",
    "destination": "{$.meta.input_granules}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${module.cumulus.sync_granule_task.task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "IntervalSeconds": 10,
    "MaxAttempts": 3
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "CnmResponse"
    }
    ],
    "Next": "CnmResponse"
    },
    "CnmResponse": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "OriginalCNM": "{$.meta.cnm}",
    "distribution_endpoint": "{$.meta.distribution_endpoint}",
    "response-endpoint": "ADD YOUR RESPONSE STREAM NAME HERE",
    "region": "us-east-1",
    "type": "kinesis",
    "WorkflowException": "{$.exception}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.cnm}",
    "destination": "{$.meta.cnmResponse}"
    },
    {
    "source": "{$.input.input}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_response_task.arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "IntervalSeconds": 5,
    "MaxAttempts": 3
    }
    ],
    "End": true
    }
    }
    }
    }
JSON
}

    Again, please make sure to modify the value response-endpoint to match the stream name (not ARN) for your Kinesis response stream.

    Lambda Configuration

    To execute this workflow, you're required to include several Lambda resources in your deployment. To do this, add the following task (Lambda) definitions to your deployment along with the workflow you created above:

    Please note: To utilize these tasks you need to ensure you have a compatible CMA layer. See the deployment instructions for more details on how to deploy a CMA layer.

    Below is a description of each of these tasks:

    CNMToCMA

    CNMToCMA is meant for the beginning of a workflow: it maps CNM granule information to a payload for downstream tasks. For other CNM workflows, you would need to ensure that downstream tasks in your workflow either understand the CNM message or include a translation task like this one.

    You can also manipulate the data sent to downstream tasks using task_config for various states in your workflow resource configuration. Read more about how to configure data on the Workflow Input & Output page.

    CnmResponse

    The CnmResponse Lambda generates a CNM response message and puts it on the response-endpoint Kinesis stream.

    You can read more about the expected schema of a CnmResponse record in the Cloud Notification Mechanism schema repository.

    Additional Tasks

    Lastly, this entry also makes use of the SyncGranule task from the cumulus module.

    Redeploy

    Once the above configuration changes have been made, redeploy your stack.

    Please refer to Update Cumulus resources in the deployment documentation if you are unfamiliar with redeployment.

    Rule Configuration

    Cumulus includes a messageConsumer Lambda function (message-consumer). Cumulus kinesis-type rules create the event source mappings between Kinesis streams and the messageConsumer Lambda. The messageConsumer Lambda consumes records from one or more Kinesis streams, as defined by enabled kinesis-type rules. When new records are pushed to one of these streams, the messageConsumer triggers workflows associated with the enabled kinesis-type rules.

    To add a rule via the dashboard (if you'd like to use the API, see the docs here), navigate to the Rules page and click Add a rule, then configure the new rule using the following template (substituting correct values for parameters denoted by ${}):

{
  "collection": {
    "name": "L2_HR_PIXC",
    "version": "000"
  },
  "name": "L2_HR_PIXC_kinesisRule",
  "provider": "PODAAC_SWOT",
  "rule": {
    "type": "kinesis",
    "value": "arn:aws:kinesis:{{awsRegion}}:{{awsAccountId}}:stream/{{streamName}}"
  },
  "state": "ENABLED",
  "workflow": "CNMExampleWorkflow"
}

    Please Note:

• The rule's value attribute must match the Amazon Resource Name (ARN) for the Kinesis data stream you've preconfigured. You should be able to obtain this ARN from the Kinesis Dashboard entry for the selected stream.
• The collection and provider should match the collection and provider you set up in the Prerequisites section.

Once you've clicked 'submit', a new rule should appear in the dashboard's Rule Overview.


    Execute the Workflow

    Once Cumulus has been redeployed and a rule has been added, we're ready to trigger the workflow and watch it execute.

    How to Trigger the Workflow

    To trigger matching workflows, you will need to put a record on the Kinesis stream that the message-consumer Lambda will recognize as a matching event. Most importantly, it should include a collection name that matches a valid collection.

    For the purpose of this example, the easiest way to accomplish this is using the AWS CLI.

    Create Record JSON

    Construct a JSON file containing an object that matches the values that have been previously setup. This JSON object should be a valid Cloud Notification Mechanism message.

    Please note: this example is somewhat contrived, as the downstream tasks don't care about most of these fields. A 'real' data ingest workflow would.

    The following values (denoted by ${} in the sample below) should be replaced to match values we've previously configured:

    • TEST_DATA_FILE_NAME: The filename of the test data that is available in the S3 (or other) provider we created earlier.
    • TEST_DATA_URI: The full S3 path to the test data (e.g. s3://bucket-name/path/granule)
    • COLLECTION: The collection name defined in the prerequisites for this product
    {
    "product": {
    "files": [
    {
    "checksumType": "md5",
    "name": "${TEST_DATA_FILE_NAME}",
    "checksum": "bogus_checksum_value",
    "uri": "${TEST_DATA_URI}",
    "type": "data",
    "size": 12345678
    }
    ],
    "name": "${TEST_DATA_FILE_NAME}",
    "dataVersion": "006"
    },
    "identifier ": "testIdentifier123456",
    "collection": "${COLLECTION}",
    "provider": "TestProvider",
    "version": "001",
    "submissionTime": "2017-09-30T03:42:29.791198"
    }

    Add Record to Kinesis Data Stream

    Using the JSON file you created, push it to the Kinesis notification stream:

    aws kinesis put-record --stream-name YOUR_KINESIS_NOTIFICATION_STREAM_NAME_HERE --partition-key 1 --data file:///path/to/file.json

    Please note: The above command uses the stream name, not the ARN.

    The command should return output similar to:

    {
    "ShardId": "shardId-000000000000",
    "SequenceNumber": "42356659532578640215890215117033555573986830588739321858"
    }

    This command will put a record containing the JSON from the --data flag onto the Kinesis data stream. The messageConsumer Lambda will consume the record and construct a valid CMA payload to trigger workflows. For this example, the record will trigger the CNMExampleWorkflow workflow as defined by the rule previously configured.

    You can view the current running executions on the Executions dashboard page which presents a list of all executions, their status (running, failed, or completed), to which workflow the execution belongs, along with other information.

    Verify Workflow Execution

As detailed above, once the record is added to the Kinesis data stream, the messageConsumer Lambda will trigger the CNMExampleWorkflow.

    TranslateMessage

    TranslateMessage (which corresponds to the CNMToCMA Lambda) will take the CNM object payload and add a granules object to the CMA payload that's consistent with other Cumulus ingest tasks, and add a meta.cnm key (as well as the payload) to store the original message.

    For more on the Message Adapter, please see the Message Flow documentation.

    An example of what is happening in the CNMToCMA Lambda is as follows:

    Example Input Payload:

    "payload": {
    "identifier ": "testIdentifier123456",
    "product": {
    "files": [
    {
    "checksumType": "md5",
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "checksum": "bogus_checksum_value",
    "uri": "s3://some_bucket/cumulus-test-data/pdrs/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "type": "data",
    "size": 12345678
    }
    ],
    "name": "TestGranuleUR",
    "dataVersion": "006"
    },
    "version": "123456",
    "collection": "MOD09GQ",
    "provider": "TestProvider",
    "submissionTime": "2017-09-30T03:42:29.791198"
    }

    Example Output Payload:

      "payload": {
    "cnm": {
    "identifier ": "testIdentifier123456",
    "product": {
    "files": [
    {
    "checksumType": "md5",
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "checksum": "bogus_checksum_value",
    "uri": "s3://some-bucket/cumulus-test-data/data/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "type": "data",
    "size": 12345678
    }
    ],
    "name": "TestGranuleUR",
    "dataVersion": "006"
    },
    "version": "123456",
    "collection": "MOD09GQ",
    "provider": "TestProvider",
    "submissionTime": "2017-09-30T03:42:29.791198",
    "receivedTime": "2017-09-30T03:42:31.634552"
    },
    "output": {
    "granules": [
    {
    "granuleId": "TestGranuleUR",
    "files": [
    {
    "path": "some-bucket/data",
    "url_path": "s3://some-bucket/cumulus-test-data/data/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "bucket": "some-bucket",
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "size": 12345678
    }
    ]
    }
    ]
    }
    }

    SyncGranules

    This Lambda will take the files listed in the payload and move them to s3://{deployment-private-bucket}/file-staging/{deployment-name}/{COLLECTION}/{file_name}.

    CnmResponse

Assuming a successful execution of the workflow, this task will recover the meta.cnm key from the CMA output, and add a "SUCCESS" record to the response-endpoint Kinesis stream.

    If a prior step in the workflow has failed, this will add a "FAILURE" record to the stream instead.

    The data written to the response-endpoint should adhere to the Response Message Fields schema.

    Example CNM Success Response:

    {
    "provider": "PODAAC_SWOT",
    "collection": "SWOT_Prod_l2:1",
    "processCompleteTime": "2017-09-30T03:45:29.791198",
    "submissionTime": "2017-09-30T03:42:29.791198",
    "receivedTime": "2017-09-30T03:42:31.634552",
    "identifier": "1234-abcd-efg0-9876",
    "response": {
    "status": "SUCCESS"
    }
    }

    Example CNM Error Response:

    {
    "provider": "PODAAC_SWOT",
    "collection": "SWOT_Prod_l2:1",
    "processCompleteTime": "2017-09-30T03:45:29.791198",
    "submissionTime": "2017-09-30T03:42:29.791198",
    "receivedTime": "2017-09-30T03:42:31.634552",
    "identifier": "1234-abcd-efg0-9876",
    "response": {
    "status": "FAILURE",
    "errorCode": "PROCESSING_ERROR",
    "errorMessage": "File [cumulus-dev-a4d38f59-5e57-590c-a2be-58640db02d91/prod_20170926T11:30:36/production_file.nc] did not match gve checksum value."
    }
    }

    Note the CnmResponse state defined in the .tf workflow definition above configures $.exception to be passed to the CnmResponse Lambda keyed under config.WorkflowException. This is required for the CnmResponse code to deliver a failure response.

    To test the failure scenario, send a record missing the product.name key.
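    For reference, such a record might look like the following (a hedged sketch adapted from the example input payload above with product.name removed; all values are illustrative):

    {
    "identifier ": "testIdentifier123456",
    "product": {
    "files": [
    {
    "checksumType": "md5",
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "checksum": "bogus_checksum_value",
    "uri": "s3://some_bucket/cumulus-test-data/pdrs/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "type": "data",
    "size": 12345678
    }
    ],
    "dataVersion": "006"
    },
    "version": "123456",
    "collection": "MOD09GQ",
    "provider": "TestProvider",
    "submissionTime": "2017-09-30T03:42:29.791198"
    }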


    Verify results

    Check for successful execution on the dashboard

    Following the successful execution of this workflow, you should expect to see the workflow complete successfully on the dashboard:

    Screenshot of a successful CNM workflow appearing on the executions page of the Cumulus dashboard

    Check the test granule has been delivered to S3 staging

    The test granule identified in the Kinesis record should be moved to the deployment's private staging area.
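    You can spot-check this from the AWS CLI (a sketch; replace the placeholders with your deployment's private bucket and deployment name, matching the staging path described above):

    aws s3 ls s3://<deployment-private-bucket>/file-staging/<deployment-name>/ --recursive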

    Check for Kinesis records

    A SUCCESS notification should be present on the response-endpoint Kinesis stream.

You should be able to validate that the notification and response streams have the expected records with the following steps (the AWS CLI Kinesis Basic Stream Operations documentation is useful to review before proceeding):

    Get a shard iterator (substituting your stream name as appropriate):

    aws kinesis get-shard-iterator \
    --shard-id shardId-000000000000 \
    --shard-iterator-type LATEST \
    --stream-name NOTIFICATION_OR_RESPONSE_STREAM_NAME

which should return an output similar to:

    {
    "ShardIterator": "VeryLongString=="
    }
• Re-trigger the workflow by using the put-record command from above.
    • As the workflow completes, use the output from the get-shard-iterator command to request data from the stream:
    aws kinesis get-records --shard-iterator SHARD_ITERATOR_VALUE

    This should result in output similar to:

    {
    "Records": [
    {
    "SequenceNumber": "49586720336541656798369548102057798835250389930873978882",
    "ApproximateArrivalTimestamp": 1532664689.128,
    "Data": "eyJpZGVudGlmaWVyICI6InRlc3RJZGVudGlmaWVyMTIzNDU2IiwidmVyc2lvbiI6IjAwNiIsImNvbGxlY3Rpb24iOiJNT0QwOUdRIiwicHJvdmlkZXIiOiJUZXN0UHJvdmlkZXIiLCJwcm9kdWN0U2l6ZSI6MTkwODYzNS4wLCJyZXNwb25zZSI6eyJzdGF0dXMiOiJTVUNDRVNTIn0sInByb2Nlc3NDb21wbGV0ZVRpbWUiOiIyMDE4LTA3LTI3VDA0OjExOjI4LjkxOSJ9",
    "PartitionKey": "1"
    },
    {
    "SequenceNumber": "49586720336541656798369548102059007761070005796999266306",
    "ApproximateArrivalTimestamp": 1532664707.149,
    "Data": "eyJpZGVudGlmaWVyICI6InRlc3RJZGVudGlmaWVyMTIzNDU2IiwidmVyc2lvbiI6IjAwNiIsImNvbGxlY3Rpb24iOiJNT0QwOUdRIiwicHJvdmlkZXIiOiJUZXN0UHJvdmlkZXIiLCJwcm9kdWN0U2l6ZSI6MTkwODYzNS4wLCJyZXNwb25zZSI6eyJzdGF0dXMiOiJTVUNDRVNTIn0sInByb2Nlc3NDb21wbGV0ZVRpbWUiOiIyMDE4LTA3LTI3VDA0OjExOjQ2Ljk1OCJ9",
    "PartitionKey": "1"
    }
    ],
    "NextShardIterator": "AAAAAAAAAAFo9SkF8RzVYIEmIsTN+1PYuyRRdlj4Gmy3dBzsLEBxLo4OU+2Xj1AFYr8DVBodtAiXbs3KD7tGkOFsilD9R5tA+5w9SkGJZ+DRRXWWCywh+yDPVE0KtzeI0andAXDh9yTvs7fLfHH6R4MN9Gutb82k3lD8ugFUCeBVo0xwJULVqFZEFh3KXWruo6KOG79cz2EF7vFApx+skanQPveIMz/80V72KQvb6XNmg6WBhdjqAA==",
    "MillisBehindLatest": 0
    }

Note that the data encoding is not human readable and would need to be parsed/converted to be interpretable. There are many options to build a Kinesis consumer, such as the KCL.
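    For a quick manual check, the Data field can be decoded from the command line (a minimal sketch; it assumes jq is installed and that the decoded payload is plain JSON):

    aws kinesis get-records --shard-iterator SHARD_ITERATOR_VALUE \
      | jq -r '.Records[].Data' \
      | while read -r data; do
          # base64 --decode is GNU coreutils syntax; on macOS use base64 -D
          echo "$data" | base64 --decode
          echo
        done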

    For purposes of validating the workflow, it may be simpler to locate the workflow in the Step Function Management Console and assert the expected output is similar to the below examples.

    Successful CNM Response Object Example:

    {
    "cnmResponse": {
    "provider": "TestProvider",
    "collection": "MOD09GQ",
    "version": "123456",
    "processCompleteTime": "2017-09-30T03:45:29.791198",
    "submissionTime": "2017-09-30T03:42:29.791198",
    "receivedTime": "2017-09-30T03:42:31.634552",
    "identifier ": "testIdentifier123456",
    "response": {
    "status": "SUCCESS"
    }
    }
    }

    Kinesis Record Error Handling

    messageConsumer

    The default Kinesis stream processing in the Cumulus system is configured for record error tolerance.

    When the messageConsumer fails to process a record, the failure is captured and the record is published to the kinesisFallback SNS Topic. The kinesisFallback SNS topic broadcasts the record and a subscribed copy of the messageConsumer Lambda named kinesisFallback consumes these failures.

At this point, the normal Lambda asynchronous invocation retry behavior will attempt to process the record 3 more times. After this, if the record still cannot be successfully processed, it is written to a dead letter queue. Cumulus' dead letter queue is an SQS queue named kinesisFailure. Operators can use this queue to inspect failed records.

This system ensures that when messageConsumer fails to process a record and trigger a workflow, the record is retried 3 times. This retry behavior improves system reliability in case of an external service failure outside of Cumulus' control.

The Kinesis error handling system - the kinesisFallback SNS topic, messageConsumer Lambda, and kinesisFailure SQS queue - comes with the API package and does not need to be configured by the operator.

To examine records that could not be processed at any step, look at the dead letter queue {{prefix}}-kinesisFailure in the Simple Queue Service (SQS) console. Select your queue, and under the Queue Actions tab, choose View/Delete Messages. Start polling for messages and you will see records that failed to process through the messageConsumer.
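    The same queue can also be polled from the AWS CLI (a sketch; substitute your deployment prefix):

    QUEUE_URL=$(aws sqs get-queue-url \
      --queue-name {{prefix}}-kinesisFailure \
      --query QueueUrl --output text)

    aws sqs receive-message \
      --queue-url "$QUEUE_URL" \
      --max-number-of-messages 10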

    Note, these are only records that occurred when processing records from Kinesis streams. Workflow failures are handled differently.

    Kinesis Stream logging

    Notification Stream messages

    Cumulus includes two Lambdas (KinesisInboundEventLogger and KinesisOutboundEventLogger) that utilize the same code to take a Kinesis record event as input, deserialize the data field and output the modified event to the logs.

    When a kinesis rule is created, in addition to the messageConsumer event mapping, an event mapping is created to trigger KinesisInboundEventLogger to record a log of the inbound record, to allow for analysis in case of unexpected failure.

    Response Stream messages

    Cumulus also supports this feature for all outbound messages. To take advantage of this feature, you will need to set an event mapping on the KinesisOutboundEventLogger Lambda that targets your response-endpoint. You can do this in the Lambda management page for KinesisOutboundEventLogger. Add a Kinesis trigger, and configure it to target the cnmResponseStream for your workflow:

    Screenshot of the AWS console showing configuration for Kinesis stream trigger on KinesisOutboundEventLogger Lambda

    Once this is done, all records sent to the response-endpoint will also be logged in CloudWatch. For more on configuring Lambdas to trigger on Kinesis events, please see creating an event source mapping.
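    If you prefer the AWS CLI to the Lambda console, the equivalent event source mapping can be created roughly as follows (a sketch; the function name, stream ARN, and batch settings shown are assumptions to adjust for your deployment):

    aws lambda create-event-source-mapping \
      --function-name <prefix>-KinesisOutboundEventLogger \
      --event-source-arn arn:aws:kinesis:us-east-1:123456789012:stream/<your-response-stream-name> \
      --starting-position LATEST \
      --batch-size 10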

    Error Handling in Workflows

    See this documentation on configuring your workflow to handle transient lambda errors.

    Example state machine definition:

    {
    "Comment": "Tests Workflow from Kinesis Stream",
    "StartAt": "TranslateMessage",
    "States": {
    "TranslateMessage": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.cnm}",
    "destination": "{$.meta.cnm}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_to_cma_task.arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": ["States.ALL"],
    "ResultPath": "$.exception",
    "Next": "CnmResponseFail"
    }
    ],
    "Next": "SyncGranule"
    },
    "SyncGranule": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "Path": "$.payload",
    "TargetPath": "$.payload"
    },
    "task_config": {
    "provider": "{$.meta.provider}",
    "buckets": "{$.meta.buckets}",
    "collection": "{$.meta.collection}",
    "downloadBucket": "{$.meta.buckets.private.name}",
    "stack": "{$.meta.stack}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.granules}",
    "destination": "{$.meta.input_granules}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${module.cumulus.sync_granule_task.task_arn}",
    "Retry": [
    {
    "ErrorEquals": ["States.ALL"],
    "IntervalSeconds": 10,
    "MaxAttempts": 3
    }
    ],
    "Catch": [
    {
    "ErrorEquals": ["States.ALL"],
    "ResultPath": "$.exception",
    "Next": "CnmResponseFail"
    }
    ],
    "Next": "CnmResponse"
    },
    "CnmResponse": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "OriginalCNM": "{$.meta.cnm}",
    "CNMResponseStream": "{$.meta.cnmResponseStream}",
    "region": "us-east-1",
    "WorkflowException": "{$.exception}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$}",
    "destination": "{$.meta.cnmResponse}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_response_task.arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": ["States.ALL"],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "Next": "WorkflowSucceeded"
    },
    "CnmResponseFail": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "OriginalCNM": "{$.meta.cnm}",
    "CNMResponseStream": "{$.meta.cnmResponseStream}",
    "region": "us-east-1",
    "WorkflowException": "{$.exception}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$}",
    "destination": "{$.meta.cnmResponse}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_response_task.arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": ["States.ALL"],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "Next": "WorkflowFailed"
    },
    "WorkflowSucceeded": {
    "Type": "Succeed"
    },
    "WorkflowFailed": {
    "Type": "Fail",
    "Cause": "Workflow failed"
    }
    }
    }

    The above results in a workflow which is visualized in the diagram below:

    Screenshot of a visualization of an AWS Step Function workflow definition with branching logic for failures

    Summary

    Error handling should (mostly) be the domain of workflow configuration.

    Version: v14.1.0

    HelloWorld Workflow

    Example task meant to be a sanity check/introduction to the Cumulus workflows.

    Pre-Deployment Configuration

    Workflow Configuration

    A workflow definition can be found in the template repository hello_world_workflow module.

    {
    "Comment": "Returns Hello World",
    "StartAt": "HelloWorld",
    "States": {
    "HelloWorld": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "buckets": "{$.meta.buckets}",
    "provider": "{$.meta.provider}",
    "collection": "{$.meta.collection}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${module.cumulus.hello_world_task.task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "End": true
    }
    }
    }

    Workflow error-handling can be configured as discussed in the Error-Handling cookbook.

    Task Configuration

    The HelloWorld task is provided for you as part of the cumulus terraform module, no configuration is needed.

    If you want to manually deploy your own version of this Lambda for testing, you can copy the Lambda resource definition located in the Cumulus source code at cumulus/tf-modules/ingest/hello-world-task.tf. The Lambda source code is located in the Cumulus source code at 'cumulus/tasks/hello-world'.
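    A minimal Terraform sketch for such a manually deployed copy, modeled on the Lambda examples elsewhere in these docs, might look like the following (the function name, filename, handler, and runtime are assumptions; the zip would be built from cumulus/tasks/hello-world):

    resource "aws_lambda_function" "my_hello_world" {
    function_name    = "${var.prefix}-MyHelloWorld"
    filename         = "./hello-world-task.zip"
    source_code_hash = filebase64sha256("./hello-world-task.zip")
    handler          = "index.handler"
    runtime          = "nodejs14.x"
    role             = module.cumulus.lambda_processing_role_arn
    }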

    Execution

    We will focus on using the Cumulus dashboard to schedule the execution of a HelloWorld workflow.

    Our goal here is to create a rule through the Cumulus dashboard that will define the scheduling and execution of our HelloWorld workflow. Let's navigate to the Rules page and click Add a rule.

    {
    "collection": { # collection values can be configured and found on the Collections page
    "name": "${collection_name}",
    "version": "${collection_version}"
    },
    "name": "helloworld_rule",
    "provider": "${provider}", # found on the Providers page
    "rule": {
    "type": "onetime"
    },
    "state": "ENABLED",
    "workflow": "HelloWorldWorkflow" # This can be found on the Workflows page
    }

    Screenshot of AWS Step Function execution graph for the HelloWorld workflow Executed workflow as seen in AWS Console

    Output/Results

    The Executions page presents a list of all executions, their status (running, failed, or completed), to which workflow the execution belongs, along with other information. The rule defined in the previous section should start an execution of its own accord, and the status of that execution can be tracked here.

    To get some deeper information on the execution, click on the value in the Name column of your execution of interest. This should bring up a visual representation of the workflow similar to that shown above, execution details, and a list of events.

    Summary

    Setting up the HelloWorld workflow on the Cumulus dashboard is the tip of the iceberg, so to speak. The task and step-function need to be configured before Cumulus deployment. A compatible collection and provider must be configured and applied to the rule. Finally, workflow execution status can be viewed via the workflows tab on the dashboard.

    Version: v14.1.0

    Ingest Notification in Workflows

On deployment, an SQS queue and three SNS topics (one each for executions, granules, and PDRs) are created and used for handling notification messages related to the workflow.

    The ingest notification reporting SQS queue is populated via a Cloudwatch rule for any Step Function execution state transitions. The sfEventSqsToDbRecords Lambda consumes this queue. The queue and Lambda are included in the cumulus module and the Cloudwatch rule in the workflow module and are included by default in a Cumulus deployment.

    The sfEventSqsToDbRecords Lambda function reads from the sfEventSqsToDbRecordsInputQueue queue and updates the RDS database records for granules, executions, and PDRs. When the records are updated, messages are posted to the three SNS topics. This Lambda is invoked both when the workflow starts and when it reaches a terminal state (completion or failure).

    Diagram of architecture for reporting workflow ingest notifications from AWS Step Functions

    Sending SQS messages to report status

    Publishing granule/PDR reports directly to the SQS queue

If you have a non-Cumulus workflow or process ingesting data and would like to update the status of your granules or PDRs, you can publish directly to the reporting SQS queue. Publishing messages to this queue will result in those messages being stored as granule/PDR records in the Cumulus database and the status of those granules/PDRs being visible on the Cumulus dashboard. The queue does have certain expectations of the message format: it expects a Cumulus Message nested within a Cloudwatch Step Function Event object.

Posting directly to the queue will require knowing the queue URL. Assuming that you are using the cumulus module for your deployment, you can get the queue URL and topic ARNs by adding the following to outputs.tf for your Terraform deployment, as in our example deployment:

    output "stepfunction_event_reporter_queue_url" {
    value = module.cumulus.stepfunction_event_reporter_queue_url
    }

    output "report_executions_sns_topic_arn" {
    value = module.cumulus.report_executions_sns_topic_arn
    }
    output "report_granules_sns_topic_arn" {
value = module.cumulus.report_granules_sns_topic_arn
    }
    output "report_pdrs_sns_topic_arn" {
    value = module.cumulus.report_pdrs_sns_topic_arn
    }

Then, when you run terraform apply, you should see the topic ARNs printed to your console:

    Outputs:
    ...
    stepfunction_event_reporter_queue_url = https://sqs.us-east-1.amazonaws.com/xxxxxxxxx/<prefix>-sfEventSqsToDbRecordsInputQueue
    report_executions_sns_topic_arn = arn:aws:sns:us-east-1:xxxxxxxxx:<prefix>-report-executions-topic
report_granules_sns_topic_arn = arn:aws:sns:us-east-1:xxxxxxxxx:<prefix>-report-granules-topic
    report_pdrs_sns_topic_arn = arn:aws:sns:us-east-1:xxxxxxxxx:<prefix>-report-pdrs-topic

Once you have the queue URL, you can use the AWS SDK for your language of choice to publish messages to the queue. The expected format of these messages is that of a Cloudwatch Step Function event containing a Cumulus message. For SUCCEEDED events, the Cumulus message is expected to be in detail.output. For all other event statuses, a Cumulus Message is expected in detail.input. The Cumulus Message populating these fields MUST be a JSON string, not an object. Messages that do not conform to the schemas will fail to be created as records.
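    As a concrete illustration, a SUCCEEDED-style message could be posted with the AWS CLI along these lines (a hedged sketch only; the event fields shown are a minimal assumption, and the Cumulus message embedded in detail.output must conform to the published schemas):

    # A minimal, hypothetical Step Function event wrapping a Cumulus message.
    # Note that detail.output is a JSON string, not an object.
    cat > report-message.json <<'EOF'
    {
      "source": "aws.states",
      "detail-type": "Step Functions Execution Status Change",
      "detail": {
        "status": "SUCCEEDED",
        "output": "{\"cumulus_meta\":{},\"meta\":{},\"payload\":{}}"
      }
    }
    EOF

    aws sqs send-message \
      --queue-url "<stepfunction_event_reporter_queue_url>" \
      --message-body file://report-message.json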

    If you are not seeing records persist to the database or show up in the Cumulus dashboard, you can investigate the Cloudwatch logs of the SQS consumer Lambda:

    • /aws/lambda/<prefix>-sfEventSqsToDbRecords
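    For example, with AWS CLI v2 you can tail this log group directly (substituting your prefix):

    aws logs tail /aws/lambda/<prefix>-sfEventSqsToDbRecords --follow --since 1h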

    In a workflow

    As described above, ingest notifications will automatically be published to the SNS topics on workflow start and completion/failure, so you should not include a workflow step to publish the initial or final status of your workflows.

    However, if you want to report your ingest status at any point during a workflow execution, you can add a workflow step using the SfSqsReport Lambda. In the following example from cumulus-tf/parse_pdr_workflow.tf, the ParsePdr workflow is configured to use the SfSqsReport Lambda, primarily to update the PDR ingestion status.

    Note: ${sf_sqs_report_task_arn} is an interpolated value referring to a Terraform resource. See the example deployment code for the ParsePdr workflow.

      "PdrStatusReport": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "cumulus_message": {
    "input": "{$}"
    }
    }
    }
    },
    "ResultPath": null,
    "Type": "Task",
    "Resource": "${sf_sqs_report_task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "Next": "WaitForSomeTime"
    },

    Subscribing additional listeners to SNS topics

    Additional listeners to SNS topics can be configured in a .tf file for your Cumulus deployment. Shown below is configuration that subscribes an additional Lambda function (test_lambda) to receive messages from the report_executions SNS topic. To subscribe to the report_granules or report_pdrs SNS topics instead, simply replace report_executions in the code block below with either of those values.

    resource "aws_lambda_function" "test_lambda" {
    function_name = "${var.prefix}-testLambda"
    filename = "./testLambda.zip"
    source_code_hash = filebase64sha256("./testLambda.zip")
    handler = "index.handler"
    role = module.cumulus.lambda_processing_role_arn
    runtime = "nodejs10.x"
    }

    resource "aws_sns_topic_subscription" "test_lambda" {
    topic_arn = module.cumulus.report_executions_sns_topic_arn
    protocol = "lambda"
    endpoint = aws_lambda_function.test_lambda.arn
    }

    resource "aws_lambda_permission" "test_lambda" {
    action = "lambda:InvokeFunction"
    function_name = aws_lambda_function.test_lambda.arn
    principal = "sns.amazonaws.com"
    source_arn = module.cumulus.report_executions_sns_topic_arn
    }

    SNS message format

Subscribers to the SNS topics can expect to find the published message in the SNS event at Records[0].Sns.Message. The message will be a JSON stringified version of the ingest notification record for an execution or a PDR. For granules, the message will be a JSON stringified object with the ingest notification record in the record property and the event type as the event property.

    The ingest notification record of the execution, granule, or PDR should conform to the data model schema for the given record type.
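    For illustration, once Records[0].Sns.Message is parsed, a granule message might look roughly like this (a hedged sketch; only a couple of record fields are shown, the event value depends on the record operation, and the actual record contains the full granule notification record per the schema):

    {
      "event": "Update",
      "record": {
        "granuleId": "MOD09GQ.A2016358.h13v04.006.2016360104606",
        "status": "completed"
      }
    }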

    Summary

    Workflows can be configured to send SQS messages at any point using the sf-sqs-report task.

    Additional listeners can be easily configured to trigger when messages are sent to the SNS topics.

    Version: v14.1.0

    Queue PostToCmr

In this document, we walk through handling CMR errors in workflows by queueing PostToCmr. We assume that the user already has an ingest workflow set up.

    Overview

    The general concept is that the last task of the ingest workflow will be QueueWorkflow, which queues the publish workflow. The publish workflow contains the PostToCmr task and if a CMR error occurs during PostToCmr, the publish workflow will add itself back onto the queue so that it can be executed when CMR is back online. This is achieved by leveraging the QueueWorkflow task again in the publish workflow. The following diagram demonstrates this queueing process.

    Diagram of workflow queueing

    Ingest Workflow

The last step should be the QueuePublishWorkflow step. It should be configured with a queueUrl and workflow. In this case, the queueUrl is a throttled queue. Any queueUrl can be specified here, which is useful if you would like to use a lower priority queue. The workflow is the unprefixed workflow name that you would like to queue (e.g. PublishWorkflow).

      "QueuePublishWorkflowStep": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "workflow": "{$.meta.workflow}",
    "queueUrl": "${start_sf_queue_url}",
    "provider": "{$.meta.provider}",
    "collection": "{$.meta.collection}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${queue_workflow_task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "End": true
    },

    Publish Workflow

    Configure the Catch section of your PostToCmr task to proceed to QueueWorkflow if a CMRInternalError is caught. Any other error will cause the workflow to fail.

      "Catch": [
    {
    "ErrorEquals": [
    "CMRInternalError"
    ],
    "Next": "RequeueWorkflow"
    },
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "Next": "WorkflowFailed",
    "ResultPath": "$.exception"
    }
    ],

    Then, configure the QueueWorkflow task similarly to its configuration in the ingest workflow. This time, pass the current publish workflow to the task config. This allows for the publish workflow to be requeued when there is a CMR error.

    {
    "RequeueWorkflow": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "buckets": "{$.meta.buckets}",
    "distribution_endpoint": "{$.meta.distribution_endpoint}",
    "workflow": "PublishGranuleQueue",
    "queueUrl": "${start_sf_queue_url}",
    "provider": "{$.meta.provider}",
    "collection": "{$.meta.collection}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${queue_workflow_task_arn}",
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "Next": "WorkflowFailed",
    "ResultPath": "$.exception"
    }
    ],
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "End": true
    }
    }
    Version: v14.1.0

    Run Step Function Tasks in AWS Lambda or Docker

    Overview

    AWS Step Function Tasks can run tasks on AWS Lambda or on AWS Elastic Container Service (ECS) as a Docker container.

Lambda provides a serverless architecture and is the best option for minimizing cost and server management. ECS provides the fullest extent of AWS EC2 resources via the flexibility to execute arbitrary code on any AWS EC2 instance type.

    When to use Lambda

    You should use AWS Lambda whenever all of the following are true:

• The task runs on one of the supported Lambda Runtimes. At the time of this writing, supported runtimes include versions of Python, Java, Ruby, Node.js, Go, and .NET.
• The Lambda package is less than 50 MB in size when zipped.
    • The task consumes less than each of the following resources:
      • 3008 MB memory allocation
      • 512 MB disk storage (must be written to /tmp)
      • 15 minutes of execution time

    See this page for a complete and up-to-date list of AWS Lambda limits.

    If your task requires more than any of these resources or an unsupported runtime, creating a Docker image which can be run on ECS is the way to go. Cumulus supports running any lambda package (and its configured layers) as a Docker container with cumulus-ecs-task.

    Step Function Activities and cumulus-ecs-task

    Step Function Activities enable a state machine task to "publish" an activity task which can be picked up by any activity worker. Activity workers can run pretty much anywhere, but Cumulus workflows support the cumulus-ecs-task activity worker. The cumulus-ecs-task worker runs as a Docker container on the Cumulus ECS cluster.

    The cumulus-ecs-task container takes an AWS Lambda Amazon Resource Name (ARN) as an argument (see --lambdaArn in the example below). This ARN argument is defined at deployment time. The cumulus-ecs-task worker polls for new Step Function Activity Tasks. When a Step Function executes, the worker (container) picks up the activity task and runs the code contained in the lambda package defined on deployment.

    Example: Replacing AWS Lambda with a Docker container run on ECS

    This example will use an already-defined workflow from the cumulus module that includes the QueueGranules task in its configuration.

    The following example is an excerpt from the Discover Granules workflow containing the step definition for the QueueGranules step:

    Note: ${ingest_granule_workflow_name} and ${queue_granules_task_arn} are interpolated values that refer to Terraform resources. See the example deployment code for the Discover Granules workflow.

      "QueueGranules": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "provider": "{$.meta.provider}",
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "granuleIngestWorkflow": "${ingest_granule_workflow_name}",
    "queueUrl": "{$.meta.queues.startSF}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${queue_granules_task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "End": true
    },

    Given it has been discovered this task can no longer run in AWS Lambda, you can instead run it on the Cumulus ECS cluster by adding the following resources to your terraform deployment (by either adding a new .tf file or updating an existing one):

    • A aws_sfn_activity resource:
    resource "aws_sfn_activity" "queue_granules" {
    name = "${var.prefix}-QueueGranules"
    }
• An instance of the cumulus_ecs_service module (found on the Cumulus releases page) configured to provide the QueueGranules task:

    module "queue_granules_service" {
    source = "https://github.com/nasa/cumulus/releases/download/{version}/terraform-aws-cumulus-ecs-service.zip"

    prefix = var.prefix
    name = "QueueGranules"

    cluster_arn = module.cumulus.ecs_cluster_arn
    desired_count = 1
    image = "cumuluss/cumulus-ecs-task:1.7.0"

    cpu = 400
    memory_reservation = 700

    environment = {
    AWS_DEFAULT_REGION = data.aws_region.current.name
    }
    command = [
    "cumulus-ecs-task",
    "--activityArn",
    aws_sfn_activity.queue_granules.id,
    "--lambdaArn",
    module.cumulus.queue_granules_task.task_arn,
    "--lastModified",
    module.cumulus.queue_granules_task.last_modified_date
    ]
    alarms = {
    MemoryUtilizationHigh = {
    comparison_operator = "GreaterThanThreshold"
    evaluation_periods = 1
    metric_name = "MemoryUtilization"
    statistic = "SampleCount"
    threshold = 75
    }
    }
    }

Please note: If you have updated the code for the Lambda specified by --lambdaArn, you will have to manually restart the tasks in your ECS service before invocation of the Step Function activity will use the updated Lambda code. One way to do this is shown in the sketch below, after the workflow update.

• An updated Discover Granules workflow to utilize the new resource, with the Resource key in the QueueGranules step updated to:

"Resource": "${aws_sfn_activity.queue_granules.id}"

If you then run this workflow in place of the DiscoverGranules workflow, the QueueGranules step would run as an ECS task instead of a Lambda function.
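    One way to restart the tasks mentioned in the note above (a sketch; the cluster and service names are assumptions based on the module configuration shown) is to force a new deployment of the ECS service:

    aws ecs update-service \
      --cluster <your-ecs-cluster-name> \
      --service <queue-granules-ecs-service-name> \
      --force-new-deployment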

    Final note

    Step Function Activities and AWS Lambda are not the only ways to run tasks in an AWS Step Function. Learn more about other service integrations, including direct ECS integration via the AWS Service Integrations page.

    Science Investigator-led Processing Systems (SIPS)

    ...we're just going to create a onetime throw-away rule that will be easy to test with. This rule will kick off the DiscoverAndQueuePdrs workflow, which is the beginning of a Cumulus SIPS workflow:

    Screenshot of a Cumulus rule configuration

    Note: A list of configured workflows exists under the "Workflows" in the navigation bar on the Cumulus dashboard. Additionally, one can find a list of executions and their respective status in the "Executions" tab in the navigation bar.

    DiscoverAndQueuePdrs Workflow

    This workflow will discover PDRs and queue them to be processed. Duplicate PDRs will be dealt with according to the configured duplicate handling setting in the collection. The lambdas below are included in the cumulus terraform module for use in your workflows:

    1. DiscoverPdrs - source
    2. QueuePdrs - source

    Screenshot of execution graph for discover and queue PDRs workflow in the AWS Step Functions console

    An example workflow module configuration can be viewed in the Cumulus source for the discover_and_queue_pdrs_workflow.

Please note: To use this example workflow module as a template for a new workflow in your deployment, the source key for the workflow module would need to point to a release of the cumulus-workflow (terraform-aws-cumulus-workflow.zip) module on our release page, as all of the provided Cumulus workflows are internally self-referential.

    ParsePdr Workflow

    The ParsePdr workflow will parse a PDR, queue the specified granules (duplicates are handled according to the duplicate handling setting) and periodically check the status of those queued granules. This workflow will not succeed until all the granules included in the PDR are successfully ingested. If one of those fails, the ParsePdr workflow will fail. NOTE that ParsePdr may spin up multiple IngestGranule workflows in parallel, depending on the granules included in the PDR.

    The lambdas below are included in the cumulus terraform module for use in your workflows:

    1. ParsePdr - source
    2. QueueGranules - source
    3. CheckStatus - source

    Screenshot of execution graph for SIPS Parse PDR workflow in AWS Step Functions console

    An example workflow module configuration can be viewed in the Cumulus source for the parse_pdr_workflow.

Please note: To use this example workflow module as a template for a new workflow in your deployment, the source key for the workflow module would need to point to a release of the cumulus-workflow (terraform-aws-cumulus-workflow.zip) module on our release page, as all of the provided Cumulus workflows are internally self-referential.

    IngestGranule Workflow

    The IngestGranule workflow processes and ingests a granule and posts the granule metadata to CMR.

    The lambdas below are included in the cumulus terraform module for use in your workflows:

    1. SyncGranule - source.
    2. CmrStep - source

    Additionally this workflow requires a processing step you must provide. The ProcessingStep step in the workflow picture below is an example of a custom processing step.

    Note: Using the CmrStep is not required and can be left out of the processing trajectory if desired (for example, in testing situations).

    Screenshot of execution graph for SIPS IngestGranule workflow in AWS Step Functions console

    An example workflow module configuration can be viewed in the Cumulus source for the ingest_and_publish_granule_workflow.

Please note: To use this example workflow module as a template for a new workflow in your deployment, the source key for the workflow module would need to point to a release of the cumulus-workflow (terraform-aws-cumulus-workflow.zip) module on our release page, as all of the provided Cumulus workflows are internally self-referential.

    Summary

    In this cookbook we went over setting up a collection, rule, and provider for a SIPS workflow. Once we had the setup completed, we looked over the Cumulus workflows that participate in parsing PDRs, ingesting and processing granules, and updating CMR.

    Version: v14.1.0

    Throttling queued executions

In this entry, we will walk through how to create an SQS queue for scheduling executions, which will be used to limit those executions to a maximum concurrency, and we will see how to configure our Cumulus workflows/rules to use this queue.

    We will also review the architecture of this feature and highlight some implementation notes.

    Limiting the number of executions that can be running from a given queue is useful for controlling the cloud resource usage of workflows that may be lower priority, such as granule reingestion or reprocessing campaigns. It could also be useful for preventing workflows from exceeding known resource limits, such as a maximum number of open connections to a data provider.

    Implementing the queue

    Create and deploy the queue

    Add a new queue

    In a .tf file for your Cumulus deployment, add a new SQS queue:

    resource "aws_sqs_queue" "background_job_queue" {
    name = "${var.prefix}-backgroundJobQueue"
    receive_wait_time_seconds = 20
    visibility_timeout_seconds = 60
    }

    Set maximum executions for the queue

    Define the throttled_queues variable for the cumulus module in your Cumulus deployment to specify the maximum concurrent executions for the queue.

    module "cumulus" {
    # ... other variables

    throttled_queues = [{
    url = aws_sqs_queue.background_job_queue.id,
    execution_limit = 5
    }]
    }

    Setup consumer for the queue

    Add the sqs2sfThrottle Lambda as the consumer for the queue and add a Cloudwatch event rule/target to read from the queue on a scheduled basis.

    Please note: You must use the sqs2sfThrottle Lambda as the consumer for any queue with a queue execution limit or else the execution throttling will not work correctly. Additionally, please allow at least 60 seconds after creation before using the queue while associated infrastructure and triggers are set up and made ready.

    aws_sqs_queue.background_job_queue.id refers to the queue resource defined above.

    resource "aws_cloudwatch_event_rule" "background_job_queue_watcher" {
    schedule_expression = "rate(1 minute)"
    }

    resource "aws_cloudwatch_event_target" "background_job_queue_watcher" {
    rule = aws_cloudwatch_event_rule.background_job_queue_watcher.name
    arn = module.cumulus.sqs2sfThrottle_lambda_function_arn
    input = jsonencode({
    messageLimit = 500
    queueUrl = aws_sqs_queue.background_job_queue.id
    timeLimit = 60
    })
    }

    resource "aws_lambda_permission" "background_job_queue_watcher" {
    action = "lambda:InvokeFunction"
    function_name = module.cumulus.sqs2sfThrottle_lambda_function_arn
    principal = "events.amazonaws.com"
    source_arn = aws_cloudwatch_event_rule.background_job_queue_watcher.arn
    }

    Re-deploy your Cumulus application

Follow the instructions to re-deploy your Cumulus application. After you have re-deployed, your workflow template will be updated to include information about the queue (the output below is partial output from an expected workflow template):

    {
    "cumulus_meta": {
    "queueExecutionLimits": {
    "<backgroundJobQueue_SQS_URL>": 5
    }
    }
    }

    Integrate your queue with workflows and/or rules

    Integrate queue with queuing steps in workflows

    For any workflows using QueueGranules or QueuePdrs that you want to use your new queue, update the Cumulus configuration of those steps in your workflows.

    As seen in this partial configuration for a QueueGranules step, update the queueUrl to reference the new throttled queue:

    Note: ${ingest_granule_workflow_name} is an interpolated value referring to a Terraform resource. See the example deployment code for the DiscoverGranules workflow.

    {
    "QueueGranules": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "queueUrl": "${aws_sqs_queue.background_job_queue.id}",
    "provider": "{$.meta.provider}",
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "granuleIngestWorkflow": "${ingest_granule_workflow_name}"
    }
    }
    }
    }
    }

    Similarly, for a QueuePdrs step:

    Note: ${parse_pdr_workflow_name} is an interpolated value referring to a Terraform resource. See the example deployment code for the DiscoverPdrs workflow.

    {
    "QueuePdrs": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "queueUrl": "${aws_sqs_queue.background_job_queue.id}",
    "provider": "{$.meta.provider}",
    "collection": "{$.meta.collection}",
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "parsePdrWorkflow": "${parse_pdr_workflow_name}"
    }
    }
    }
    }
    }

    After making these changes, re-deploy your Cumulus application for the execution throttling to take effect on workflow executions queued by these workflows.

    Create/update a rule to use your new queue

    Create or update a rule definition to include a queueUrl property that refers to your new queue:

    {
    "name": "s3_provider_rule",
    "workflow": "DiscoverAndQueuePdrs",
    "provider": "s3_provider",
    "collection": {
    "name": "MOD09GQ",
    "version": "006"
    },
    "rule": {
    "type": "onetime"
    },
    "state": "ENABLED",
    "queueUrl": "<backgroundJobQueue_SQS_URL>" // configure rule to use your queue URL
    }

    After creating/updating the rule, any subsequent invocations of the rule should respect the maximum number of executions when starting workflows from the queue.

    Architecture

    Architecture diagram showing how executions started from a queue are throttled to a maximum concurrent limit

    Execution throttling based on the queue works by manually keeping a count (semaphore) of how many executions are running for the queue at a time. The key operation that prevents the number of executions from exceeding the maximum for the queue is that before starting new executions, the sqs2sfThrottle Lambda attempts to increment the semaphore and responds as follows:

    • If the increment operation is successful, then the count was not at the maximum and an execution is started
    • If the increment operation fails, then the count was already at the maximum so no execution is started

    Final notes

    Limiting the number of concurrent executions for work scheduled via a queue has several consequences worth noting:

    • The number of executions that are running for a given queue will be limited to the maximum for that queue regardless of which workflow(s) are started.
    • If you use the same queue to schedule executions across multiple workflows/rules, then the limit on the total number of executions running concurrently will be applied to all of the executions scheduled across all of those workflows/rules.
    • If you are scheduling the same workflow both via a queue with a maxExecutions value and a queue without a maxExecutions value, only the executions scheduled via the queue with the maxExecutions value will be limited to the maximum.
    Tracking Ancillary Files

    The UMM-G column reflects the RelatedURL's Type derived from the CNM type, whereas the ECHO10 column shows how the CNM type affects the destination element.

    CNM Type  | UMM-G RelatedUrl.Type                                            | ECHO10 Location
    ancillary | 'VIEW RELATED INFORMATION'                                       | OnlineResource
    data      | 'GET DATA' (HTTPS URL) or 'GET DATA VIA DIRECT ACCESS' (S3 URI)  | OnlineAccessURL
    browse    | 'GET RELATED VISUALIZATION'                                      | AssociatedBrowseImage
    linkage   | 'EXTENDED METADATA'                                              | OnlineResource
    metadata  | 'EXTENDED METADATA'                                              | OnlineResource
    qa        | 'EXTENDED METADATA'                                              | OnlineResource

    Common Use Cases

    This section briefly documents some common use cases and the recommended configuration for the file. The examples shown here are for the DiscoverGranules use case, which allows configuration at the Cumulus dashboard level. The other two cases covered in the ancillary metadata documentation require configuration at the provider notification level (either CNM message or PDR) and are not covered here.

    Configuring browse imagery:

    {
    "bucket": "public",
    "regex": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\_[\\d]{1}.jpg$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_1.jpg",
    "type": "browse"
    }

    Configuring a documentation entry:

    {
    "bucket": "protected",
    "regex": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\_README.pdf$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_README.pdf",
    "type": "metadata"
    }

    Configuring other associated files (use types metadata or qa as appropriate):

    {
    "bucket": "protected",
    "regex": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\_QA.txt$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_QA.txt",
    "type": "qa"
    }
    - + \ No newline at end of file diff --git a/docs/v14.1.0/deployment/api-gateway-logging/index.html b/docs/v14.1.0/deployment/api-gateway-logging/index.html index 179c41dcf4c..097c418655b 100644 --- a/docs/v14.1.0/deployment/api-gateway-logging/index.html +++ b/docs/v14.1.0/deployment/api-gateway-logging/index.html @@ -5,13 +5,13 @@ API Gateway Logging | Cumulus Documentation - +
    Version: v14.1.0

    API Gateway Logging

    Enabling API Gateway Logging

    In order to enable distribution API Access and execution logging, configure the TEA deployment by setting log_api_gateway_to_cloudwatch on the thin_egress_app module:

    log_api_gateway_to_cloudwatch = true

    This enables the distribution API to send its logs to the default CloudWatch location: API-Gateway-Execution-Logs_<RESTAPI_ID>/<STAGE>
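    You can confirm the log group exists once the distribution API has served requests:

    aws logs describe-log-groups \
      --log-group-name-prefix API-Gateway-Execution-Logs_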

    Configure Permissions for API Gateway Logging to CloudWatch

    Instructions: Enabling Account Level Logging from API Gateway to CloudWatch

This is a one-time operation that must be performed on each AWS account to allow API Gateway to push logs to CloudWatch.

    1. Create a policy document

      The AmazonAPIGatewayPushToCloudWatchLogs managed policy, with an ARN of arn:aws:iam::aws:policy/service-role/AmazonAPIGatewayPushToCloudWatchLogs, has all the required permissions to enable API Gateway logging to CloudWatch. To grant these permissions to your account, first create an IAM role with apigateway.amazonaws.com as its trusted entity.

      Save this snippet as apigateway-policy.json.

      {
      "Version": "2012-10-17",
      "Statement": [
      {
      "Sid": "",
      "Effect": "Allow",
      "Principal": {
      "Service": "apigateway.amazonaws.com"
      },
      "Action": "sts:AssumeRole"
      }
      ]
      }
    2. Create an account role to act as ApiGateway and write to CloudWatchLogs

      NASA users in NGAP: be sure to use your account's permission boundary.

      aws iam create-role \
      --role-name ApiGatewayToCloudWatchLogs \
      [--permissions-boundary <permissionBoundaryArn>] \
      --assume-role-policy-document file://apigateway-policy.json

      Note the ARN of the returned role for the last step.

    3. Attach correct permissions to role

      Next attach the AmazonAPIGatewayPushToCloudWatchLogs policy to the IAM role.

      aws iam attach-role-policy \
      --role-name ApiGatewayToCloudWatchLogs \
      --policy-arn "arn:aws:iam::aws:policy/service-role/AmazonAPIGatewayPushToCloudWatchLogs"
    4. Update Account API Gateway settings with correct permissions

      Finally, set the IAM role ARN on the cloudWatchRoleArn property on your API Gateway Account settings.

      aws apigateway update-account \
      --patch-operations op='replace',path='/cloudwatchRoleArn',value='<ApiGatewayToCloudWatchLogs ARN>'
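      You can verify the setting took effect by reading the account configuration back:

      aws apigateway get-account --query cloudwatchRoleArn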

    Configure API Gateway CloudWatch Logs Delivery

    For details about configuring the API Gateway CloudWatch Logs delivery, see Configure Cloudwatch Logs Delivery.

    Choosing and Configuring Your RDS Database

    ...using this module to create your RDS cluster, you can configure the autoscaling timeout action, the cluster minimum and maximum capacity, and more, as seen in the supported variables for the module.

    Unfortunately, Terraform currently doesn't allow specifying the autoscaling timeout itself, so that value will have to be manually configured in the AWS console or CLI.

    Version: v14.1.0

    Configure Cloudwatch Logs Delivery

    As an optional configuration step, it is possible to deliver CloudWatch logs to a cross-account shared AWS::Logs::Destination. An operator does this by configuring the cumulus module for your deployment as shown below. The value of the log_destination_arn variable is the ARN of a writeable log destination.

    The value can be either an AWS::Logs::Destination or a Kinesis Stream ARN to which your account can write.

    log_destination_arn           = arn:aws:[kinesis|logs]:us-east-1:123456789012:[streamName|destination:logDestinationName]

    Logs Sent

    By default, the following logs will be sent to the destination when one is given.

    • Ingest logs
    • Async Operation logs
    • Thin Egress App API Gateway logs (if configured)

    Additional Logs

    If additional logs are needed, you can configure additional_log_groups_to_elk with the Cloudwatch log groups you want to send to the destination. additional_log_groups_to_elk is a map with the key as a descriptor and the value with the Cloudwatch log group name.

    additional_log_groups_to_elk = {
    "HelloWorldTask" = "/aws/lambda/cumulus-example-HelloWorld"
    "MyCustomTask" = "my-custom-task-log-group"
    }
    - + \ No newline at end of file diff --git a/docs/v14.1.0/deployment/components/index.html b/docs/v14.1.0/deployment/components/index.html index 6d82fbd3d60..5bda43753f0 100644 --- a/docs/v14.1.0/deployment/components/index.html +++ b/docs/v14.1.0/deployment/components/index.html @@ -5,7 +5,7 @@ Component-based Cumulus Deployment | Cumulus Documentation - + @@ -39,7 +39,7 @@ Terraform at the same time.

    With remote state, Terraform writes the state data to a remote data store, which can then be shared between all members of a team.

    The recommended approach for handling remote state with Cumulus is to use the S3 backend. This backend stores state in S3 and uses a DynamoDB table for locking.

    See the deployment documentation for a walk-through of creating resources for your remote state using an S3 backend.

    Version: v14.1.0

    Creating an S3 Bucket

    Buckets can be created on the command line with AWS CLI or via the web interface on the AWS console.

    When creating a protected bucket (a bucket containing data which will be served through the distribution API), make sure to enable S3 server access logging. See S3 Server Access Logging for more details.

    Command Line

Using the AWS Command Line Tool's s3api create-bucket subcommand:

    $ aws s3api create-bucket \
    --bucket foobar-internal \
    --region us-west-2 \
    --create-bucket-configuration LocationConstraint=us-west-2
    {
    "Location": "/foobar-internal"
    }

    ⚠️ Note: The region and create-bucket-configuration arguments are only necessary if you are creating a bucket outside of the us-east-1 region.

Please note that security settings and other bucket options can be set via the options listed in the s3api documentation.

    Repeat the above step for each bucket to be created.
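    If you have several buckets to create, a simple shell loop works (a sketch; the bucket names here are placeholders for your deployment's bucket names):

    for bucket in foobar-internal foobar-private foobar-protected foobar-public; do
      aws s3api create-bucket \
        --bucket "$bucket" \
        --region us-west-2 \
        --create-bucket-configuration LocationConstraint=us-west-2
    done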

    Web Interface

    If you prefer to use the AWS web interface instead of the command line, see AWS "Creating a Bucket" documentation.

    Version: v14.1.0

    Using the Cumulus Distribution API

    The Cumulus Distribution API is a set of endpoints that can be used to enable AWS Cognito authentication when downloading data from S3.

    Configuring a Cumulus Distribution Deployment

    The Cumulus Distribution API is included in the main Cumulus repo. It is available as part of the terraform-aws-cumulus.zip archive in the latest release.

    These steps assume you're using the Cumulus Deployment Template but they can also be used for custom deployments.

    To configure a deployment to use Cumulus Distribution:

    1. Remove or comment the "Thin Egress App Settings" in the Cumulus Template Deploy and enable the "Cumulus Distribution Settings".
    2. Delete or comment the contents of thin_egress_app.tf and the corresponding Thin Egress App outputs in outputs.tf. These are not necessary for a Cumulus Distribution deployment.
    3. Uncomment the Cumulus Distribution outputs in outputs.tf.
    4. Rename cumulus-template-deploy/cumulus-tf/cumulus_distribution.tf.example to cumulus-template-deploy/cumulus-tf/cumulus_distribution.tf.

    Cognito Application and User Credentials

    The major prerequisite for using the Cumulus Distribution API is to set up Cognito. If operating within NGAP, this should already be done for you. If operating outside of NGAP, you must set up Cognito yourself, which is beyond the scope of this documentation.

    Given that Cognito is set up, in order to be able to download granule files via the Cumulus Distribution API, you must obtain Cognito user credentials, because any attempt to download such files (that will be, or have been, published to the CMR via your Cumulus deployment) will result in a prompt for you to supply Cognito user credentials. To obtain your own user credentials, talk to your product owner or scrum master for additional information. They should either know how to create the credentials, know who can create them for the team, or be the liaison to the Cognito team.

    Further, whoever helps to obtain your Cognito user credentials should also be able to supply you with the values for the following new variables that you must add to your cumulus-tf/terraform.tfvars file:

    • csdap_host_url: The URL of the Cognito service to which your Cumulus deployment will make Cognito API calls during a distribution (download) event
    • csdap_client_id: The client ID for the Cumulus application registered within the Cognito service
    • csdap_client_password: The client password for the Cumulus application registered within the Cognito service

    Although you might have to wait a bit for your Cognito user credentials, the remaining instructions do not depend upon having them, so you may continue with these instructions while waiting for your credentials.

    Cumulus Distribution URL

    Your Cumulus Distribution URL is used by Cumulus to generate download URLs as part of the granule metadata generated and published to the CMR. For example, a granule download URL will be of the form <distribution url>/<protected bucket>/<key> (or <distribution url>/path/to/file, if using a custom bucket map, as explained further below).

    By default, the value of your distribution URL is the URL of your private Cumulus Distribution API Gateway (the API Gateway named <prefix>-distribution, once you deploy the Cumulus Distribution module). Therefore, by default, the generated download URLs are private, and thus inaccessible directly, but there are 2 ways to address this issue (both of which are detailed below): (a) use tunneling (typically in development) or (b) put a CloudFront URL in front of your API Gateway (typically in production, and perhaps UAT and/or SIT).

    In either case, you must first know the default URL (i.e., the URL for the private Cumulus Distribution API Gateway). In order to obtain this default URL, you must first deploy your cumulus-tf module with the new Cumulus Distribution module, and once your initial deployment is complete, one of the Terraform outputs will be cumulus_distribution_api_uri, which is the URL for the private API Gateway.
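For example, once that initial deployment finishes, you can read the output again at any time from your cumulus-tf directory:

cd cumulus-tf
terraform output cumulus_distribution_api_uri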

    You may override this default URL by adding a cumulus_distribution_url variable to your cumulus-tf/terraform.tfvars file and setting it to one of the following values (both are explained below):

    1. The default URL, but with a port added to it, in order to allow you to configure tunneling (typically only in development)
    2. A CloudFront URL placed in front of your Cumulus Distribution API Gateway (typically only for Production, but perhaps also for a UAT or SIT environment)

    The following subsections explain these approaches in turn.

    Using Your Cumulus Distribution API Gateway URL as Your Distribution URL

    Since your Cumulus Distribution API Gateway URL is private, the only way you can use it to confirm that your integration with Cognito is working is by using tunneling (again, generally for development). Here is an outline of the required steps with details provided further below:

    1. Create/import a key pair into your AWS EC2 service (if you haven't already done so)
    2. Add a reference to the name of the key pair to your Terraform variables (we'll set the key_name Terraform variable)
    3. Choose an open local port on your machine (we'll use 9000 in the following example)
    4. Add a reference to the value of your cumulus_distribution_api_uri (mentioned earlier), including your chosen port (we'll set the cumulus_distribution_url Terraform variable)
    5. Redeploy Cumulus
    6. Add an entry to your /etc/hosts file
    7. Add a redirect URI to Cognito via the Cognito API
    8. Install the Session Manager Plugin for the AWS CLI (if you haven't already done so; assuming you have already installed the AWS CLI)
    9. Add a sample file to S3 to test downloading via Cognito

    To create or import an existing key pair, you can use the AWS CLI (see AWS ec2 import-key-pair), or the AWS Console (see Amazon EC2 key pairs and Linux instances).

    Once your key pair is added to AWS, add the following to your cumulus-tf/terraform.tfvars file:

    key_name = "<name>"
    cumulus_distribution_url = "https://<id>.execute-api.<region>.amazonaws.com:<port>/dev/"

    where:

    • <name> is the name of the key pair you just added to AWS
    • <id> and <region> are the corresponding parts from your cumulus_distribution_api_uri output variable
    • <port> is your open local port of choice (9000 is typically a good choice)

    Once you save your variable changes, redeploy your cumulus-tf module.

    While your deployment runs, add the following entry to your /etc/hosts file, replacing <hostname> with the host name of the cumulus_distribution_url Terraform variable you just added above:

    localhost <hostname>

    Next, you'll need to use the Cognito API to add the value of your cumulus_distribution_url Terraform variable as a Cognito redirect URI. To do so, use your favorite tool (e.g., curl, wget, Postman, etc.) to make a BasicAuth request to the Cognito API, using the following details:

    • method: POST
    • base URL: the value of your csdap_host_url Terraform variable
    • path: /authclient/updateRedirectUri
    • username: the value of your csdap_client_id Terraform variable
    • password: the value of your csdap_client_password Terraform variable
    • headers: Content-Type='application/x-www-form-urlencoded'
    • body: redirect_uri=<cumulus_distribution_url>/login

    where <cumulus_distribution_url> is the value of your cumulus_distribution_url Terraform variable. Note the /login path at the end of the redirect_uri value.

    For reference, see the Cognito Authentication Service API.
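As a curl sketch of the request described above (every angle-bracketed value is a placeholder for the corresponding Terraform variable value):

curl -X POST \
  -u "<csdap_client_id>:<csdap_client_password>" \
  -H "Content-Type: application/x-www-form-urlencoded" \
  -d "redirect_uri=<cumulus_distribution_url>/login" \
  "<csdap_host_url>/authclient/updateRedirectUri"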

    Next, install the Session Manager Plugin for the AWS CLI. If running on macOS, and you use Homebrew, you can install it simply as follows:

    brew install --cask session-manager-plugin --no-quarantine

    As your final setup step, add a sample file to one of the protected buckets listed in your buckets Terraform variable in your cumulus-tf/terraform.tfvars file. The key for the S3 object doesn't matter, nor does it matter what file you use. All that matters is that the file is an S3 object in one of your protected buckets, because Cognito is triggered when attempting to download from one of those buckets.
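For example (the bucket name, key, and local file are placeholders):

aws s3 cp ./sample.txt s3://<protected-bucket>/test-data/sample.txt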

    At this point, you should be ready to open a tunnel and attempt to download your sample file via your browser, summarized as follows:

    1. Determine your EC2 instance ID
    2. Connect to the NASA VPN
    3. Start an AWS SSM session
    4. Open an SSH tunnel
    5. Use a browser to navigate to your file

To determine your EC2 instance ID for your Cumulus deployment, run the following command, where <profile> is the name of the appropriate AWS profile to use and <prefix> is the value of your prefix Terraform variable:

    aws --profile <profile> ec2 describe-instances --filters Name=tag:Deployment,Values=<prefix> Name=instance-state-name,Values=running --query "Reservations[0].Instances[].InstanceId" --output text

    ⚠️ IMPORTANT: Before proceeding with the remaining steps, make sure you're connected to the NASA VPN.

    Use the value output from the command above in place of <id> in the following command, which will start an SSM session:

    aws ssm start-session --target <id> --document-name AWS-StartPortForwardingSession --parameters portNumber=22,localPortNumber=6000

    If successful, you should see output similar to the following:

    Starting session with SessionId: NGAPShApplicationDeveloper-***
    Port 6000 opened for sessionId NGAPShApplicationDeveloper-***.
    Waiting for connections...

    In another terminal window, open a tunnel with port forwarding using your chosen port from above (e.g., 9000):

    ssh -4 -p 6000 -N -L <port>:<api-gateway-host>:443 ec2-user@127.0.0.1

    where:

    • <port> is the open local port you chose earlier (e.g., 9000)
    • <api-gateway-host> is the hostname of your private API Gateway (i.e., the host portion of the URL you used as the value of your cumulus_distribution_url Terraform variable above)

    Finally, use your chosen browser to navigate to <cumulus_distribution_url>/<bucket>/<key>, where <bucket> and <key> reference the sample file you added to S3 above.

    If all goes well, you should be prompted for your Cognito username and password. If you have obtained your Cognito user credentials, enter them, and then next enter a code generated by the authenticator application you registered at the time you completed your Cognito registration process. Once your credentials and auth code are correctly supplied, after a few moments, the download process will begin.

    Once you're finished testing, clean up as follows:

    1. Stop your SSH tunnel (enter Ctrl-C)
    2. Stop your AWS SSM session (enter Ctrl-C)
    3. If you like, disconnect from the NASA VPN

    While this is a relatively lengthy process, things are much easier when using CloudFront, such as in Production (OPS), SIT, or UAT, as explained next.

    Using a CloudFront URL as Your Distribution URL

    In Production (OPS), and perhaps in other environments, such as UAT and SIT, you'll need to provide a publicly accessible URL for users to use for downloading (distributing) granule files.

    This is generally done by placing a CloudFront URL in front of your private Cumulus Distribution API Gateway. In order to create such a CloudFront URL, contact the person who helped you obtain your Cognito credentials, and request a CloudFront URL with the following details:

    • The private, backing URL, which is the value of your cumulus_distribution_api_uri Terraform output value
    • A request to add the AWS account's VPC to the whitelist

    Once this request is completed, and you obtain the new CloudFront URL, override your default distribution URL with the CloudFront URL by adding the following to your cumulus-tf/terraform.tfvars file:

    cumulus_distribution_url = <cloudfront_url>

    In addition, add a Cognito redirect URI, as detailed in the previous section. Note that in this case, the value you'll use for redirect_uri is <cloudfront_url>/login since the value of your cumulus_distribution_url is now your CloudFront URL.

    At this point, it is assumed that you have added the appropriate values for this environment for the variables described at the top (csdap_host_url, csdap_client_id, and csdap_client_password).

    Redeploy Cumulus with your new/updated Terraform variables.

    As your final setup step, add a sample file to one of the protected buckets listed in your buckets Terraform variable in your cumulus-tf/terraform.tfvars file. The key for the S3 object doesn't matter, nor does it matter what file you use. All that matters is that the file is an S3 object in one of your protected buckets, because Cognito is triggered when attempting to download from one of those buckets.

    Finally, use your chosen browser to navigate to <cumulus_distribution_url>/<bucket>/<key>, where <bucket> and <key> reference the sample file you added to S3.

    If all goes well, you should be prompted for your Cognito username and password. If you have obtained your Cognito user credentials, enter them, followed by entering a code generated by the authenticator application you registered at the time you completed your Cognito registration process. Once your credentials and auth code are correctly supplied, after a few moments, the download process will begin.

    S3 Bucket Mapping

    An S3 Bucket map allows users to abstract bucket names. If the bucket names change at any point, only the bucket map would need to be updated instead of every S3 link.

    The Cumulus Distribution API uses a bucket_map.yaml or bucket_map.yaml.tmpl file to determine which buckets to serve. See the examples.

    The default Cumulus module generates a file at s3://${system_bucket}/distribution_bucket_map.json.

    The configuration file is a simple JSON mapping of the form:

    {
    "daac-public-data-bucket": "/path/to/this/kind/of/data"
    }

    ⚠️ Note: Cumulus only supports a one-to-one mapping of bucket -> Cumulus Distribution path for 'distribution' buckets. Also, the bucket map must include mappings for all of the protected and public buckets specified in the buckets variable in cumulus-tf/terraform.tfvars, otherwise Cumulus may not be able to determine the correct distribution URL for ingested files and you may encounter errors.
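If you want to confirm what the deployed bucket map contains, one option (a sketch; substitute your own system bucket name) is to stream the generated file to stdout with the AWS CLI:

aws s3 cp s3://<system_bucket>/distribution_bucket_map.json -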

    Switching from the Thin Egress App to Cumulus Distribution

    If you have previously deployed the Thin Egress App (TEA) as your distribution app, you can switch to Cumulus Distribution by following the steps above.

    Note, however, that the cumulus_distribution module will generate a bucket map cache and overwrite any existing bucket map caches created by TEA.

    There will also be downtime while your API gateway is updated.

    - + \ No newline at end of file diff --git a/docs/v14.1.0/deployment/index.html b/docs/v14.1.0/deployment/index.html index c9c834365a2..4ea96f504bb 100644 --- a/docs/v14.1.0/deployment/index.html +++ b/docs/v14.1.0/deployment/index.html @@ -5,7 +5,7 @@ How to Deploy Cumulus | Cumulus Documentation - + @@ -19,7 +19,7 @@ for deployment's EC2 instances and allows you to connect to them via SSH/SSM.

    Consider the sizing of your Cumulus instance when configuring your variables.

    Choose a Distribution API

    Cumulus can be configured to use either the Thin Egress App (TEA) or the Cumulus Distribution API. The default selection is the Thin Egress App if you're using the Deployment Template.

    ⚠️ IMPORTANT: If you already have a deployment using the TEA distribution and want to switch to Cumulus Distribution, there will be an API Gateway change. This means that there will be downtime while you update your CloudFront endpoint to use the new API gateway.

    Configure the Thin Egress App

    TEA can be used for Cumulus distribution and is the default selection. It allows authentication using Earthdata Login. Follow the steps in the TEA documentation to configure distribution in your cumulus-tf deployment.

    Configure the Cumulus Distribution API (Optional)

    If you would prefer to use the Cumulus Distribution API, which supports AWS Cognito authentication, follow these steps to configure distribution in your cumulus-tf deployment.

    Initialize Terraform

Follow the above instructions to initialize Terraform using terraform init [1].

    Deploy

    Run terraform apply to deploy the resources. Type yes when prompted to confirm that you want to create the resources. Assuming the operation is successful, you should see output like this:

    Apply complete! Resources: 292 added, 0 changed, 0 destroyed.

    Outputs:

    archive_api_redirect_uri = https://abc123.execute-api.us-east-1.amazonaws.com/dev/token
    archive_api_uri = https://abc123.execute-api.us-east-1.amazonaws.com/dev/
    distribution_redirect_uri = https://abc123.execute-api.us-east-1.amazonaws.com/DEV/login
    distribution_url = https://abc123.execute-api.us-east-1.amazonaws.com/DEV/

    ⚠️ Note: Be sure to copy the redirect URLs because you will need them to update your Earthdata application.

    Update Earthdata Application

Add the two redirect URLs to your Earthdata Login application by doing the following:

    1. Login to URS
    2. Under My Applications -> Application Administration -> use the edit icon of your application
3. Under Manage -> redirect URIs, add the Archive API URL returned from the stack deployment
  • e.g. archive_api_redirect_uri = https://<czbbkscuy6>.execute-api.us-east-1.amazonaws.com/dev/token
4. Also add the Distribution URL
  • e.g. distribution_redirect_uri = https://<kido2r7kji>.execute-api.us-east-1.amazonaws.com/dev/login [2]
5. You may delete the placeholder URL you used to create the application

If you've lost track of the needed redirect URIs, they can be located in the API Gateway console. Once there, select <prefix>-archive and/or <prefix>-thin-egress-app-EgressGateway, open the Dashboard, and use the base URL at the top of the page that is accompanied by the text Invoke this API at:. Make sure to append /token for the archive URL and /login for the thin egress app URL.
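If you prefer the command line, a sketch of the same lookup with the AWS CLI (the API name filter and the dev stage are assumptions based on the examples above):

# Look up the API ID for the archive API; adapt the name filter for the
# thin egress app gateway as needed
aws apigateway get-rest-apis \
  --query "items[?name=='<prefix>-archive'].id" \
  --output text

# Then compose the redirect URI as:
#   https://<id>.execute-api.<region>.amazonaws.com/dev/token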


    Deploy Cumulus Dashboard

    Dashboard Requirements

    Please note that the requirements are similar to the Cumulus stack deployment requirements. The installation instructions below include a step that will install/use the required node version referenced in the .nvmrc file in the Dashboard repository.

    Prepare AWS

    Create S3 Bucket for Dashboard:

    • Create it, e.g. <prefix>-dashboard. Use the command line or console as you did when preparing AWS configuration.
    • Configure the bucket to host a website:
      • AWS S3 console: Select <prefix>-dashboard bucket then, "Properties" -> "Static Website Hosting", point to index.html
      • CLI: aws s3 website s3://<prefix>-dashboard --index-document index.html
• The bucket's URL will be http://<prefix>-dashboard.s3-website-<region>.amazonaws.com or you can find it on the AWS console via "Properties" -> "Static website hosting" -> "Endpoint"
    • Ensure the bucket's access permissions allow your deployment user access to write to the bucket

    Install Dashboard

    To install the Cumulus Dashboard, clone the repository into the root deploy directory and install dependencies with npm install:

      git clone https://github.com/nasa/cumulus-dashboard
    cd cumulus-dashboard
    nvm use
    npm install

    If you do not have the correct version of node installed, replace nvm use with nvm install $(cat .nvmrc) in the above example.

    Dashboard Versioning

    By default, the master branch will be used for Dashboard deployments. The master branch of the repository contains the most recent stable release of the Cumulus Dashboard.

    If you want to test unreleased changes to the Dashboard, use the develop branch.

    Each release/version of the Dashboard will have a tag in the Dashboard repo. Release/version numbers will use semantic versioning (major/minor/patch).

    To checkout and install a specific version of the Dashboard:

      git fetch --tags
    git checkout <version-number> # e.g. v1.2.0
    nvm use
    npm install

    If you do not have the correct version of node installed, replace nvm use with nvm install $(cat .nvmrc) in the above example.

    Building the Dashboard

    ⚠️ Note: These environment variables are available during the build: APIROOT, DAAC_NAME, STAGE, HIDE_PDR. Any of these can be set on the command line to override the values contained in config.js when running the build below.

To configure your dashboard for deployment, set the APIROOT environment variable to your app's API root. [3]

    Build your dashboard from the Cumulus Dashboard repository root directory, cumulus-dashboard:

      APIROOT=<your_api_root> npm run build

    Dashboard Deployment

    Deploy your dashboard to S3 bucket from the cumulus-dashboard directory:

    Using AWS CLI:

      aws s3 sync dist s3://<prefix>-dashboard --acl public-read

    From the S3 Console:

• Open the <prefix>-dashboard bucket and click 'Upload'. Add the contents of the 'dist' subdirectory to the upload, then select 'Next'. On the permissions window, allow the public to view. Select 'Upload'.

You should be able to visit the Dashboard website at http://<prefix>-dashboard.s3-website-<region>.amazonaws.com or find the URL via <prefix>-dashboard -> "Properties" -> "Static website hosting" -> "Endpoint", and log in with a user that you had previously configured for access.
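As a quick smoke test before logging in (prefix and region are placeholders), you can check that the website endpoint responds:

curl -I http://<prefix>-dashboard.s3-website-<region>.amazonaws.com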


    Cumulus Instance Sizing

The Cumulus deployment's default sizing for Elasticsearch instances, EC2 instances, and Autoscaling Groups is small and designed for testing and cost savings. The default settings are likely not suitable for production workloads. Sizing is highly individual and dependent on expected load and archive size.

    Please be cognizant of costs as any change in size will affect your AWS bill. AWS provides a pricing calculator for estimating costs.

    Elasticsearch

    The mappings file contains all of the data types that will be indexed into Elasticsearch. Elasticsearch sizing is tied to your archive size, including your collections, granules, and workflow executions that will be stored.

    AWS provides documentation on calculating and configuring for sizing.

In addition to size, you'll want to consider the number of nodes, which determines how the system reacts in the event of a failure.

    Configuration can be done in the data persistence module in elasticsearch_config and the cumulus module in es_index_shards.

    If you make changes to your Elasticsearch configuration you will need to reindex for those changes to take effect.

    EC2 Instances and Autoscaling Groups

EC2 instances are used for long-running operations (e.g., generating a reconciliation report) and long-running workflow tasks. Configuration for your ECS cluster is achieved via Cumulus deployment variables.

    When configuring your ECS cluster consider:

    • The EC2 instance type and EBS volume size needed to accommodate your workloads. Configured as ecs_cluster_instance_type and ecs_cluster_instance_docker_volume_size.
    • The minimum and desired number of instances on hand to accommodate your workloads. Configured as ecs_cluster_min_size and ecs_cluster_desired_size.
    • The maximum number of instances you will need and are willing to pay for to accommodate your heaviest workloads. Configured as ecs_cluster_max_size.
    • Your autoscaling parameters: ecs_cluster_scale_in_adjustment_percent, ecs_cluster_scale_out_adjustment_percent, ecs_cluster_scale_in_threshold_percent, and ecs_cluster_scale_out_threshold_percent.

    Footnotes


    1. Run terraform init if:

      • This is the first time deploying the module
      • You have added any additional child modules, including Cumulus components
      • You have updated the source for any of the child modules

2. To add another redirect URI to your application: on the Earthdata home page, select "My Applications". Scroll down to "Application Administration" and use the edit icon for your application. Then go to Manage -> Redirect URIs.

3. The API root can be found in a number of ways. The easiest is to note it in the output of the app deployment step, but you can also find it from the AWS console -> Amazon API Gateway -> APIs -> <prefix>-archive -> Dashboard, reading the URL at the top after "Invoke this API at".
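A command-line alternative (a sketch; assumes your cumulus-tf state is initialized locally) is to read the Terraform output directly:

cd cumulus-tf
terraform output archive_api_uri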

    - + \ No newline at end of file diff --git a/docs/v14.1.0/deployment/postgres_database_deployment/index.html b/docs/v14.1.0/deployment/postgres_database_deployment/index.html index e5d5ae0ea04..7fde4f4788f 100644 --- a/docs/v14.1.0/deployment/postgres_database_deployment/index.html +++ b/docs/v14.1.0/deployment/postgres_database_deployment/index.html @@ -5,7 +5,7 @@ PostgreSQL Database Deployment | Cumulus Documentation - + @@ -16,7 +16,7 @@ cumulus-rds-tf that will deploy an AWS RDS Aurora Serverless PostgreSQL 11 compatible database cluster, and optionally provision a single deployment database with credentialed secrets for use with Cumulus.

We have provided an example Terraform deployment using this module in the Cumulus template-deploy repository on GitHub.

    Use of this example involves:

    • Creating/configuring a Terraform module directory
    • Using Terraform to deploy resources to AWS

    Requirements

    Configuration/installation of this module requires the following:

    • Terraform
    • git
    • A VPC configured for use with Cumulus Core. This should match the subnets you provide when Deploying Cumulus to allow Core's lambdas to properly access the database.
    • At least two subnets across multiple AZs. These should match the subnets you provide as configuration when Deploying Cumulus, and should be within the same VPC.

    Needed Git Repositories

    Assumptions

    OS/Environment

The instructions in this module require Linux/macOS. While deployment via Windows is possible, it is unsupported.

    Terraform

    This document assumes knowledge of Terraform. If you are not comfortable working with Terraform, the following links should bring you up to speed:

    For Cumulus specific instructions on installation of Terraform, refer to the main Cumulus Installation Documentation

    Aurora/RDS

This document also assumes some basic familiarity with PostgreSQL databases and Amazon Aurora/RDS. If you're unfamiliar, consider perusing the AWS docs and the Aurora Serverless V1 docs.

    Prepare Deployment Repository

If you are already working with an existing repository that has a configured rds-cluster-tf deployment for the version of Cumulus you intend to deploy or update, or you just need to configure this module for your repository, skip to Prepare AWS Configuration.

    Clone the cumulus-template-deploy repo and name appropriately for your organization:

      git clone https://github.com/nasa/cumulus-template-deploy <repository-name>

    We will return to configuring this repo and using it for deployment below.

    Optional: Create a New Repository

Create a new repository on GitHub so that you can add your workflows and other modules to source control:

      git remote set-url origin https://github.com/<org>/<repository-name>
    git push origin master

    You can then add/commit changes as needed.

    ⚠️ Note: If you are pushing your deployment code to a git repo, make sure to add terraform.tf and terraform.tfvars to .gitignore, as these files will contain sensitive data related to your AWS account.


    Prepare AWS Configuration

To deploy this module, make sure that you have completed the following steps from the Cumulus deployment instructions, in a similar fashion for this module:

    --

    Configure and Deploy the Module

When configuring this module, please keep in mind that, unlike the Cumulus deployment, this module should be deployed once to create the database cluster and re-deployed thereafter only to make configuration changes or upgrades.

    Tip: This module does not need to be re-deployed for each Core update.

    These steps should be executed in the rds-cluster-tf directory of the template deploy repo that you previously cloned. Run the following to copy the example files:

    cd rds-cluster-tf/
    cp terraform.tf.example terraform.tf
    cp terraform.tfvars.example terraform.tfvars

    In terraform.tf, configure the remote state settings by substituting the appropriate values for:

    • bucket
    • dynamodb_table
    • PREFIX (whatever prefix you've chosen for your deployment)

    Fill in the appropriate values in terraform.tfvars. See the rds-cluster-tf module variable definitions for more detail on all of the configuration options. A few notable configuration options are documented in the next section.

    Configuration Options

    • deletion_protection -- defaults to true. Set it to false if you want to be able to delete your cluster with a terraform destroy without manually updating the cluster.
    • db_admin_username -- cluster database administration username. Defaults to postgres.
    • db_admin_password -- required variable that specifies the admin user password for the cluster. To randomize this on each deployment, consider using a random_string resource as input.
    • region -- defaults to us-east-1.
    • subnets -- requires at least 2 across different AZs. For use with Cumulus, these AZs should match the values you configure for your lambda_subnet_ids.
    • max_capacity -- the max ACUs the cluster is allowed to use. Carefully consider cost/performance concerns when setting this value.
    • min_capacity -- the minimum ACUs the cluster will scale to
    • provision_user_database -- Optional flag to allow module to provision a user database in addition to creating the cluster. Described in the next section.

    Provision User and User Database

    If you wish for the module to provision a PostgreSQL database on your new cluster and provide a secret for access in the module output, in addition to managing the cluster itself, the following configuration keys are required:

    • provision_user_database -- must be set to true. This configures the module to deploy a lambda that will create the user database, and update the provided configuration on deploy.
• permissions_boundary_arn -- the permissions boundary to use when creating the roles the provisioning lambda will need for access. In most use cases this should be the same one used for the Cumulus Core deployment.
    • rds_user_password -- the value to set the user password to.
    • prefix -- this value will be used to set a unique identifier for the ProvisionDatabase lambda, as well as name the provisioned user/database.

Once configured, the module will deploy the lambda and run it on each terraform apply, creating the configured database (if it does not exist), updating the user password (if that value has been changed), and updating the output user database secret.

    Setting provision_user_database to false after provisioning will not result in removal of the configured database, as the lambda is non-destructive as configured in this module.

    ⚠️ Note: This functionality is limited in that it will only provision a single database/user and configure a basic database, and should not be used in scenarios where more complex configuration is required.

    Initialize Terraform

    Run terraform init

    You should see a similar output:

    * provider.aws: version = "~> 2.32"

    Terraform has been successfully initialized!

    Deploy

    Run terraform apply to deploy the resources.

    ⚠️ Caution: If re-applying this module, variables (e.g. engine_version, snapshot_identifier ) that force a recreation of the database cluster may result in data loss if deletion protection is disabled. Examine the changeset carefully for resources that will be re-created/destroyed before applying.

    Review the changeset, and assuming it looks correct, type yes when prompted to confirm that you want to create all of the resources.

    Assuming the operation is successful, you should see output similar to the following (this example omits the creation of a user database/lambdas/security groups):

    terraform apply

    An execution plan has been generated and is shown below.
    Resource actions are indicated with the following symbols:
    + create

    Terraform will perform the following actions:

    # module.rds_cluster.aws_db_subnet_group.default will be created
    + resource "aws_db_subnet_group" "default" {
    + arn = (known after apply)
    + description = "Managed by Terraform"
    + id = (known after apply)
    + name = (known after apply)
    + name_prefix = "xxxxxxxxx"
    + subnet_ids = [
    + "subnet-xxxxxxxxx",
    + "subnet-xxxxxxxxx",
    ]
    + tags = {
    + "Deployment" = "xxxxxxxxx"
    }
    }

    # module.rds_cluster.aws_rds_cluster.cumulus will be created
    + resource "aws_rds_cluster" "cumulus" {
    + apply_immediately = true
    + arn = (known after apply)
    + availability_zones = (known after apply)
    + backup_retention_period = 1
    + cluster_identifier = "xxxxxxxxx"
    + cluster_identifier_prefix = (known after apply)
    + cluster_members = (known after apply)
    + cluster_resource_id = (known after apply)
    + copy_tags_to_snapshot = false
    + database_name = "xxxxxxxxx"
    + db_cluster_parameter_group_name = (known after apply)
    + db_subnet_group_name = (known after apply)
    + deletion_protection = true
    + enable_http_endpoint = true
    + endpoint = (known after apply)
    + engine = "aurora-postgresql"
    + engine_mode = "serverless"
    + engine_version = "10.12"
    + final_snapshot_identifier = "xxxxxxxxx"
    + hosted_zone_id = (known after apply)
    + id = (known after apply)
    + kms_key_id = (known after apply)
    + master_password = (sensitive value)
    + master_username = "xxxxxxxxx"
    + port = (known after apply)
    + preferred_backup_window = "07:00-09:00"
    + preferred_maintenance_window = (known after apply)
    + reader_endpoint = (known after apply)
    + skip_final_snapshot = false
    + storage_encrypted = (known after apply)
    + tags = {
    + "Deployment" = "xxxxxxxxx"
    }
    + vpc_security_group_ids = (known after apply)

    + scaling_configuration {
    + auto_pause = true
    + max_capacity = 4
    + min_capacity = 2
    + seconds_until_auto_pause = 300
    + timeout_action = "RollbackCapacityChange"
    }
    }

    # module.rds_cluster.aws_secretsmanager_secret.rds_login will be created
    + resource "aws_secretsmanager_secret" "rds_login" {
    + arn = (known after apply)
    + id = (known after apply)
    + name = (known after apply)
    + name_prefix = "xxxxxxxxx"
    + policy = (known after apply)
    + recovery_window_in_days = 30
    + rotation_enabled = (known after apply)
    + rotation_lambda_arn = (known after apply)
    + tags = {
    + "Deployment" = "xxxxxxxxx"
    }

    + rotation_rules {
    + automatically_after_days = (known after apply)
    }
    }

    # module.rds_cluster.aws_secretsmanager_secret_version.rds_login will be created
    + resource "aws_secretsmanager_secret_version" "rds_login" {
    + arn = (known after apply)
    + id = (known after apply)
    + secret_id = (known after apply)
    + secret_string = (sensitive value)
    + version_id = (known after apply)
    + version_stages = (known after apply)
    }

    # module.rds_cluster.aws_security_group.rds_cluster_access will be created
    + resource "aws_security_group" "rds_cluster_access" {
    + arn = (known after apply)
    + description = "Managed by Terraform"
    + egress = (known after apply)
    + id = (known after apply)
    + ingress = (known after apply)
    + name = (known after apply)
    + name_prefix = "cumulus_rds_cluster_access_ingress"
    + owner_id = (known after apply)
    + revoke_rules_on_delete = false
    + tags = {
    + "Deployment" = "xxxxxxxxx"
    }
    + vpc_id = "vpc-xxxxxxxxx"
    }

# module.rds_cluster.aws_security_group_rule.rds_security_group_allow_postgres will be created
    + resource "aws_security_group_rule" "rds_security_group_allow_postgres" {
    + from_port = 5432
    + id = (known after apply)
    + protocol = "tcp"
    + security_group_id = (known after apply)
    + self = true
    + source_security_group_id = (known after apply)
    + to_port = 5432
    + type = "ingress"
    }

    Plan: 6 to add, 0 to change, 0 to destroy.

    Do you want to perform these actions?
    Terraform will perform the actions described above.
    Only 'yes' will be accepted to approve.

    Enter a value: yes

    module.rds_cluster.aws_db_subnet_group.default: Creating...
    module.rds_cluster.aws_security_group.rds_cluster_access: Creating...
    module.rds_cluster.aws_secretsmanager_secret.rds_login: Creating...

    Then, after the resources are created:

    Apply complete! Resources: X added, 0 changed, 0 destroyed.
    Releasing state lock. This may take a few moments...

    Outputs:

    admin_db_login_secret_arn = arn:aws:secretsmanager:us-east-1:xxxxxxxxx:secret:xxxxxxxxxx20210407182709367700000002-dpmdR
    admin_db_login_secret_version = xxxxxxxxx
    rds_endpoint = xxxxxxxxx.us-east-1.rds.amazonaws.com
    security_group_id = xxxxxxxxx
    user_credentials_secret_arn = arn:aws:secretsmanager:us-east-1:xxxxx:secret:xxxxxxxxxx20210407182709367700000002-dpmpXA

    Note the output values for admin_db_login_secret_arn (and optionally user_credentials_secret_arn) as these provide the AWS Secrets Manager secrets required to access the database as the administrative user and, optionally, the user database credentials Cumulus requires as well.
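To retrieve one of these secrets later, a sketch using the AWS CLI (the secret ARN is the output value noted above):

aws secretsmanager get-secret-value \
  --secret-id <admin_db_login_secret_arn> \
  --query SecretString \
  --output text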

The content of each of these secrets is of the form:

{
  "database": "postgres",
  "dbClusterIdentifier": "clusterName",
  "engine": "postgres",
  "host": "xxx",
  "password": "defaultPassword",
  "port": 5432,
  "username": "xxx"
}
    • database -- the PostgreSQL database used by the configured user
    • dbClusterIdentifier -- the value set by the cluster_identifier variable in the terraform module
    • engine -- the Aurora/RDS database engine
    • host -- the RDS service host for the database in the form (dbClusterIdentifier)-(AWS ID string).(region).rds.amazonaws.com
    • password -- the database password
    • username -- the account username
• port -- the database connection port; this should always be 5432

    Next Steps

    The database cluster has been created/updated! From here you can continue to add additional user accounts, databases, and other database configuration.

    - + \ No newline at end of file diff --git a/docs/v14.1.0/deployment/share-s3-access-logs/index.html b/docs/v14.1.0/deployment/share-s3-access-logs/index.html index a88206fdcf1..5d765ebbf54 100644 --- a/docs/v14.1.0/deployment/share-s3-access-logs/index.html +++ b/docs/v14.1.0/deployment/share-s3-access-logs/index.html @@ -5,13 +5,13 @@ Share S3 Access Logs | Cumulus Documentation - +
    Version: v14.1.0

    Share S3 Access Logs

    It is possible through Cumulus to share S3 access logs across multiple S3 packages using the S3 replicator package.

    S3 Replicator

    The S3 Replicator is a Node.js package that contains a simple Lambda function, associated permissions, and the Terraform instructions to replicate create-object events from one S3 bucket to another.

    First ensure that you have enabled S3 Server Access Logging.

Next, configure your config.tfvars as described in the s3-replicator/README.md to correspond to your deployment. The source_bucket and source_prefix are determined by how you enabled the S3 Server Access Logging.

In order to deploy the s3-replicator with Cumulus, you will need to add the module to your Terraform main.tf definition, as in the example below:

    module "s3-replicator" {
    source = "<path to s3-replicator.zip>"
    prefix = var.prefix
    vpc_id = var.vpc_id
    subnet_ids = var.subnet_ids
    permissions_boundary = var.permissions_boundary_arn
    source_bucket = var.s3_replicator_config.source_bucket
    source_prefix = var.s3_replicator_config.source_prefix
    target_bucket = var.s3_replicator_config.target_bucket
    target_prefix = var.s3_replicator_config.target_prefix
    }

    The Terraform source package can be found on the Cumulus GitHub Release page under the asset tab terraform-aws-cumulus-s3-replicator.zip.

    ESDIS Metrics

    In the NGAP environment, the ESDIS Metrics team has set up an ELK stack to process logs from Cumulus instances. To use this system, you must deliver any S3 Server Access logs that Cumulus creates.

    Configure the S3 Replicator as described above using the target_bucket and target_prefix provided by the Metrics team.

    The Metrics team has taken care of setting up Logstash to ingest the files that get delivered to their bucket into their Elasticsearch instance.

    - + \ No newline at end of file diff --git a/docs/v14.1.0/deployment/terraform-best-practices/index.html b/docs/v14.1.0/deployment/terraform-best-practices/index.html index 49cbdcb885c..a8ed164d585 100644 --- a/docs/v14.1.0/deployment/terraform-best-practices/index.html +++ b/docs/v14.1.0/deployment/terraform-best-practices/index.html @@ -5,7 +5,7 @@ Terraform Best Practices | Cumulus Documentation - + @@ -88,7 +88,7 @@ AWS CLI command, replacing PREFIX with your deployment prefix name:

    aws resourcegroupstaggingapi get-resources \
    --query "ResourceTagMappingList[].ResourceARN" \
    --tag-filters Key=Deployment,Values=PREFIX

    Ideally, the output should be an empty list, but if it is not, then you may need to manually delete the listed resources.

    Configuring the Cumulus deployment: link Restoring a previous version: link

    - + \ No newline at end of file diff --git a/docs/v14.1.0/deployment/thin_egress_app/index.html b/docs/v14.1.0/deployment/thin_egress_app/index.html index c91b3375e15..080bff380cf 100644 --- a/docs/v14.1.0/deployment/thin_egress_app/index.html +++ b/docs/v14.1.0/deployment/thin_egress_app/index.html @@ -5,7 +5,7 @@ Using the Thin Egress App for Cumulus Distribution | Cumulus Documentation - + @@ -13,7 +13,7 @@
    Version: v14.1.0

    Using the Thin Egress App for Cumulus Distribution

    The Thin Egress App (TEA) is an app running in Lambda that allows retrieving data from S3 using temporary links and provides URS integration.

    Configuring a TEA Deployment

    TEA is deployed using Terraform modules. Refer to these instructions for guidance on how to integrate new components with your deployment.

    The cumulus-template-deploy repository cumulus-tf/main.tf contains a thin_egress_app for distribution.

The TEA module provides these instructions showing how to add it to your deployment, and the following are instructions for configuring the thin_egress_app module in your Cumulus deployment.

    Create a Secret for Signing Thin Egress App JWTs

    The Thin Egress App uses JSON Web Tokens (JWTs) internally to authenticate requests and requires a secret stored in AWS Secrets Manager containing SSH keys that are used to sign the JWTs.

    See the Thin Egress App documentation on how to create this secret with the correct values. It will be used later to set the thin_egress_jwt_secret_name variable when deploying the Cumulus module.
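As a heavily hedged sketch only: the secret generally holds an RSA key pair, but the exact JSON field names and encoding must be taken from the Thin Egress App documentation. The field names below are placeholders, not the authoritative structure:

# Generate an RSA key pair (file names are arbitrary)
ssh-keygen -t rsa -b 4096 -m PEM -f ./tea-jwt-key -N ''

# Store the keys in Secrets Manager. The JSON field names here are
# placeholders -- match the structure required by the TEA documentation.
aws secretsmanager create-secret \
  --name <prefix>-jwt-secret \
  --secret-string "{\"rsa_priv_key\": \"$(base64 < ./tea-jwt-key | tr -d '\n')\", \"rsa_pub_key\": \"$(base64 < ./tea-jwt-key.pub | tr -d '\n')\"}"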

    Bucket_map.yaml

    The Thin Egress App uses a bucket_map.yaml file to determine which buckets to serve. Documentation of the file format is available here.

    The default Cumulus module generates a file at s3://${system_bucket}/distribution_bucket_map.json.

    The configuration file is a simple JSON mapping of the form:

    {
    "daac-public-data-bucket": "/path/to/this/kind/of/data"
    }

    ⚠️ Note: Cumulus only supports a one-to-one mapping of bucket->TEA path for 'distribution' buckets.

    Optionally Configure a Custom Bucket Map

    A simple config would look something like this:

    bucket_map.yaml
MAP:
  my-protected: my-protected
  my-public: my-public

PUBLIC_BUCKETS:
  - my-public

    ⚠️ Note: Your custom bucket map must include mappings for all of the protected and public buckets specified in the buckets variable in cumulus-tf/terraform.tfvars, otherwise Cumulus may not be able to determine the correct distribution URL for ingested files and you may encounter errors.

    Optionally Configure Shared Variables

    The cumulus module deploys certain components that interact with TEA. As a result, the cumulus module requires that if you are specifying a value for the stage_name variable to the TEA module, you must use the same value for the tea_api_gateway_stage variable to the cumulus module.

    One way to keep these variable values in sync across the modules is to use Terraform local values to define values to use for the variables for both modules. This approach is shown in the Cumulus Core example deployment code.

    - + \ No newline at end of file diff --git a/docs/v14.1.0/deployment/upgrade-readme/index.html b/docs/v14.1.0/deployment/upgrade-readme/index.html index 195bc7aa411..5fdefb277ed 100644 --- a/docs/v14.1.0/deployment/upgrade-readme/index.html +++ b/docs/v14.1.0/deployment/upgrade-readme/index.html @@ -5,7 +5,7 @@ Upgrading Cumulus | Cumulus Documentation - + @@ -15,7 +15,7 @@ deployment functions correctly. Please refer to some recommended smoke tests given above, and consider additional tests appropriate for your particular deployment and environment.

    Update Cumulus Dashboard

    If there are breaking (or otherwise significant) changes to the Cumulus API, you should also upgrade your Cumulus Dashboard deployment to use the version of the Cumulus API matching the version of Cumulus to which you are migrating.

    - + \ No newline at end of file diff --git a/docs/v14.1.0/development/forked-pr/index.html b/docs/v14.1.0/development/forked-pr/index.html index 503d95f8039..7d5d3d4347a 100644 --- a/docs/v14.1.0/development/forked-pr/index.html +++ b/docs/v14.1.0/development/forked-pr/index.html @@ -5,13 +5,13 @@ Issuing PR From Forked Repos | Cumulus Documentation - +
    Version: v14.1.0

    Issuing PR From Forked Repos

    Fork the Repo

    • Fork the Cumulus repo
    • Create a new branch from the branch you'd like to contribute to
• If an issue doesn't already exist, submit one (see above)

    Create a Pull Request

    Reviewing PRs from Forked Repos

    Upon submission of a pull request, the Cumulus development team will review the code.

    Once the code passes an initial review, the team will run the CI tests against the proposed update.

    The request will then either be merged, declined, or an adjustment to the code will be requested via the issue opened with the original PR request.

PRs from forked repos cannot be merged directly to master. Cumulus reviewers must follow these steps before completing the review process:

    1. Create a new branch:

        git checkout -b from-<name-of-the-branch> master
    2. Push the new branch to GitHub

    3. Change the destination of the forked PR to the new branch that was just pushed

      Screenshot of Github interface showing how to change the base branch of a pull request

    4. After code review and approval, merge the forked PR to the new branch.

    5. Create a PR for the new branch to master.

6. If the CI tests pass, merge the new branch to master and close the issue. If the CI tests do not pass, request an amended PR from the original author, or resolve the failures as appropriate.

    - + \ No newline at end of file diff --git a/docs/v14.1.0/development/integration-tests/index.html b/docs/v14.1.0/development/integration-tests/index.html index 8219e65bfe2..82c8c916ebb 100644 --- a/docs/v14.1.0/development/integration-tests/index.html +++ b/docs/v14.1.0/development/integration-tests/index.html @@ -5,7 +5,7 @@ Integration Tests | Cumulus Documentation - + @@ -19,7 +19,7 @@ in the commit message.

    If you create a new stack and want to be able to run integration tests against it in CI, you will need to add it to bamboo/select-stack.js.

    - + \ No newline at end of file diff --git a/docs/v14.1.0/development/quality-and-coverage/index.html b/docs/v14.1.0/development/quality-and-coverage/index.html index f8afb5dfc1c..d2a3e23377e 100644 --- a/docs/v14.1.0/development/quality-and-coverage/index.html +++ b/docs/v14.1.0/development/quality-and-coverage/index.html @@ -5,7 +5,7 @@ Code Coverage and Quality | Cumulus Documentation - + @@ -23,7 +23,7 @@ here.

    To run linting on the markdown files, run npm run lint-md.

    Audit

    This project uses audit-ci to run a security audit on the package dependency tree. This must pass prior to merge. The configured rules for audit-ci can be found here.

    To execute an audit, run npm run audit.

    - + \ No newline at end of file diff --git a/docs/v14.1.0/development/release/index.html b/docs/v14.1.0/development/release/index.html index 6a815843886..ed0f6cca439 100644 --- a/docs/v14.1.0/development/release/index.html +++ b/docs/v14.1.0/development/release/index.html @@ -5,7 +5,7 @@ Versioning and Releases | Cumulus Documentation - + @@ -24,7 +24,7 @@ this is a backport and patch release on the 13.3.x series of releases. Updates that are included in the future will have a corresponding CHANGELOG entry in future releases..

    Troubleshooting

    Delete and regenerate the tag

    To delete a published tag to re-tag, follow these steps:

      git tag -d vMAJOR.MINOR.PATCH
    git push -d origin vMAJOR.MINOR.PATCH

    e.g.:
    git tag -d v9.1.0
    git push -d origin v9.1.0
    - + \ No newline at end of file diff --git a/docs/v14.1.0/docs-how-to/index.html b/docs/v14.1.0/docs-how-to/index.html index ec90a3ae981..61c67d141e3 100644 --- a/docs/v14.1.0/docs-how-to/index.html +++ b/docs/v14.1.0/docs-how-to/index.html @@ -5,7 +5,7 @@ Cumulus Documentation: How To's | Cumulus Documentation - + @@ -13,7 +13,7 @@
    Version: v14.1.0

    Cumulus Documentation: How To's

    Cumulus Docs Installation

    Run a Local Server

    Environment variables DOCSEARCH_APP_ID, DOCSEARCH_API_KEY and DOCSEARCH_INDEX_NAME must be set for search to work. At the moment, search is only truly functional on prod because that is the only website we have registered to be indexed with DocSearch (see below on search).
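For example, you might export placeholder values in your shell before running the build; the real values come from DocSearch:

export DOCSEARCH_APP_ID=<app-id>
export DOCSEARCH_API_KEY=<api-key>
export DOCSEARCH_INDEX_NAME=<index-name>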

    git clone git@github.com:nasa/cumulus
    cd cumulus
    npm run docs-install
    npm run docs-serve
    note

    docs-build will build the documents into website/build. docs-clear will clear the documents.

    caution

    Fix any broken links reported by Docusaurus if you see the following messages during build.

    [INFO] Docusaurus found broken links!

    Exhaustive list of all broken links found:

    Cumulus Documentation

Our project documentation is hosted on GitHub Pages. The resources published to this website are housed in the docs/ directory at the top of the Cumulus repository. Those resources primarily consist of markdown files and images.

    We use the open-source static website generator Docusaurus to build html files from our markdown documentation, add some organization and navigation, and provide some other niceties in the final website (search, easy templating, etc.).

    Add a New Page and Sidebars

    Adding a new page should be as simple as writing some documentation in markdown, placing it under the correct directory in the docs/ folder and adding some configuration values wrapped by --- at the top of the file. There are many files that already have this header which can be used as reference.

    ---
    id: doc-unique-id # unique id for this document. This must be unique across ALL documentation under docs/
    title: Title Of Doc # Whatever title you feel like adding. This will show up as the index to this page on the sidebar.
    hide_title: false
    ---

Note: To have the new page show up in a sidebar, the designated id must be added to a sidebar in the website/sidebars.js file. Docusaurus has an in-depth explanation of sidebars here.

    Versioning Docs

We lean heavily on Docusaurus for versioning. Their suggestions and walk-through can be found here. Docusaurus v2 uses a snapshot approach for documentation versioning: each versioned set of docs is independent of the others. It is worth noting that we would like the documentation versions to match up directly with release versions. However, because a new set of versioned docs can take up a lot of repo space and requires maintenance, we suggest updating the existing versioned docs for minor releases when there are no significant functionality changes. Cumulus versioning is explained in the Versioning Docs.

Search on our documentation site is taken care of by DocSearch. We have been provided with an appId, an apiKey, and an indexName by DocSearch that we include in our website/docusaurus.config.js file. The rest (indexing and actual searching) we leave to DocSearch. Our builds expect environment variables for these values to exist: DOCSEARCH_APP_ID, DOCSEARCH_API_KEY, and DOCSEARCH_INDEX_NAME.

    Add a new task

The tasks list in docs/tasks.md is generated from the list of task packages in the tasks folder. Do not edit the docs/tasks.md file directly.

    Read more about adding a new task.

    Editing the tasks.md header or template

    Look at the bin/build-tasks-doc.js and bin/tasks-header.md files to edit the output of the tasks build script.

    Editing diagrams

    For some diagrams included in the documentation, the raw source is included in the docs/assets/raw directory to allow for easy updating in the future:

    • assets/interfaces.svg -> assets/raw/interfaces.drawio (generated using draw.io)

    Deployment

The master branch is automatically built and deployed to the gh-pages branch. The gh-pages branch is served by GitHub Pages. Do not make edits to the gh-pages branch.

    - + \ No newline at end of file diff --git a/docs/v14.1.0/external-contributions/index.html b/docs/v14.1.0/external-contributions/index.html index d7963c788ff..df3a87ff011 100644 --- a/docs/v14.1.0/external-contributions/index.html +++ b/docs/v14.1.0/external-contributions/index.html @@ -5,13 +5,13 @@ External Contributions | Cumulus Documentation - +
    Version: v14.1.0

    External Contributions

    Contributions to Cumulus may be made in the form of PRs to the repositories directly or through externally developed tasks and components. Cumulus is designed as an ecosystem that leverages Terraform deployments and AWS Step Functions to easily integrate external components.

    This list may not be exhaustive and represents components that are open source, owned externally, and that have been tested with the Cumulus system. For more information and contributing guidelines, visit the respective GitHub repositories.

    Distribution

    The ASF Thin Egress App is used by Cumulus for distribution. TEA can be deployed with Cumulus or as part of other applications to distribute data.

    Operational Cloud Recovery Archive (ORCA)

    ORCA can be deployed with Cumulus to provide a customizable baseline for creating and managing operational backups.

    Workflow Tasks

    CNM

    PO.DAAC provides two workflow tasks to be used with the Cloud Notification Mechanism (CNM) Schema: CNM to Granule and CNM Response.

    See the CNM workflow data cookbook for an example of how these can be used in a Cumulus ingest workflow.

    DMR++ Generation

GHRC has provided a DMR++ Generation workflow task. This task is meant to be used in conjunction with Cumulus' Hyrax Metadata Updates workflow task.

    - + \ No newline at end of file diff --git a/docs/v14.1.0/faqs/index.html b/docs/v14.1.0/faqs/index.html index 20b05a7503c..33a54947557 100644 --- a/docs/v14.1.0/faqs/index.html +++ b/docs/v14.1.0/faqs/index.html @@ -5,13 +5,13 @@ Frequently Asked Questions | Cumulus Documentation - +
    Version: v14.1.0

    Frequently Asked Questions

    Below are some commonly asked questions that you may encounter that can assist you along the way when working with Cumulus.

    General | Workflows | Integrators & Developers | Operators


    General

    What prerequisites are needed to setup Cumulus?
Answer: Here is a list of the tools and access that you will need in order to get started. To maintain the up-to-date versions that we are using, please visit our Cumulus main README (https://github.com/nasa/cumulus) for details.
    • NVM for node versioning
    • AWS CLI
    • Bash
    • Docker (only required for testing)
• docker-compose (only required for testing; pip install docker-compose)
    • Python
    • pipenv

    Keep in mind you will need access to the AWS console and an Earthdata account before you can deploy Cumulus.

    What is the preferred web browser for the Cumulus environment?

    Answer: Our preferred web browser is the latest version of Google Chrome.

    How do I deploy a new instance in Cumulus?

    Answer: For steps on the Cumulus deployment process go to How to Deploy Cumulus.

    Where can I find Cumulus release notes?

    Answer: To get the latest information about updates to Cumulus go to Cumulus Versions.

    How do I quickly troubleshoot an issue in Cumulus?

    Answer: To troubleshoot and fix issues in Cumulus reference our recommended solutions in Troubleshooting Cumulus.

    Where can I get support help?

    Answer: The following options are available for assistance:

    • Cumulus: Outside NASA users should file a GitHub issue and inside NASA users should file a Cumulus JIRA ticket.
    • AWS: You can create a case in the AWS Support Center, accessible via your AWS Console.

    For more information on how to submit an issue or contribute to Cumulus, follow our guidelines at Contributing.


    Workflows

    What is a Cumulus workflow?

    Answer: A workflow is a provider-configured set of steps that describe the process to ingest data. Workflows are defined using AWS Step Functions. For more details, we suggest visiting the Workflows section.

    How do I set up a Cumulus workflow?

    Answer: You will need to create a provider, have an associated collection (add a new one), and generate a new rule first. Then you can set up a Cumulus workflow by following these steps here.

    Where can I find a list of workflow tasks?

    Answer: You can access a list of reusable tasks for Cumulus development at Cumulus Tasks.

    Are there any third-party workflows or applications that I can use with Cumulus?

    Answer: The Cumulus team works with various partners to help build a robust framework. You can visit our External Contributions section to see what other options are available to help you customize Cumulus for your needs.


    Integrators & Developers

    What is a Cumulus integrator?

    Answer: Those who work within Cumulus and AWS for deployments and to manage workflows. They may perform the following functions:

    • Configure and deploy Cumulus to the AWS environment
    • Configure Cumulus workflows
    • Write custom workflow tasks
    What are the steps if I run into an issue during deployment?

    Answer: If you encounter an issue with your deployment go to the Troubleshooting Deployment guide.

    Is Cumulus customizable and flexible?

    Answer: Yes. Cumulus has a modular architecture that allows you to decide which components you want/need to deploy. These components are maintained as Terraform modules.

    What are Terraform modules?

    Answer: They are modules that are composed to create a Cumulus deployment, which gives integrators the flexibility to choose the components of Cumulus that they want/need. To view Cumulus maintained modules or steps on how to create a module go to Terraform modules.

    Where do I find Terraform module variables?

    Answer: Go here for a list of Cumulus maintained variables.

    What are the common use cases that a Cumulus integrator encounters?

    Answer: The following are some examples of possible use cases you may see:


    Operators

    What is a Cumulus operator?

    Answer: Those who ingest, archive, and troubleshoot datasets (called collections in Cumulus). Your daily activities might include, but are not limited to, the following:

    • Ingesting datasets
    • Maintaining historical data ingest
    • Starting and stopping data handlers
    • Managing collections
    • Managing provider definitions
    • Creating, enabling, and disabling rules
    • Investigating errors for granules and deleting or re-ingesting granules
    • Investigating errors in executions and isolating failed workflow step(s)
    What are the common use cases that a Cumulus operator encounters?

    Answer: The following are some examples of possible use cases you may see:

    Explore more Cumulus operator best practices and how-tos in the dedicated Operator Docs.

    Can you re-run a workflow execution in AWS?

    Answer: Yes. For steps on how to re-run a workflow execution go to Re-running workflow executions in the Cumulus Operator Docs.

    - + \ No newline at end of file diff --git a/docs/v14.1.0/features/ancillary_metadata/index.html b/docs/v14.1.0/features/ancillary_metadata/index.html index 49aae535241..9e6261cc83b 100644 --- a/docs/v14.1.0/features/ancillary_metadata/index.html +++ b/docs/v14.1.0/features/ancillary_metadata/index.html @@ -5,7 +5,7 @@ Ancillary Metadata Export | Cumulus Documentation - + @@ -13,7 +13,7 @@
    Version: v14.1.0

    Ancillary Metadata Export

    This feature utilizes the type key on a files object in a Cumulus granule. It uses the key to provide a mechanism where granule discovery, processing and other tasks can set and use this value to facilitate metadata export to CMR.

    Tasks setting type

    Discover Granules

    Uses the Collection type key to set the value for files on discovered granules in its output.

    Parse PDR

    Uses a task-specific mapping to map PDR 'FILE_TYPE' to a CNM type to set type on granules from the PDR.

    CNMToCMALambdaFunction

    Natively supports types that are included in incoming messages to a CNM Workflow.

    Tasks using type

    Move Granules

    Uses the granule file type key to update UMM/ECHO 10 CMR files passed in as candidates to the task. This task adds the external facing URLs to the CMR metadata file based on the type. See the file tracking data cookbook for a detailed mapping. If a non-CNM type is specified, the task assumes it is a 'data' file.

    - + \ No newline at end of file diff --git a/docs/v14.1.0/features/backup_and_restore/index.html b/docs/v14.1.0/features/backup_and_restore/index.html index 6f64fac4925..360434539ae 100644 --- a/docs/v14.1.0/features/backup_and_restore/index.html +++ b/docs/v14.1.0/features/backup_and_restore/index.html @@ -5,7 +5,7 @@ Cumulus Backup and Restore | Cumulus Documentation - + @@ -52,7 +52,7 @@ writing to the old cluster.

  • Set the snapshot_identifier variable to the snapshot you wish to create the new cluster from, and configure the module like a new deployment, with a unique cluster_identifier

  • Deploy the module using terraform apply

  • Once deployed, verify the cluster has the expected data

  • Redeploy the data persistence and Cumulus deployments. You should not need to reconfigure either, as the secret ARN and the security group should not change; however, double-check that the configured values are as expected

  • - + \ No newline at end of file diff --git a/docs/v14.1.0/features/dead_letter_archive/index.html b/docs/v14.1.0/features/dead_letter_archive/index.html index df55254eb2d..0cc95897deb 100644 --- a/docs/v14.1.0/features/dead_letter_archive/index.html +++ b/docs/v14.1.0/features/dead_letter_archive/index.html @@ -5,13 +5,13 @@ Cumulus Dead Letter Archive | Cumulus Documentation - +
    Version: v14.1.0

    Cumulus Dead Letter Archive

    This documentation explains the Cumulus dead letter archive and associated functionality.

    DB Records DLQ Archive

    The Cumulus system contains a number of dead letter queues. Perhaps the most important system lambda function supported by a DLQ is the sfEventSqsToDbRecords lambda function which parses Cumulus messages from workflow executions to generate and write database records to the Cumulus database.

    As of Cumulus v9+, the dead letter queue for this lambda (named sfEventSqsToDbRecordsDeadLetterQueue) has been updated with a consumer lambda that will automatically write any incoming records to the S3 system bucket, under the path <stackName>/dead-letter-archive/sqs/. This will allow integrators and operators engaged in debugging missing records to inspect any Cumulus messages which failed to process and did not result in the successful creation of database records.
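
    For example, the archived messages can be listed directly with the AWS CLI. This is a minimal sketch; the bucket and stack names below are placeholders for your deployment's values:

    # List archived Cumulus messages that failed database record processing
    aws s3 ls "s3://<system-bucket>/<stackName>/dead-letter-archive/sqs/" --recursive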

    Dead Letter Archive recovery

    In addition to the above, as of Cumulus v9+, the Cumulus API also contains a new endpoint at /deadLetterArchive/recoverCumulusMessages.

    Sending a POST request to this endpoint will trigger a Cumulus AsyncOperation that will attempt to reprocess (and if successful delete) all Cumulus messages in the dead letter archive, using the same underlying logic as the existing sfEventSqsToDbRecords. Otherwise, all Cumulus messages that fail to be reprocessed will be moved to a new archive location under the path <stackName>/dead-letter-archive/failed-sqs/<YYYY-MM-DD>.
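
    A minimal sketch of starting the recovery operation via curl is shown below; the host and token are placeholders, and the Cumulus API documentation remains the authoritative reference for this endpoint:

    $ curl --request POST https://example.com/deadLetterArchive/recoverCumulusMessages \
    --header 'Authorization: Bearer ReplaceWithTheToken'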

    This endpoint may prove particularly useful when recovering from an extended or unexpected database outage, where messages failed to process due to the external outage and there is no essential malformation of each Cumulus message.

    - + \ No newline at end of file diff --git a/docs/v14.1.0/features/dead_letter_queues/index.html b/docs/v14.1.0/features/dead_letter_queues/index.html index d5cc568e45a..0d9d034a504 100644 --- a/docs/v14.1.0/features/dead_letter_queues/index.html +++ b/docs/v14.1.0/features/dead_letter_queues/index.html @@ -5,13 +5,13 @@ Dead Letter Queues | Cumulus Documentation - +
    Version: v14.1.0

    Dead Letter Queues

    startSF SQS queue

    The workflow-trigger for the startSF queue has a Redrive Policy set up that directs any failed attempts to pull from the workflow start queue to an SQS Dead Letter Queue.

    This queue can then be monitored for failures to initiate a workflow. Please note that workflow failures will not show up in this queue, only repeated failures to trigger a workflow.

    Named Lambda Dead Letter Queues

    Cumulus provides configured Dead Letter Queues (DLQ) for non-workflow Lambdas (such as ScheduleSF) to capture Lambda failures for further processing.

    These DLQs are setup with the following configuration:

      receive_wait_time_seconds  = 20
      message_retention_seconds  = 1209600
      visibility_timeout_seconds = 60

    Default Lambda Configuration

    The following built-in Cumulus Lambdas are set up with DLQs to allow handling of process failures:

    • dbIndexer (Updates Elasticsearch)
    • JobsLambda (writes logs outputs to Elasticsearch)
    • ScheduleSF (the SF Scheduler Lambda that places messages on the queue that is used to start workflows, see Workflow Triggers)
    • publishReports (Lambda that publishes messages to the SNS topics for execution, granule and PDR reporting)
    • reportGranules, reportExecutions, reportPdrs (Lambdas responsible for updating records based on messages in the queues published by publishReports)

    Troubleshooting/Utilizing messages in a Dead Letter Queue

    Ideally, an automated process should be configured to poll the queue and process messages off a dead letter queue.

    To aid in manual troubleshooting, you can utilize the SQS Management console to view messages available in the queues set up for a particular stack. The dead letter queues will have a Message Body containing the Lambda payload, as well as Message Attributes that reference both the error returned and a RequestID, which can be cross-referenced to the associated Lambda's CloudWatch logs for more information:

    Screenshot of the AWS SQS console showing how to view SQS message attributes
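
    As an alternative to the console, a minimal sketch using the AWS CLI is shown below; the queue name is a placeholder, and the actual queue names will vary by stack (check the SQS console for your deployment's names):

    # Look up the queue URL for a given dead letter queue (queue name is a placeholder)
    aws sqs get-queue-url --queue-name "<prefix>-ScheduleSFDeadLetterQueue"

    # Read up to 10 messages, including their attributes; messages are not deleted by this call
    aws sqs receive-message \
    --queue-url "<queue-url-from-previous-command>" \
    --max-number-of-messages 10 \
    --message-attribute-names All \
    --attribute-names All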

    - + \ No newline at end of file diff --git a/docs/v14.1.0/features/distribution-metrics/index.html b/docs/v14.1.0/features/distribution-metrics/index.html index 1cf26e2d222..7bc7dd147f6 100644 --- a/docs/v14.1.0/features/distribution-metrics/index.html +++ b/docs/v14.1.0/features/distribution-metrics/index.html @@ -5,13 +5,13 @@ Cumulus Distribution Metrics | Cumulus Documentation - +
    Version: v14.1.0

    Cumulus Distribution Metrics

    It is possible to configure Cumulus and the Cumulus Dashboard to display information about the successes and failures of requests for data. This requires the Cumulus instance to deliver Cloudwatch Logs and S3 Server Access logs to an ELK stack.

    ESDIS Metrics in NGAP

    Work with the ESDIS metrics team to set up permissions and access to forward Cloudwatch Logs to a shared AWS:Logs:Destination, as well as to transfer your S3 Server Access logs to a metrics team bucket.

    The metrics team has taken care of setting up logstash to ingest the files that get delivered to their bucket into their Elasticsearch instance.

    Once Cumulus has been configured to deliver Cloudwatch logs to the ESDIS Metrics team, you can use the Elasticsearch indexes to create the necessary target patterns on the dashboard. These are often <daac>-cloudwatch-cumulus-<env>-* and <daac>-distribution-<env>-*, but they will depend on your specific Elasticsearch setup.

    Cumulus / ESDIS Metrics distribution system

    Architecture diagram showing how logs are replicated from a Cumulus instance to the ESDIS Metrics account and accessed by the Cumulus dashboard

    - + \ No newline at end of file diff --git a/docs/v14.1.0/features/execution_payload_retention/index.html b/docs/v14.1.0/features/execution_payload_retention/index.html index 89a223baa08..685c10660d6 100644 --- a/docs/v14.1.0/features/execution_payload_retention/index.html +++ b/docs/v14.1.0/features/execution_payload_retention/index.html @@ -5,13 +5,13 @@ Execution Payload Retention | Cumulus Documentation - +
    Version: v14.1.0

    Execution Payload Retention

    In addition to CloudWatch logs and AWS StepFunction API records, Cumulus automatically stores the initial and 'final' (the last update to the execution record) payload values as part of the Execution record in your RDS database and Elasticsearch.

    This allows access via the API (or optionally direct DB/Elasticsearch querying) for debugging/reporting purposes. The data is stored in the "originalPayload" and "finalPayload" fields.

    Payload record cleanup

    To reduce storage requirements, a CloudWatch rule ({stack-name}-dailyExecutionPayloadCleanupRule) triggering a daily run of the provided cleanExecutions lambda has been added. This lambda will remove all 'completed' and 'non-completed' payload records in the database that are older than the thresholds specified in the configuration below.

    Configuration

    The following configuration flags have been made available in the cumulus module. They may be overridden in your deployment's instance of the cumulus module by adding the following configuration options:

    daily_execution_payload_cleanup_schedule_expression (string)

    This configuration option sets the execution times for this Lambda to run, using a Cloudwatch cron expression.

    Default value is "cron(0 4 * * ? *)".

    complete_execution_payload_timeout_disable (bool)

    This configuration option, when set to true, will disable all cleanup of completed execution payloads.

    Default value is false.

    complete_execution_payload_timeout (number)

    This flag defines the cleanup threshold for executions with a 'completed' status in days. Records with updatedAt values older than this with payload information will have that information removed.

    Default value is 10.

    non_complete_execution_payload_timeout_disable (bool)

    This configuration option, when set to true, will disable all cleanup of "non-complete" (any status other than completed) execution payloads.

    Default value is false.

    non_complete_execution_payload_timeout (number)

    This flag defines the cleanup threshold for executions with a status other than 'complete' in days. Records with updateTime values older than this with payload information will have that information removed.

    Default value is 30 days.

    • complete_execution_payload_disable/non_complete_execution_payload_disable

    These flags (true/false) determine if the cleanup script's logic for 'complete' and 'non-complete' executions will run. Default value is false for both.

    - + \ No newline at end of file diff --git a/docs/v14.1.0/features/logging-esdis-metrics/index.html b/docs/v14.1.0/features/logging-esdis-metrics/index.html index 92adf2f49a4..7773e7eae42 100644 --- a/docs/v14.1.0/features/logging-esdis-metrics/index.html +++ b/docs/v14.1.0/features/logging-esdis-metrics/index.html @@ -5,13 +5,13 @@ Writing logs for ESDIS Metrics | Cumulus Documentation - +
    Version: v14.1.0

    Writing logs for ESDIS Metrics

    Note: This feature is only available for Cumulus deployments in NGAP environments.

    Prerequisite: You must configure your Cumulus deployment to deliver your logs to the correct shared logs destination for ESDIS metrics.

    Log messages delivered to the ESDIS metrics logs destination conforming to an expected format will be automatically ingested and parsed to enable helpful searching/filtering of your logs via the ESDIS metrics Kibana dashboard.

    Expected log format

    The ESDIS metrics pipeline expects a log message to be a JSON string representation of an object (dict in Python or map in Java). An example log message might look like:

    {
    "level": "info",
    "executions": "arn:aws:states:us-east-1:000000000000:execution:MySfn:abcd1234",
    "granules": "[\"granule-1\",\"granule-2\"]",
    "message": "hello world",
    "sender": "greetingFunction",
    "stackName": "myCumulus",
    "timestamp": "2018-10-19T19:12:47.501Z"
    }

    A log message can contain the following properties:

    • executions: The AWS Step Function execution name in which this task is executing, if any
    • granules: A JSON string of the array of granule IDs being processed by this code, if any
    • level: A string identifier for the type of message being logged. Possible values:
      • debug
      • error
      • fatal
      • info
      • warn
      • trace
    • message: String containing your actual log message
    • parentArn: The parent AWS Step Function execution ARN that triggered the current execution, if any
    • sender: The name of the resource generating the log message (e.g. a library name, a Lambda function name, an ECS activity name)
    • stackName: The unique prefix for your Cumulus deployment
    • timestamp: An ISO-8601 formatted timestamp
    • version: The version of the resource generating the log message, if any

    None of these properties are explicitly required for ESDIS metrics to parse your log correctly. However, a log without a message has no informational content. And having level, sender, and timestamp properties is very useful for filtering your logs. Including a stackName in your logs is helpful as it allows you to distinguish between logs generated by different deployments.

    Using Cumulus Message Adapter libraries

    If you are writing a custom task that is integrated with the Cumulus Message Adapter, then some of the language-specific client libraries can be used to write logs compatible with ESDIS metrics.

    The usage of each library differs slightly, but in general a logger is initialized with a Cumulus workflow message to determine the contextual information for the task (e.g. granules, executions). Then, after the logger is initialized, writing logs only requires specifying a message, but the logged output will include the contextual information as well.

    Writing logs using custom code

    Any code that produces logs matching the expected log format can be processed by ESDIS metrics.

    Node.js

    Cumulus core provides a @cumulus/logger library that writes logs in the expected format for ESDIS metrics.

    - + \ No newline at end of file diff --git a/docs/v14.1.0/features/replay-archived-sqs-messages/index.html b/docs/v14.1.0/features/replay-archived-sqs-messages/index.html index 064a1647d63..7147dc658aa 100644 --- a/docs/v14.1.0/features/replay-archived-sqs-messages/index.html +++ b/docs/v14.1.0/features/replay-archived-sqs-messages/index.html @@ -5,14 +5,14 @@ How to replay SQS messages archived in S3 | Cumulus Documentation - +
    Version: v14.1.0

    How to replay SQS messages archived in S3

    Context

    Cumulus archives all incoming SQS messages to S3 and removes messages once they have been processed. Unprocessed messages are archived at the path: ${stackName}/archived-incoming-messages/${queueName}/${messageId}

    Replay SQS messages endpoint

    The Cumulus API has added a new endpoint, /replays/sqs. This endpoint will allow you to start a replay operation to requeue all archived SQS messages by queueName and returns an AsyncOperationId for operation status tracking.

    Start replaying archived SQS messages

    In order to start a replay, you must perform a POST request to the replays/sqs endpoint.

    The required and optional fields that should be part of the body of this request are documented below.

    Field | Type | Description
    queueName | string | Any valid SQS queue name (not ARN)
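
    A hypothetical request, following the same curl conventions used elsewhere in these docs (host, token, and queue name are placeholders):

    $ curl --request POST https://example.com/replays/sqs \
    --header 'Authorization: Bearer ReplaceWithTheToken' \
    --header 'Content-Type: application/json' \
    --data '{
    "queueName": "<your-queue-name>"
    }'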

    Status tracking

    A successful response from the /replays/sqs endpoint will contain an asyncOperationId field. Use this ID with the /asyncOperations endpoint to track the status.

    - + \ No newline at end of file diff --git a/docs/v14.1.0/features/replay-kinesis-messages/index.html b/docs/v14.1.0/features/replay-kinesis-messages/index.html index 29302f9a2a2..70cb1ce18e1 100644 --- a/docs/v14.1.0/features/replay-kinesis-messages/index.html +++ b/docs/v14.1.0/features/replay-kinesis-messages/index.html @@ -5,7 +5,7 @@ How to replay Kinesis messages after an outage | Cumulus Documentation - + @@ -13,7 +13,7 @@
    Version: v14.1.0

    How to replay Kinesis messages after an outage

    After a period of outage, it may be necessary for a Cumulus operator to reprocess or 'replay' messages that arrived on an AWS Kinesis Data Stream but did not trigger an ingest. This document serves as an outline on how to start a replay operation, and how to perform status tracking. Cumulus supports replay of all Kinesis messages on a stream (subject to the normal RetentionPeriod constraints), or all messages within a given time slice delimited by start and end timestamps.

    As Kinesis has no comparable field to e.g. the SQS ReceiveCount on its records, Cumulus cannot tell which messages within a given time slice have never been processed, and cannot guarantee only missed messages will be processed. Users will have to rely on duplicate handling or some other method of identifying messages that should not be processed within the time slice.

    NOTE: This operation flow effectively changes only the trigger mechanism for Kinesis ingest notifications. The existence of valid Kinesis-type rules and all other normal requirements for the triggering of ingest via Kinesis still apply.

    Replays endpoint

    Cumulus has added a new endpoint to its API, /replays. This endpoint will allow you to start replay operations and returns an AsyncOperationId for operation status tracking.

    Start a replay

    In order to start a replay, you must perform a POST request to the replays endpoint.

    The required and optional fields that should be part of the body of this request are documented below.

    NOTE: As the endTimestamp relies on a comparison with the Kinesis server-side ApproximateArrivalTimestamp, and given that there is no documented level of accuracy for the approximation, it is recommended that the endTimestamp include some amount of buffer to allow for slight discrepancies. If tolerable, the same is recommended for the startTimestamp, although it is used differently and is less vulnerable to discrepancies, since a server-side arrival timestamp should never be earlier than the client-side request timestamp.

    Field | Type | Required | Description
    type | string | required | Currently only accepts kinesis.
    kinesisStream | string | for type kinesis | Any valid kinesis stream name (not ARN)
    kinesisStreamCreationTimestamp | * | optional | Any input valid for a JS Date constructor. For reasons to use this field see AWS documentation on StreamCreationTimestamp.
    endTimestamp | * | optional | Any input valid for a JS Date constructor. Messages newer than this timestamp will be skipped.
    startTimestamp | * | optional | Any input valid for a JS Date constructor. Messages will be fetched from the Kinesis stream starting at this timestamp. Ignored if it is further in the past than the stream's retention period.
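
    A hypothetical request replaying a time slice of a stream (host, token, stream name, and timestamps are placeholders):

    $ curl --request POST https://example.com/replays \
    --header 'Authorization: Bearer ReplaceWithTheToken' \
    --header 'Content-Type: application/json' \
    --data '{
    "type": "kinesis",
    "kinesisStream": "<your-stream-name>",
    "startTimestamp": "2023-01-01T00:00:00.000Z",
    "endTimestamp": "2023-01-02T00:00:00.000Z"
    }'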

    Status tracking

    A successful response from the /replays endpoint will contain an asyncOperationId field. Use this ID with the /asyncOperations endpoint to track the status.

    - + \ No newline at end of file diff --git a/docs/v14.1.0/features/reports/index.html b/docs/v14.1.0/features/reports/index.html index 3c2cf4e377b..98f0c0a5fc8 100644 --- a/docs/v14.1.0/features/reports/index.html +++ b/docs/v14.1.0/features/reports/index.html @@ -5,7 +5,7 @@ Reconciliation Reports | Cumulus Documentation - + @@ -19,7 +19,7 @@ report generation. The data buckets will include any buckets in your Cumulus buckets configuration that have type public, protected or private.
    - + \ No newline at end of file diff --git a/docs/v14.1.0/getting-started/index.html b/docs/v14.1.0/getting-started/index.html index 413ee64ce11..048a6aafc11 100644 --- a/docs/v14.1.0/getting-started/index.html +++ b/docs/v14.1.0/getting-started/index.html @@ -5,13 +5,13 @@ Getting Started | Cumulus Documentation - +
    Version: v14.1.0

    Getting Started

    Overview | Quick Tutorials | Helpful Tips

    Overview

    This serves as a guide for new Cumulus users to deploy and learn how to use Cumulus. Here you will learn what you need in order to complete any prerequisites, what Cumulus is and how it works, and how to successfully navigate and deploy a Cumulus environment.

    What is Cumulus

    Cumulus is an open source set of components for creating cloud-based data ingest, archive, distribution and management designed for NASA's future Earth Science data streams.

    Who uses Cumulus

    Data integrators/developers and operators across projects, not limited to NASA, use Cumulus for their daily work functions.

    Cumulus Roles

    Integrator/Developer

    Cumulus integrators/developers are those who work within Cumulus and AWS for deployments and to manage workflows.

    Operator

    Cumulus operators are those who work within Cumulus to ingest/archive data and manage collections.

    Role Guides

    As a developer, integrator, or operator, you will need to set up your environments to work in Cumulus. The following docs can get you started in your role specific activities.

    What is a Cumulus Data Type

    In Cumulus, we have the following types of data that you can create and manage:

    • Collections
    • Granules
    • Providers
    • Rules
    • Workflows
    • Executions
    • Reports

    For details on how to create or manage data types go to Data Management Types.


    Quick Tutorials

    Deployment & Configuration

    Cumulus is deployed to an AWS account, so you must have access to deploy resources to an AWS account to get started.

    1. Set up Git Secrets

    To ensure your AWS access keys and passwords are protected as you submit commits we recommend setting up Git Secrets.

    2. Deploy Cumulus Core and Cumulus Dashboard to AWS

    Follow the deployment instructions to deploy Cumulus to your AWS account.

    3. Configure and Run the HelloWorld Workflow

    If you have deployed using the cumulus-template-deploy repository, you have a HelloWorld workflow deployed to your Cumulus backend.

    You can see your deployed workflows on the Workflows page of your Cumulus dashboard.

    Configure a collection and provider using the setup guidance on the Cumulus dashboard.

    Then create a rule to trigger your HelloWorld workflow. You can select a rule type of one time.

    Navigate to the Executions page of the dashboard to check the status of your workflow execution.

    4. Configure a Custom Workflow

    See Developing a custom workflow documentation for adding a new workflow to your deployment.

    There are plenty of workflow examples using Cumulus tasks here. The Data Cookbooks provide a more in-depth look at some of these more advanced workflows and their configurations.

    There is a list of Cumulus tasks already included in your deployment here.

    After configuring your workflow and redeploying, you can configure and run your workflow using the same steps as in step 3.


    Helpful Tips

    Here are some useful tips to keep in mind when deploying or working in Cumulus.

    Integrator/Developer

    • Versioning and Releases: This documentation gives information on our global versioning approach. We suggest upgrading to the supported version for Cumulus, Cumulus dashboard, and Thin Egress App (TEA).
    • Cumulus Developer Documentation: We suggest that you read through and reference this resource for development best practices in Cumulus.
    • Cumulus Deployment: We will guide you on how to manually deploy a new instance of Cumulus. In this reference, you will learn how to install Terraform, create an AWS S3 bucket, configure a compatible database, and create a Lambda layer.
    • Terraform Best Practices: This will help guide you through your Terraform configuration and Cumulus deployment.

    For an introduction about Terraform go here.

    Operator

    Troubleshooting

    Troubleshooting: Some suggestions to help you troubleshoot and solve issues you may encounter.

    Resources

    - + \ No newline at end of file diff --git a/docs/v14.1.0/glossary/index.html b/docs/v14.1.0/glossary/index.html index 85b6feeb349..7e89100f8f1 100644 --- a/docs/v14.1.0/glossary/index.html +++ b/docs/v14.1.0/glossary/index.html @@ -5,13 +5,13 @@ Glossary | Cumulus Documentation - +
    Version: v14.1.0

    Glossary

    AWS Glossary

    For terms/items from Amazon/AWS not mentioned in this glossary, please refer to the AWS Glossary.

    Cumulus Glossary of Terms

    API Gateway

    Refers to AWS's API Gateway. Used by the Cumulus API.

    ARN

    Refers to an AWS "Amazon Resource Name".

    For more info, see the AWS documentation.

    AWS

    See: Amazon Web Services documentation.

    AWS Lambda/Lambda Function

    AWS's 'serverless' option. Allows the running of code without provisioning a service or managing server/ECS instances/etc.

    For more information, see the AWS Lambda documentation.

    AWS Access Keys

    Access credentials that give you access to AWS to act as an IAM user programmatically or from the command line.

    For more information, see the AWS IAM Documentation.

    Bucket

    An Amazon S3 cloud storage resource.

    For more information, see the AWS Bucket Documentation.

    CloudFormation

    An AWS service that allows you to define and manage cloud resources as a preconfigured block.

    For more information, see the AWS CloudFormation User Guide.

    Cloudformation Template

    A template that defines an AWS CloudFormation stack.

    For more information, see the AWS intro page.

    Cloudwatch

    AWS service that allows logging and metrics collection on various cloud resources you have in AWS.

    For more information, see the AWS User Guide.

    Cloud Notification Mechanism (CNM)

    An interface mechanism to support cloud-based ingest messaging. For more information, see PO.DAAC's CNM Schema.

    Common Metadata Repository (CMR)

    "A high-performance, high-quality, continuously evolving metadata system that catalogs Earth Science data and associated service metadata records". For more information, see NASA's CMR page.

    Collection (Cumulus)

    Cumulus Collections are logical sets of data objects of the same data type and version.

    For more information, see Collections - Data Management Types.

    Cumulus Message Adapter (CMA)

    A library designed to help task developers integrate step function tasks into a Cumulus workflow by adapting task input/output into the Cumulus Message format.

    For more information, see CMA workflow reference page.

    Distributed Active Archive Center (DAAC)

    Refers to a specific organization that's part of NASA's distributed system of archive centers. For more information see EOSDIS's DAAC page.

    Dead Letter Queue (DLQ)

    This refers to Amazon SQS Dead-Letter Queues - these SQS queues are specifically configured to capture failed messages from other services/SQS queues/etc to allow for processing of failed messages.

    For more on DLQs, see the Amazon Documentation and the Cumulus DLQ feature page.

    Developer

    Those who set up deployment and workflow management for Cumulus. Sometimes referred to as an integrator. See integrator.

    ECS

    Amazon's Elastic Container Service. Used in Cumulus by workflow steps that require more flexibility than Lambda can provide.

    For more information, see AWS's developer guide.

    ECS Activity

    An ECS instance run via a Step Function.

    Execution (Cumulus)

    A Cumulus execution refers to a single execution of a (Cumulus) Workflow.

    GIBS

    Global Imagery Browse Services

    Granule

    A granule is the smallest aggregation of data that can be independently managed (described, inventoried, and retrieved). Granules are always associated with a collection, which is a grouping of granules. A granule is a grouping of data files.

    IAM

    AWS Identity and Access Management.

    For more information, see AWS IAMs.

    Integrator/Developer

    Those who work within Cumulus and AWS for deployments and to manage workflows.

    Kinesis

    Amazon's platform for streaming data on AWS.

    See AWS Kinesis for more information.

    Lambda

    AWS's cloud service that lets you run code without provisioning or managing servers.

    For more information, see AWS's lambda page.

    Module (Terraform)

    Refers to a terraform module.

    Node

    See node.js.

    Node Package Manager (npm)

    Node package manager. Often referred to as npm.

    For more information, see npm.

    Operator

    Those who work within Cumulus to ingest/archive data and manage collections.

    PDR

    "Polling Delivery Mechanism" used in "DAAC Ingest" workflows.

    For more information, see nasa.gov.

    Packages (npm)

    npm-hosted node.js packages. Cumulus packages can be found on npm's site here.

    Provider

    Data source that generates and/or distributes data for Cumulus workflows to act upon.

    For more information, see the Cumulus documentation.

    Rule

    Rules are configurable scheduled events that trigger workflows based on various criteria.

    For more information, see the Cumulus Rules documentation.

    S3

    Amazon's Simple Storage Service provides data object storage in the cloud. Used in Cumulus to store configuration, data, and more.

    For more information, see AWS's S3 page.

    SIPS

    Science Investigator-led Processing Systems. In the context of DAAC ingest, this refers to data producers/providers.

    For more information, see nasa.gov.

    SNS

    Amazon's Simple Notification Service provides a messaging service that allows publication of and subscription to events. Used in Cumulus to trigger workflow events, track event failures, and others.

    For more information, see AWS's SNS page.

    SQS

    Amazon's Simple Queue Service.

    For more information, see AWS's SQS page.

    Stack

    A collection of AWS resources you can manage as a single unit.

    In the context of Cumulus, this refers to a deployment of the cumulus and data-persistence modules that is managed by Terraform.

    Step Function

    AWS's web service that allows you to compose complex workflows as a state machine comprised of tasks (Lambdas, activities hosted on EC2/ECS, some AWS service APIs, etc). See AWS's Step Function Documentation for more information. In the context of Cumulus these are the underlying AWS service used to create Workflows.

    Terraform

    Terraform is the tool that you will use for deployment and configuration of your Cumulus environment.

    Workflows

    Workflows are comprised of one or more AWS Lambda Functions and ECS Activities to discover, ingest, process, manage and archive data.

    - + \ No newline at end of file diff --git a/docs/v14.1.0/index.html b/docs/v14.1.0/index.html index 7258fb15c22..49a0bf1c534 100644 --- a/docs/v14.1.0/index.html +++ b/docs/v14.1.0/index.html @@ -5,13 +5,13 @@ Introduction | Cumulus Documentation - +
    Version: v14.1.0

    Introduction

    This Cumulus project seeks to address the existing need for a “native” cloud-based data ingest, archive, distribution, and management system that can be used for all future Earth Observing System Data and Information System (EOSDIS) data streams via the development and implementation of Cumulus. The term “native” implies that the system will leverage all components of a cloud infrastructure provided by the vendor for efficiency (in terms of both processing time and cost). Additionally, Cumulus will operate on future data streams involving satellite missions, aircraft missions, and field campaigns.

    This documentation includes guidelines, examples, and source code docs. It is accessible at https://nasa.github.io/cumulus.


    Get To Know Cumulus

    • Getting Started - here - If you are new to Cumulus we suggest that you begin with this section to help you understand and work in the environment.
    • General Cumulus Documentation - here <- you're here

    Cumulus Reference Docs

    • Cumulus API Documentation - here
    • Cumulus Developer Documentation - here - READMEs throughout the main repository.
    • Data Cookbooks - here

    Auxiliary Guides

    • Integrator Guide - here
    • Operator Docs - here

    Contributing

    Please refer to: https://github.com/nasa/cumulus/blob/master/CONTRIBUTING.md for information. We thank you in advance.

    - + \ No newline at end of file diff --git a/docs/v14.1.0/integrator-guide/about-int-guide/index.html b/docs/v14.1.0/integrator-guide/about-int-guide/index.html index 0535ec57edd..738d89bd6b6 100644 --- a/docs/v14.1.0/integrator-guide/about-int-guide/index.html +++ b/docs/v14.1.0/integrator-guide/about-int-guide/index.html @@ -5,13 +5,13 @@ About Integrator Guide | Cumulus Documentation - +
    Version: v14.1.0

    About Integrator Guide

    Purpose

    The Integrator Guide is intended to supplement the Cumulus documentation and Data Cookbooks. This content is for Cumulus integrators who are either new to the project or need a step-by-step resource to help them along.

    What Is A Cumulus Integrator

    Cumulus integrators are those who work within Cumulus and AWS for deployments and to manage workflows. They may perform the following functions:

    • Configure and deploy Cumulus to the AWS environment
    • Configure Cumulus workflows
    • Write custom workflow tasks
    - + \ No newline at end of file diff --git a/docs/v14.1.0/integrator-guide/int-common-use-cases/index.html b/docs/v14.1.0/integrator-guide/int-common-use-cases/index.html index aac55d7bee9..a05a71e70a4 100644 --- a/docs/v14.1.0/integrator-guide/int-common-use-cases/index.html +++ b/docs/v14.1.0/integrator-guide/int-common-use-cases/index.html @@ -5,13 +5,13 @@ Integrator Common Use Cases | Cumulus Documentation - + - + \ No newline at end of file diff --git a/docs/v14.1.0/integrator-guide/workflow-add-new-lambda/index.html b/docs/v14.1.0/integrator-guide/workflow-add-new-lambda/index.html index 81c10427d43..23b798902a2 100644 --- a/docs/v14.1.0/integrator-guide/workflow-add-new-lambda/index.html +++ b/docs/v14.1.0/integrator-guide/workflow-add-new-lambda/index.html @@ -5,13 +5,13 @@ Workflow - Add New Lambda | Cumulus Documentation - +
    Version: v14.1.0

    Workflow - Add New Lambda

    You can develop a workflow task in AWS Lambda or Elastic Container Service (ECS). AWS ECS requires Docker. For a list of tasks to use go to our Cumulus Tasks page.

    The following steps will help you as you write a new Lambda that integrates with a Cumulus workflow, and will aid your understanding of the Cumulus Message Adapter (CMA) process.

    Steps

    1. Define New Lambda in Terraform

    2. Add Task in JSON Object

      For details on how to set up a workflow via CMA go to the CMA Tasks: Message Flow.

      You will need to assign input and output for the new task and follow the CMA contract here. This contract defines how libraries should call the cumulus-message-adapter to integrate a task into an existing Cumulus Workflow.

    3. Verify New Task

      Check the updated workflow in AWS and in Cumulus.

    - + \ No newline at end of file diff --git a/docs/v14.1.0/integrator-guide/workflow-ts-failed-step/index.html b/docs/v14.1.0/integrator-guide/workflow-ts-failed-step/index.html index 9c2dd8b26ea..6becf9384e7 100644 --- a/docs/v14.1.0/integrator-guide/workflow-ts-failed-step/index.html +++ b/docs/v14.1.0/integrator-guide/workflow-ts-failed-step/index.html @@ -5,13 +5,13 @@ Workflow - Troubleshoot Failed Step(s) | Cumulus Documentation - +
    Version: v14.1.0

    Workflow - Troubleshoot Failed Step(s)

    Steps

    1. Locate Step
    • Go to Cumulus dashboard
    • Find the granule
    • Go to Executions to determine the failed step
    2. Investigate in Cloudwatch
    • Go to Cloudwatch
    • Locate lambda
    • Search Cloudwatch logs (see the example CLI query after these steps)
    3. Recreate Error

      In your sandbox environment, try to recreate the error.

    4. Resolution
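
    As referenced in the Investigate in Cloudwatch step above, the Cloudwatch log search can also be performed with the AWS CLI. This is a minimal sketch; the log group name and filter pattern are placeholders and will depend on your deployment's Lambda names:

    # Search a Lambda's log group for errors; --start-time is epoch milliseconds
    aws logs filter-log-events \
    --log-group-name "/aws/lambda/<prefix>-<LambdaName>" \
    --filter-pattern "ERROR" \
    --start-time 1672531200000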

    - + \ No newline at end of file diff --git a/docs/v14.1.0/interfaces/index.html b/docs/v14.1.0/interfaces/index.html index 7f5a9117e62..8889a35715a 100644 --- a/docs/v14.1.0/interfaces/index.html +++ b/docs/v14.1.0/interfaces/index.html @@ -5,13 +5,13 @@ Interfaces | Cumulus Documentation - +
    Version: v14.1.0

    Interfaces

    Cumulus has multiple interfaces that allow interaction with discrete components of the system, such as starting workflows via SNS/Kinesis/SQS, manually queueing workflow start messages, submitting SNS notifications for completed workflows, and the many operations allowed by the Cumulus API.

    The diagram below illustrates the workflow process in detail and the various interfaces that allow starting of workflows, reporting of workflow information, and database create operations that occur when a workflow reporting message is processed. For interfaces with expected input or output schemas, details are provided below.

    Architecture diagram showing the interfaces for triggering and reporting of Cumulus workflow executions

    Workflow triggers and queuing

    Kinesis stream

    As a Kinesis stream is consumed by the messageConsumer Lambda to queue workflow executions, the incoming event is validated against this consumer schema by the ajv package.

    SQS queue for executions

    The messages put into the SQS queue for executions should conform to the Cumulus message format.

    Workflow executions

    See the documentation on Cumulus workflows.

    Workflow reporting

    SNS reporting topics

    For granule and PDR reporting, the topics will only receive data if the Cumulus workflow execution message meets the following criteria:

    • Granules - workflow message contains granule data in payload.granules
    • PDRs - workflow message contains PDR data in payload.pdr

    The messages published to the SNS reporting topics for executions and PDRs and the record property in the messages published to the granules SNS topic should conform to the model schema for each data type.

    Further detail on workflow reporting and how to interact with these interfaces can be found in the workflow notifications data cookbook.

    Cumulus API

    See the Cumulus API documentation.

    - + \ No newline at end of file diff --git a/docs/v14.1.0/operator-docs/about-operator-docs/index.html b/docs/v14.1.0/operator-docs/about-operator-docs/index.html index 281f74e383b..c4fee7d4a51 100644 --- a/docs/v14.1.0/operator-docs/about-operator-docs/index.html +++ b/docs/v14.1.0/operator-docs/about-operator-docs/index.html @@ -5,13 +5,13 @@ About Operator Docs | Cumulus Documentation - +
    Version: v14.1.0

    About Operator Docs

    Purpose

    Operator Docs are an augmentation to Cumulus documentation and Data Cookbooks. These documents will walk step-by-step through common Cumulus activities (that aren't necessarily as use-case directed as what you'd see in Data Cookbooks).

    What Is A Cumulus Operator

    Cumulus operators are those who work within Cumulus to ingest/archive data and manage collections. They may perform the following functions via the operator dashboard or API:

    • Configure providers and collections
    • Configure rules and monitor workflow executions
    • Monitor granule ingestion
    • Monitor system metrics
    - + \ No newline at end of file diff --git a/docs/v14.1.0/operator-docs/bulk-operations/index.html b/docs/v14.1.0/operator-docs/bulk-operations/index.html index e2518425b9e..139a9d3c959 100644 --- a/docs/v14.1.0/operator-docs/bulk-operations/index.html +++ b/docs/v14.1.0/operator-docs/bulk-operations/index.html @@ -5,14 +5,14 @@ Bulk Operations | Cumulus Documentation - +
    Version: v14.1.0

    Bulk Operations

    Cumulus implements bulk operations through the use of AsyncOperations, which are long-running processes executed on an AWS ECS cluster.

    Submitting a bulk API request

    Bulk operations are generally submitted via the endpoint for the relevant data type, e.g. granules. For a list of supported API requests, refer to the Cumulus API documentation. Bulk operations are denoted with the keyword 'bulk'.

    Starting bulk operations from the Cumulus dashboard

    Using a Kibana query

    Note: You must have configured your dashboard build with a KIBANAROOT environment variable in order for the Kibana link to render in the bulk granules modal

    1. From the Granules dashboard page, click on the "Run Bulk Granules" button, then select what type of action you would like to perform

      • Note: the rest of the process is the same regardless of what type of bulk action you perform
    2. From the bulk granules modal, click the "Open Kibana" link:

      Screenshot of Cumulus dashboard showing modal window for triggering bulk granule operations

    3. Once you have accessed Kibana, navigate to the "Discover" page. If this is your first time using Kibana, you may see a message like this at the top of the page:

      In order to visualize and explore data in Kibana, you'll need to create an index pattern to retrieve data from Elasticsearch.

      In that case, see the docs for creating an index pattern for Kibana

      Screenshot of Kibana user interface showing the &quot;Discover&quot; page for running queries

    4. Enter a query that returns the granule records that you want to use for bulk operations:

      Screenshot of Kibana user interface showing an example Kibana query and results

    5. Once the Kibana query is returning the results you want, click the "Inspect" link near the top of the page. A slide out tab with request details will appear on the right side of the page:

      Screenshot of Kibana user interface showing details of an example request

    6. In the slide out tab that appears on the right side of the page, click the "Request" link near the top and scroll down until you see the query property:

      Screenshot of Kibana user interface showing the Elasticsearch data request made for a given Kibana query

      Highlight and copy the query contents from Kibana. Go back to the Cumulus dashboard and paste the query contents from Kibana inside of the query property in the bulk granules request payload. You should end up with a query property nested inside of the existing query property:

      Screenshot of Cumulus dashboard showing modal window for triggering bulk granule operations with query information populated

      Add values for the index and workflowName to the bulk granules request payload (see the example request after this list). The value for index will vary based on your Elasticsearch setup, but it is good to target an index specifically for granule data if possible:

      Screenshot of Cumulus dashboard showing modal window for triggering bulk granule operations with query, index, and workflow information populated

    9. Click the "Run Bulk Operations" button. You should see a confirmation message, including an ID for the async operation that was started to handle your bulk action. You can track the status of this async operation on the Operations dashboard page, which can be visited by clicking the "Go To Operations" button:

      Screenshot of Cumulus dashboard showing confirmation message with async operation ID for bulk granules request
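
    For reference, a hypothetical bulk granules request (shown here as a direct call to the /granules/bulk endpoint) with a Kibana query pasted into the nested query property might look like the sketch below. The host, token, index name, workflow name, and query contents are all placeholders:

    $ curl --request POST https://example.com/granules/bulk \
    --header 'Authorization: Bearer ReplaceWithTheToken' \
    --header 'Content-Type: application/json' \
    --data '{
    "index": "<your-granule-index>",
    "workflowName": "<YourWorkflowName>",
    "query": {
    "query": {
    "match": {
    "granuleId": "<granuleId-to-match>"
    }
    }
    }
    }'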

    Creating an index pattern for Kibana

    1. Define the index pattern for the indices that your Kibana queries should use. A wildcard character, *, will match across multiple indices. Once you are satisfied with your index pattern, click the "Next step" button:

      Screenshot of Kibana user interface for defining an index pattern

    2. Choose whether to use a Time Filter for your data, which is not required. Then click the "Create index pattern" button:

      Screenshot of Kibana user interface for configuring the settings of an index pattern

    Status Tracking

    All bulk operations return an AsyncOperationId which can be submitted to the /asyncOperations endpoint.

    The /asyncOperations endpoint allows listing of AsyncOperation records as well as record retrieval for individual records, which will contain the status. The Cumulus API documentation shows sample requests for these actions.
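
    For example, an individual operation's status could be retrieved with a request like the following (host, token, and ID are placeholders):

    $ curl --request GET https://example.com/asyncOperations/<asyncOperationId> \
    --header 'Authorization: Bearer ReplaceWithTheToken'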

    The Cumulus Dashboard also includes an Operations monitoring page, where operations and their status are visible:

    Screenshot of Cumulus Dashboard Operations Page showing 5 operations and their status, ID, description, type and creation timestamp

    - + \ No newline at end of file diff --git a/docs/v14.1.0/operator-docs/cmr-operations/index.html b/docs/v14.1.0/operator-docs/cmr-operations/index.html index e7e2004dc00..bc3fae06c13 100644 --- a/docs/v14.1.0/operator-docs/cmr-operations/index.html +++ b/docs/v14.1.0/operator-docs/cmr-operations/index.html @@ -5,7 +5,7 @@ CMR Operations | Cumulus Documentation - + @@ -16,7 +16,7 @@ UpdateCmrAccessConstraints will update CMR metadata file contents on S3, and PostToCmr will push the updates to CMR. The rest of this section will assume you have created this workflow under the name UpdateCmrAccessConstraints.

    Once created and deployed, the workflow is available in the Cumulus dashboard's Execute workflow selector. However, note that additional configuration is required for this request, to supply an access constraint integer value and optional description to the UpdateCmrAccessConstraints workflow, by clicking the Add Custom Workflow Meta option in the Execute popup, as shown below:

    Screenshot showing granule execute popup with 'updateCmrAccessConstraints' selected and configuration values shown in a collapsible JSON field

    An example invocation of the API to perform this action is:

    $ curl --request PUT https://example.com/granules/MOD11A1.A2017137.h19v16.006.2017138085750 \
    --header 'Authorization: Bearer ReplaceWithTheToken' \
    --header 'Content-Type: application/json' \
    --data '{
    "action": "applyWorkflow",
    "workflow": "updateCmrAccessConstraints",
    "meta": {
    "accessConstraints": {
    "value": 5,
    "description": "sample access constraint"
    }
    }
    }'

    Supported CMR metadata formats for the above operation are Echo10XML and UMMG-JSON, which will populate the RestrictionFlag and RestrictionComment fields in Echo10XML, or the AccessConstraints values in UMMG-JSON.

    Additional Operations

    At this time Cumulus does not, out of the box, support additional operations on CMR metadata. However, given the examples shown above, we recommend working with your integrators to develop additional workflows that perform any required operations.

    Bulk CMR operations

    In order to perform the above operations in bulk, Cumulus supports the use of ApplyWorkflow in an AsyncOperation. These are accessed via the Bulk Operation button on the dashboard, or the /granules/bulk endpoint on the Cumulus API.

    More information on bulk operations is in the bulk operations operator doc.

    - + \ No newline at end of file diff --git a/docs/v14.1.0/operator-docs/create-rule-in-cumulus/index.html b/docs/v14.1.0/operator-docs/create-rule-in-cumulus/index.html index b7941320ca6..b7107d27aec 100644 --- a/docs/v14.1.0/operator-docs/create-rule-in-cumulus/index.html +++ b/docs/v14.1.0/operator-docs/create-rule-in-cumulus/index.html @@ -5,13 +5,13 @@ Create Rule In Cumulus | Cumulus Documentation - +
    Version: v14.1.0

    Create Rule In Cumulus

    Once the above files are in place and the entries created in CMR and Cumulus, we are ready to begin ingesting data. Depending on the type of ingestion (FTP/Kinesis, etc) the values below will change, but for the most part they are all similar. Rules tell Cumulus how to associate providers and collections, and when/how to start processing a workflow.

    Steps

    1. Go To Rules Page
    • Go to the Cumulus dashboard, click on Rules in the navigation.
    • Click Add Rule.

    Screenshot of Rules page

    2. Complete Form
    • Fill out the template form.

    Screenshot of a Rules template for adding a new rule

    For more details regarding the field definitions and required information go to Data Cookbooks.

    Note: If the state field is left blank, it defaults to false.

    Examples

    • A rule form with completed required fields:

    Screenshot of a completed rule form

    • A successfully added Rule:

    Screenshot of created rule

    - + \ No newline at end of file diff --git a/docs/v14.1.0/operator-docs/discovery-filtering/index.html b/docs/v14.1.0/operator-docs/discovery-filtering/index.html index 8cb11aafd83..66c3d0ad513 100644 --- a/docs/v14.1.0/operator-docs/discovery-filtering/index.html +++ b/docs/v14.1.0/operator-docs/discovery-filtering/index.html @@ -5,7 +5,7 @@ Discovery Filtering | Cumulus Documentation - + @@ -24,7 +24,7 @@ directly list the provider_path. If the path contains regular expression components, this may fail.

    It is recommended that operators diagnose any failures by checking error logs and ensuring that permissions on the remote file system allow reading of the default directory and any subdirectories that match the filter.

    Supported protocols

    Currently support for this feature is limited to the following protocols:

    • ftp
    • sftp
    - + \ No newline at end of file diff --git a/docs/v14.1.0/operator-docs/granule-workflows/index.html b/docs/v14.1.0/operator-docs/granule-workflows/index.html index af800447bbd..1c1eea138e8 100644 --- a/docs/v14.1.0/operator-docs/granule-workflows/index.html +++ b/docs/v14.1.0/operator-docs/granule-workflows/index.html @@ -5,13 +5,13 @@ Granule Workflows | Cumulus Documentation - +
    Version: v14.1.0

    Granule Workflows

    Failed Granule

    Delete and Ingest

    1. Delete Granule

    Note: Granules published to CMR will need to be removed from CMR via the dashboard prior to deletion

    2. Ingest Granule via Ingest Rule
    • Re-triggering a one-time, Kinesis, SQS, or SNS rule or a scheduled rule will re-discover and re-ingest the deleted granule.

    Reingest

    1. Select Failed Granule
    • In the Cumulus dashboard, go to the Collections page.
    • Use search field to find the granule.
    2. Re-ingest Granule
    • Go to the Collections page.
    • Click on Reingest and a modal will pop up for your confirmation.

    Screenshot of the Reingest modal workflow

    Delete and Ingest

    1. Bulk Delete Granules
    • Go to the Granules page.
    • Use the Bulk Delete button to bulk delete selected granules or select via a Kibana query

    Note: You can optionally force deletion from CMR

    2. Ingest Granules via Ingest Rule
    • Re-triggering one-time, Kinesis, SQS, or SNS rules or scheduled rules will re-discover and re-ingest the deleted granules.

    Multiple Failed Granules

    1. Select Failed Granules
    • In the Cumulus dashboard, go to the Collections page.
    • Click on Failed Granules.
    • Select multiple granules.

    Screenshot of selected multiple granules

    2. Bulk Re-ingest Granules
    • Click on Reingest and a modal will pop up for your confirmation.

    Screenshot of Bulk Reingest modal workflow

    - + \ No newline at end of file diff --git a/docs/v14.1.0/operator-docs/kinesis-stream-for-ingest/index.html b/docs/v14.1.0/operator-docs/kinesis-stream-for-ingest/index.html index 2681458beeb..3fa678472bf 100644 --- a/docs/v14.1.0/operator-docs/kinesis-stream-for-ingest/index.html +++ b/docs/v14.1.0/operator-docs/kinesis-stream-for-ingest/index.html @@ -5,13 +5,13 @@ Setup Kinesis Stream & CNM Message | Cumulus Documentation - +
    Version: v14.1.0

    Setup Kinesis Stream & CNM Message

    Note: Keep in mind that you should only have to set this up once per ingest stream. Kinesis pricing is based on the shard value and not on the amount of Kinesis usage.

    1. Create a Kinesis Stream

      • In your AWS console, go to the Kinesis service and click Create Data Stream.
      • Assign a name to the stream.
      • Apply a shard value of 1.
      • Click on Create Kinesis Stream.
      • A status page with stream details will be displayed. Once the status is active, the stream is ready to use. Be sure to record the streamName and StreamARN for later use.

      Screenshot of AWS console page for creating a Kinesis stream

    2. Create a Rule

    3. Send a message

      • Send a message that conforms to your schema using Python or the command line (see the example after these steps).
      • The streamName and Collection must match the kinesisArn+collection defined in the rule that you have created in Step 2.
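
    A minimal command-line sketch using AWS CLI v2 is shown below; cnm-message.json is a placeholder file containing a message that conforms to the CNM schema and references the collection from your rule:

    # AWS CLI v2: --cli-binary-format lets the CLI accept the raw JSON payload loaded via file://
    aws kinesis put-record \
    --cli-binary-format raw-in-base64-out \
    --stream-name "<your-stream-name>" \
    --partition-key "<any-partition-key>" \
    --data file://cnm-message.json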
    - + \ No newline at end of file diff --git a/docs/v14.1.0/operator-docs/locating-access-logs/index.html b/docs/v14.1.0/operator-docs/locating-access-logs/index.html index 29225406098..81f25c88a79 100644 --- a/docs/v14.1.0/operator-docs/locating-access-logs/index.html +++ b/docs/v14.1.0/operator-docs/locating-access-logs/index.html @@ -5,13 +5,13 @@ Locating S3 Access Logs | Cumulus Documentation - +
    Version: v14.1.0

    Locating S3 Access Logs

When enabling S3 Access Logs for EMS Reporting, you configured a TargetBucket and TargetPrefix. The raw S3 access logs can be found inside the TargetBucket at the TargetPrefix.

    In a standard deployment, this will be your stack's <internal bucket name> and a key prefix of <stack>/ems-distribution/s3-server-access-logs/

    - + \ No newline at end of file diff --git a/docs/v14.1.0/operator-docs/naming-executions/index.html b/docs/v14.1.0/operator-docs/naming-executions/index.html index ffce9a8ee33..0ac42b809a1 100644 --- a/docs/v14.1.0/operator-docs/naming-executions/index.html +++ b/docs/v14.1.0/operator-docs/naming-executions/index.html @@ -5,7 +5,7 @@ Naming Executions | Cumulus Documentation - + @@ -21,7 +21,7 @@ QueuePdrs step.

    In the following excerpt, the QueueGranules config.executionNamePrefix property is set using the value configured in the workflow's meta.executionNamePrefix.

    Please note: This meta.executionNamePrefix property should not be confused with the optional rule executionNamePrefix property from the previous section. Setting executionNamePrefix as a root property of the rule will set a prefix for the names of any workflows triggered by the rule. Setting meta.executionNamePrefix on the rule will set meta.executionNamePrefix in the workflow messages generated for this rule, allowing workflow steps like QueueGranules to read from the message meta.executionNamePrefix for their config. Then, workflows scheduled by QueueGranules would use the configured execution name prefix.

    Setting executionNamePrefix config for QueueGranules using rule.meta

    If you wanted to use a prefix of "my-prefix", you would create a rule with a meta property similar to the following Rule snippet:

{
  ...other rule keys here...
  "meta": {
    "executionNamePrefix": "my-prefix"
  }
}

    The value of meta.executionNamePrefix from the rule will be set as meta.executionNamePrefix in the workflow message.
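Conceptually, the workflow message generated for this rule would then carry that value in its meta (other message properties omitted here):

{
  "meta": {
    "executionNamePrefix": "my-prefix"
  }
}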

    Then, the workflow could contain a "QueueGranules" step with the following state, which uses meta.executionNamePrefix from the message as the value for the executionNamePrefix config to the "QueueGranules" step:

{
  "QueueGranules": {
    "Parameters": {
      "cma": {
        "event.$": "$",
        "ReplaceConfig": {
          "FullMessage": true
        },
        "task_config": {
          "queueUrl": "${start_sf_queue_url}",
          "provider": "{$.meta.provider}",
          "internalBucket": "{$.meta.buckets.internal.name}",
          "stackName": "{$.meta.stack}",
          "granuleIngestWorkflow": "${ingest_granule_workflow_name}",
          "executionNamePrefix": "{$.meta.executionNamePrefix}"
        }
      }
    },
    "Type": "Task",
    "Resource": "${queue_granules_task_arn}",
    "Retry": [
      {
        "ErrorEquals": [
          "Lambda.ServiceException",
          "Lambda.AWSLambdaException",
          "Lambda.SdkClientException"
        ],
        "IntervalSeconds": 2,
        "MaxAttempts": 6,
        "BackoffRate": 2
      }
    ],
    "Catch": [
      {
        "ErrorEquals": [
          "States.ALL"
        ],
        "ResultPath": "$.exception",
        "Next": "WorkflowFailed"
      }
    ],
    "End": true
  }
}
    - + \ No newline at end of file diff --git a/docs/v14.1.0/operator-docs/ops-common-use-cases/index.html b/docs/v14.1.0/operator-docs/ops-common-use-cases/index.html index c126858ef41..85df4181bb7 100644 --- a/docs/v14.1.0/operator-docs/ops-common-use-cases/index.html +++ b/docs/v14.1.0/operator-docs/ops-common-use-cases/index.html @@ -5,13 +5,13 @@ Operator Common Use Cases | Cumulus Documentation - + - + \ No newline at end of file diff --git a/docs/v14.1.0/operator-docs/trigger-workflow/index.html b/docs/v14.1.0/operator-docs/trigger-workflow/index.html index 25fc74c309d..f178cafbf70 100644 --- a/docs/v14.1.0/operator-docs/trigger-workflow/index.html +++ b/docs/v14.1.0/operator-docs/trigger-workflow/index.html @@ -5,13 +5,13 @@ Trigger a Workflow Execution | Cumulus Documentation - +
    Version: v14.1.0

    Trigger a Workflow Execution

    To trigger a workflow, you need to create a rule. To trigger an ingest workflow, one that requires discovering and ingesting data, you will also need to configure the collection and provider and associate those to a rule.

    Trigger a HelloWorld Workflow

    To trigger a HelloWorld workflow that does not need to discover or archive data, you just need to create a rule.

    You can leave the provider and collection blank and do not need any additional metadata. If you create a onetime rule, the workflow execution will start momentarily and you can view its status on the Executions page.
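As a sketch, a minimal onetime rule for this case could look like the following; the rule name and workflow name are placeholders (use the name of your deployed HelloWorld workflow), and the full set of required fields should be checked against the rule schema:

{
  "name": "helloworld_rule",
  "workflow": "HelloWorldWorkflow",
  "rule": {
    "type": "onetime"
  },
  "state": "ENABLED"
}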

    Trigger an Ingest Workflow

    To ingest data, you will need a provider and collection configured to tell your workflow where to discover data and where to archive the data respectively.

    Follow the instructions to create a provider and create a collection and configure their fields for your data ingest.

In the rule's additional metadata, you can specify a provider_path that tells the workflow where on the provider to discover data.

    Example: Ingest data from S3

    Setup

    Assume there are 2 files to be ingested in an S3 bucket called discovery-bucket, located in the test-data folder:

    • GRANULE.A2017025.jpg
    • GRANULE.A2017025.hdf

    Archive buckets should already be created and mapped to public / private / protected in the Cumulus deployment.

    For example:

buckets = {
  private = {
    name = "discovery-bucket"
    type = "private"
  },
  protected = {
    name = "archive-protected"
    type = "protected"
  }
  public = {
    name = "archive-public"
    type = "public"
  }
}

    Create a provider

    Create a new provider. Set protocol to S3 and Host to discovery-bucket.

    Screenshot of adding a sample S3 provider
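If you create the provider via the Cumulus API instead of the dashboard form, the provider object would look roughly like this (the id is arbitrary; confirm the field names against the provider schema):

{
  "id": "s3_provider",
  "protocol": "s3",
  "host": "discovery-bucket"
}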

    Create a collection

    Create a new collection. Configure the collection to extract the granule id from the filenames and configure where to store the granule files.

The configuration below will store hdf files in the protected bucket and jpg files in the public bucket. The bucket types map to the bucket names defined in the buckets configuration shown above.

{
  "name": "test-collection",
  "version": "001",
  "granuleId": "^GRANULE\\.A[\\d]{7}$",
  "granuleIdExtraction": "(GRANULE\\..*)(\\.hdf|\\.jpg)",
  "reportToEms": false,
  "sampleFileName": "GRANULE.A2017025.hdf",
  "files": [
    {
      "bucket": "protected",
      "regex": "^GRANULE\\.A[\\d]{7}\\.hdf$",
      "sampleFileName": "GRANULE.A2017025.hdf"
    },
    {
      "bucket": "public",
      "regex": "^GRANULE\\.A[\\d]{7}\\.jpg$",
      "sampleFileName": "GRANULE.A2017025.jpg"
    }
  ]
}

    Create a rule

    Create a rule to trigger the workflow to discover your granule data and ingest your granule.

    Select the previously created provider and collection. See the Cumulus Discover Granules workflow for a workflow example of using Cumulus tasks to discover and queue data for ingest.

    In the rule meta, set the provider_path to test-data, so the test-data folder will be used to discover new granules.

    Screenshot of adding a Discover Granules rule
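Putting the pieces together, a onetime rule for this example might look like the following sketch. The rule name is arbitrary, the workflow name is a placeholder for your deployed Discover Granules workflow, and the provider id assumes the provider above was saved as s3_provider; verify required fields against the rule schema:

{
  "name": "ingest_test_collection",
  "workflow": "DiscoverGranules",
  "provider": "s3_provider",
  "collection": {
    "name": "test-collection",
    "version": "001"
  },
  "rule": {
    "type": "onetime"
  },
  "meta": {
    "provider_path": "test-data"
  },
  "state": "ENABLED"
}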

    A onetime rule will run your workflow on-demand and you can view it on the dashboard Executions page. The Cumulus Discover Granules workflow will trigger an ingest workflow and your ingested granules will be visible on the dashboard Granules page.

    - + \ No newline at end of file diff --git a/docs/v14.1.0/tasks/index.html b/docs/v14.1.0/tasks/index.html index c903380198e..83a8002398e 100644 --- a/docs/v14.1.0/tasks/index.html +++ b/docs/v14.1.0/tasks/index.html @@ -5,13 +5,13 @@ Cumulus Tasks | Cumulus Documentation - +
    Version: v14.1.0

    Cumulus Tasks

    A list of reusable Cumulus tasks. Add your own.

    Tasks

    @cumulus/add-missing-file-checksums

    Add checksums to files in S3 which don't have one


    @cumulus/discover-granules

    Discover Granules in FTP/HTTP/HTTPS/SFTP/S3 endpoints


    @cumulus/discover-pdrs

    Discover PDRs in FTP and HTTP endpoints


    @cumulus/files-to-granules

    Converts array-of-files input into a granules object by extracting granuleId from filename


    @cumulus/hello-world

    Example task


    @cumulus/hyrax-metadata-updates

    Update granule metadata with hooks to OPeNDAP URL


    @cumulus/lzards-backup

    Run LZARDS backup


    @cumulus/move-granules

    Move granule files from staging to final location


    @cumulus/parse-pdr

    Download and Parse a given PDR


    @cumulus/pdr-status-check

    Checks execution status of granules in a PDR


    @cumulus/post-to-cmr

    Post a given granule to CMR


    @cumulus/queue-granules

    Add discovered granules to the queue


    @cumulus/queue-pdrs

    Add discovered PDRs to a queue


    @cumulus/queue-workflow

    Add workflow to the queue


    @cumulus/sf-sqs-report

    Sends an incoming Cumulus message to SQS


    @cumulus/sync-granule

    Download a given granule


    @cumulus/test-processing

    Fake processing task used for integration tests


    @cumulus/update-cmr-access-constraints

    Updates CMR metadata to set access constraints


@cumulus/update-granules-cmr-metadata-file-links

Update CMR metadata files with correct online access urls and etags and transfer etag info to granules' CMR files

    - + \ No newline at end of file diff --git a/docs/v14.1.0/team/index.html b/docs/v14.1.0/team/index.html index 09bcfe2f8c2..9ab88fb4370 100644 --- a/docs/v14.1.0/team/index.html +++ b/docs/v14.1.0/team/index.html @@ -5,13 +5,13 @@ Cumulus Team | Cumulus Documentation - +
    Version: v14.1.0

    Cumulus Team

    Cumulus Core Team

    Cumulus Emeritus Team

    - + \ No newline at end of file diff --git a/docs/v14.1.0/troubleshooting/index.html b/docs/v14.1.0/troubleshooting/index.html index 898af728d1b..830b907304d 100644 --- a/docs/v14.1.0/troubleshooting/index.html +++ b/docs/v14.1.0/troubleshooting/index.html @@ -5,14 +5,14 @@ How to Troubleshoot and Fix Issues | Cumulus Documentation - +
    Version: v14.1.0

    How to Troubleshoot and Fix Issues

    While Cumulus is a complex system, there is a focus on maintaining the integrity and availability of the system and data. Should you encounter errors or issues while using this system, this section will help troubleshoot and solve those issues.

    Backup and Restore

    Cumulus has backup and restore functionality built-in to protect Cumulus data and allow recovery of a Cumulus stack. This is currently limited to Cumulus data and not full S3 archive data. Backup and restore is not enabled by default and must be enabled and configured to take advantage of this feature.

    For more information, read the Backup and Restore documentation.

    Elasticsearch reindexing

    If you run into issues with your Elasticsearch index, a reindex operation is available via the Cumulus API. See the Reindexing Guide.

    Information on how to reindex Elasticsearch is in the Cumulus API documentation.

    Troubleshooting Workflows

Workflows are state machines composed of tasks and services, and each component logs to CloudWatch. The CloudWatch logs for all steps in the execution are displayed in the Cumulus dashboard, or you can find them by going to CloudWatch and navigating to the logs for that particular task.

    Workflow Errors

    Visual representations of executed workflows can be found in the Cumulus dashboard or the AWS Step Functions console for that particular execution.

    If a workflow errors, the error will be handled according to the error handling configuration. The task that fails will have the exception field populated in the output, giving information about the error. Further information can be found in the CloudWatch logs for the task.

    Graph of AWS Step Function execution showing a failing workflow

    Workflow Did Not Start

    Generally, first check your rule configuration. If that is satisfactory, the answer will likely be in the CloudWatch logs for the schedule SF or SF starter lambda functions. See the workflow triggers page for more information on how workflows start.

    For Kinesis and SNS rules specifically, if an error occurs during the message consumer process, the fallback consumer lambda will be called and if the message continues to error, a message will be placed on the dead letter queue. Check the dead letter queue for a failure message. Errors can be traced back to the CloudWatch logs for the message consumer and the fallback consumer. Additionally, check that the name and version match those configured in your rule, as rules are filtered by the notification's collection name and version before scheduling executions.

    More information on kinesis error handling is here.

    Operator API Errors

    All operator API calls are funneled through the ApiEndpoints lambda. Each API call is logged to the ApiEndpoints CloudWatch log for your deployment.

    Lambda Errors

    KMS Exception: AccessDeniedException

KMS Exception: AccessDeniedException
KMS Message: The ciphertext refers to a customer master key that does not exist, does not exist in this region, or you are not allowed to access.

The above error was being thrown by a Cumulus lambda function invocation. The KMS key is the encryption key used to encrypt lambda environment variables. The root cause of this error is unknown, but it is speculated to be caused by deleting and recreating, with the same name, the IAM role the lambda uses.

    This error can be resolved by switching the lambda's execution role to a different one and then back through the Lambda management console. Unfortunately, this approach doesn't scale well.

The other known resolution, which scales but takes some time, is as follows:

1. Comment out all lambda definitions (and dependent resources) in your Terraform configuration.
2. Run terraform apply to delete the lambdas.
3. Un-comment the definitions.
4. Run terraform apply to recreate the lambdas.

If this problem occurs with Core lambdas and you are using the terraform-aws-cumulus.zip file source distributed in our release, we recommend using the non-scaling approach, as the number of lambdas we distribute is in the low teens and they are likely to be easier and faster to reconfigure one-by-one than by editing our configs.

    Error: Unable to import module 'index': Error

    This error is shown in the CloudWatch logs for a Lambda function.

One possible cause is that the Lambda definition in the .tf file defining the lambda is not pointing to the correct packaged lambda source file. To resolve this issue, update the lambda definition to point directly to the packaged (e.g. .zip) lambda source file, as in the following example:

    resource "aws_lambda_function" "discover_granules_task" {
    function_name = "${var.prefix}-DiscoverGranules"
    filename = "${path.module}/../../tasks/discover-granules/dist/lambda.zip"
    handler = "index.handler"
    }

    If you are seeing this error when using the Lambda as a step in a Cumulus workflow, then inspect the output for this Lambda step in the AWS Step Function console. If you see the error Cannot find module 'node_modules/@cumulus/cumulus-message-adapter-js', then you need to ensure the lambda's packaged dependencies include cumulus-message-adapter-js.
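In practice, that usually means confirming that the Lambda's package.json declares the adapter as a dependency before the Lambda is packaged; a minimal sketch (the version constraint shown is illustrative):

{
  "dependencies": {
    "@cumulus/cumulus-message-adapter-js": "^2.0.0"
  }
}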

    - + \ No newline at end of file diff --git a/docs/v14.1.0/troubleshooting/reindex-elasticsearch/index.html b/docs/v14.1.0/troubleshooting/reindex-elasticsearch/index.html index 50ae0503c78..4542d5b3a6b 100644 --- a/docs/v14.1.0/troubleshooting/reindex-elasticsearch/index.html +++ b/docs/v14.1.0/troubleshooting/reindex-elasticsearch/index.html @@ -5,7 +5,7 @@ Reindexing Elasticsearch Guide | Cumulus Documentation - + @@ -14,7 +14,7 @@ current index, or the mappings for an index have been updated (they do not update automatically). Any reindexing that will be required when upgrading Cumulus will be in the Migration Steps section of the changelog.

    Switch to a new index and Reindex

There are two operations needed: a reindex, and a change-index to switch over to the new index. The Change Index and Reindex operations can be done in either order, but both orders have their trade-offs.

If you decide to point Cumulus to a new (empty) index first (with a change index operation), and then reindex the data to the new index, data ingested while reindexing will automatically be sent to the new index. As reindexing operations can take a while, not all the data will show up on the Cumulus Dashboard right away. The advantage is that you do not have to turn off any ingest operations. This approach is recommended.

If you decide to reindex data to a new index first, and then point Cumulus to that new index, it is not guaranteed that data sent to the old index while reindexing will show up in the new index. If you prefer this order, it is recommended to turn off any ingest operations. This order will keep your dashboard data from seeing any interruption.

    Change Index

    This will point Cumulus to the index in Elasticsearch that will be used when retrieving data. Performing a change index operation to an index that does not exist yet will create the index for you. The change index operation can be found here.

    Reindex from the old index to the new index

The reindex operation will take the data from one index and copy it into another index. The reindex operation can be found here.

    Reindex status

    Reindexing is a long-running operation. The reindex-status endpoint can be used to monitor the progress of the operation.

    Index from database

    If you want to just grab the data straight from the database you can perform an Index from Database Operation. After the data is indexed from the database, a Change Index operation will need to be performed to ensure Cumulus is pointing to the right index. It is strongly recommended to turn off workflow rules when performing this operation so any data ingested to the database is not lost.

    Validate reindex

To validate the reindex, use the reindex-status endpoint. The doc count can be used to verify that the reindex was successful. In the example below, the reindex from cumulus-2020-11-3 to cumulus-2021-3-4 was not fully successful, as the two indices show different doc counts.

    "indices": {
    "cumulus-2020-11-3": {
    "primaries": {
    "docs": {
    "count": 21096512,
    "deleted": 176895
    }
    },
    "total": {
    "docs": {
    "count": 21096512,
    "deleted": 176895
    }
    }
    },
    "cumulus-2021-3-4": {
    "primaries": {
    "docs": {
    "count": 715949,
    "deleted": 140191
    }
    },
    "total": {
    "docs": {
    "count": 715949,
    "deleted": 140191
    }
    }
    }
    }

    To further drill down into what is missing, log in to the Kibana instance (found in the Elasticsearch section of the AWS console) and run the following command replacing <index> with your index name.

GET <index>/_search
{
  "aggs": {
    "count_by_type": {
      "terms": {
        "field": "_type"
      }
    }
  },
  "size": 0
}

    which will produce a result like

    "aggregations": {
    "count_by_type": {
    "doc_count_error_upper_bound": 0,
    "sum_other_doc_count": 0,
    "buckets": [
    {
    "key": "logs",
    "doc_count": 483955
    },
    {
    "key": "execution",
    "doc_count": 4966
    },
    {
    "key": "deletedgranule",
    "doc_count": 4715
    },
    {
    "key": "pdr",
    "doc_count": 1822
    },
    {
    "key": "granule",
    "doc_count": 740
    },
    {
    "key": "asyncOperation",
    "doc_count": 616
    },
    {
    "key": "provider",
    "doc_count": 108
    },
    {
    "key": "collection",
    "doc_count": 87
    },
    {
    "key": "reconciliationReport",
    "doc_count": 48
    },
    {
    "key": "rule",
    "doc_count": 7
    }
    ]
    }
    }

    Resuming a reindex

If a reindex operation did not fully complete, it can be resumed using the following command, run from the Kibana instance.

POST _reindex?wait_for_completion=false
{
  "conflicts": "proceed",
  "source": {
    "index": "cumulus-2020-11-3"
  },
  "dest": {
    "index": "cumulus-2021-3-4",
    "op_type": "create"
  }
}

    The Cumulus API reindex-status endpoint can be used to monitor completion of this operation.

    - + \ No newline at end of file diff --git a/docs/v14.1.0/troubleshooting/rerunning-workflow-executions/index.html b/docs/v14.1.0/troubleshooting/rerunning-workflow-executions/index.html index 372bd9fd4f8..bb150dc972c 100644 --- a/docs/v14.1.0/troubleshooting/rerunning-workflow-executions/index.html +++ b/docs/v14.1.0/troubleshooting/rerunning-workflow-executions/index.html @@ -5,13 +5,13 @@ Re-running workflow executions | Cumulus Documentation - +
    Version: v14.1.0

    Re-running workflow executions

    To re-run a Cumulus workflow execution from the AWS console:

    1. Visit the page for an individual workflow execution

    2. Click the "New execution" button at the top right of the screen

      Screenshot of the AWS console for a Step Function execution highlighting the &quot;New execution&quot; button at the top right of the screen

3. In the "New execution" modal that appears, replace the cumulus_meta.execution_name value in the default input with the value of the new execution ID, as seen in the screenshot below and in the example input after these steps

      Screenshot of the AWS console showing the modal window for entering input when running a new Step Function execution

    4. Click the "Start execution" button
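For reference, only the cumulus_meta.execution_name value in the default input needs to change, roughly as follows (the execution ID shown is a placeholder and the rest of the input is left untouched):

{
  "cumulus_meta": {
    ...other cumulus_meta properties...
    "execution_name": "<new-execution-id>"
  },
  ...rest of the original input...
}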

    - + \ No newline at end of file diff --git a/docs/v14.1.0/troubleshooting/troubleshooting-deployment/index.html b/docs/v14.1.0/troubleshooting/troubleshooting-deployment/index.html index ead50ece554..0ec0ca107f8 100644 --- a/docs/v14.1.0/troubleshooting/troubleshooting-deployment/index.html +++ b/docs/v14.1.0/troubleshooting/troubleshooting-deployment/index.html @@ -5,7 +5,7 @@ Troubleshooting Deployment | Cumulus Documentation - + @@ -16,7 +16,7 @@ data-persistence modules, but your config is only creating one Elasticsearch instance. To fix the issue, update the elasticsearch_config variable for your data-persistence module to increase the number of instances:

{
  domain_name    = "es"
  instance_count = 2
  instance_type  = "t2.small.elasticsearch"
  version        = "5.3"
  volume_size    = 10
}

    Install dashboard

    Dashboard configuration

    Issues:

• "Problem clearing the cache: EACCES: permission denied, rmdir '/tmp/gulp-cache/default'": this probably means the files at that location, and/or the folder, are owned by someone else (or some other factor prevents you from writing there).

It's possible to work around this by editing the file cumulus-dashboard/node_modules/gulp-cache/index.js and altering the value of the line var fileCache = new Cache({cacheDirName: 'gulp-cache'}); to something like var fileCache = new Cache({cacheDirName: '<prefix>-cache'});. Now gulp-cache will be able to write to /tmp/<prefix>-cache/default, and the error should resolve.

    Dashboard deployment

    Issues:

    • If the dashboard sends you to an Earthdata Login page that has an error reading "Invalid request, please verify the client status or redirect_uri before resubmitting", this means you've either forgotten to update one or more of your EARTHDATA_CLIENT_ID, EARTHDATA_CLIENT_PASSWORD environment variables (from your app/.env file) and re-deploy Cumulus, or you haven't placed the correct values in them, or you've forgotten to add both the "redirect" and "token" URL to the Earthdata Application.
    • There is odd caching behavior associated with the dashboard and Earthdata Login at this point in time that can cause the above error to reappear on the Earthdata Login page loaded by the dashboard even after fixing the cause of the error. If you experience this, attempt to access the dashboard in a new browser window, and it should work.
    - + \ No newline at end of file diff --git a/docs/v14.1.0/upgrade-notes/cumulus_distribution_migration/index.html b/docs/v14.1.0/upgrade-notes/cumulus_distribution_migration/index.html index a8b5f1dcd6b..71fbb447d10 100644 --- a/docs/v14.1.0/upgrade-notes/cumulus_distribution_migration/index.html +++ b/docs/v14.1.0/upgrade-notes/cumulus_distribution_migration/index.html @@ -5,14 +5,14 @@ Migrate from TEA deployment to Cumulus Distribution | Cumulus Documentation - +
    Version: v14.1.0

    Migrate from TEA deployment to Cumulus Distribution

    Background

    The Cumulus Distribution API is configured to use the AWS Cognito OAuth client. This API can be used instead of the Thin Egress App, which is the default distribution API if using the Deployment Template.

    Configuring a Cumulus Distribution deployment

    See these instructions for deploying the Cumulus Distribution API.

    Important note if migrating from TEA to Cumulus Distribution

    If you already have a deployment using the TEA distribution and want to switch to Cumulus Distribution, there will be an API Gateway change. This means that there will be downtime while you update your CloudFront endpoint to use the new API gateway.

    - + \ No newline at end of file diff --git a/docs/v14.1.0/upgrade-notes/migrate_tea_standalone/index.html b/docs/v14.1.0/upgrade-notes/migrate_tea_standalone/index.html index dd65d9fbd48..8c7bd72cb15 100644 --- a/docs/v14.1.0/upgrade-notes/migrate_tea_standalone/index.html +++ b/docs/v14.1.0/upgrade-notes/migrate_tea_standalone/index.html @@ -5,13 +5,13 @@ Migrate TEA deployment to standalone module | Cumulus Documentation - +
    Version: v14.1.0

    Migrate TEA deployment to standalone module

    Background

    This document is only relevant for upgrades of Cumulus from versions < 3.x.x to versions > 3.x.x

Previous versions of Cumulus included deployment of the Thin Egress App (TEA) by default in the distribution module. As a result, Cumulus users who wanted to deploy a new version of TEA had to wait on a new release of Cumulus that incorporated that release.

In order to give Cumulus users the flexibility to deploy newer versions of TEA whenever they want, deployment of TEA has been removed from the distribution module and Cumulus users must now add the TEA module to their deployment. Guidance on integrating the TEA module into your deployment is provided, or you can refer to the Cumulus core example deployment code for the thin_egress_app module.

    By default, when upgrading Cumulus and moving from TEA deployed via the distribution module to deployed as a separate module, your API gateway for TEA would be destroyed and re-created, which could cause outages for any Cloudfront endpoints pointing at that API gateway.

    These instructions outline how to modify your state to preserve your existing Thin Egress App (TEA) API gateway when upgrading Cumulus and moving deployment of TEA to a standalone module. If you do not care about preserving your API gateway for TEA when upgrading your Cumulus deployment, you can skip these instructions.

    Prerequisites

    Notes about state management

    These instructions will involve manipulating your Terraform state via terraform state mv commands. These operations are extremely dangerous, since a mistake in editing your Terraform state can leave your stack in a corrupted state where deployment may be impossible or may result in unanticipated resource deletion.

    Since bucket versioning preserves a separate version of your state file each time it is written, and the Terraform state modification commands overwrite the state file, we can mitigate the risk of these operations by downloading the most recent state file before starting the upgrade process. Then, if anything goes wrong during the upgrade, we can restore that previous state version. Guidance on how to perform both operations is provided below.

    Download your most recent state version

    Run this command to download the most recent cumulus deployment state file, replacing BUCKET and KEY with the correct values from cumulus-tf/terraform.tf:

     aws s3 cp s3://BUCKET/KEY /path/to/terraform.tfstate

    Restore a previous state version

    Upload the state file that was previously downloaded to the bucket/key for your state file, replacing BUCKET and KEY with the correct values from cumulus-tf/terraform.tf:

     aws s3 cp /path/to/terraform.tfstate s3://BUCKET/KEY

    Then run terraform plan, which will give an error because we manually overwrote the state file and it is now out of sync with the lock table Terraform uses to track your state file:

    Error: Error loading state: state data in S3 does not have the expected content.

    This may be caused by unusually long delays in S3 processing a previous state
    update. Please wait for a minute or two and try again. If this problem
    persists, and neither S3 nor DynamoDB are experiencing an outage, you may need
    to manually verify the remote state and update the Digest value stored in the
    DynamoDB table to the following value: <some-digest-value>

    To resolve this error, run this command and replace DYNAMO_LOCK_TABLE, BUCKET and KEY with the correct values from cumulus-tf/terraform.tf, and use the digest value from the previous error output:

aws dynamodb put-item \
  --table-name DYNAMO_LOCK_TABLE \
  --item '{
    "LockID": {"S": "BUCKET/KEY-md5"},
    "Digest": {"S": "some-digest-value"}
  }'

    Now, if you re-run terraform plan, it should work as expected.

    Migration instructions

    Please note: These instructions assume that you are deploying the thin_egress_app module as shown in the Cumulus core example deployment code

    1. Ensure that you have downloaded the latest version of your state file for your cumulus deployment

    2. Find the URL for your <prefix>-thin-egress-app-EgressGateway API gateway. Confirm that you can access it in the browser and that it is functional.

    3. Run terraform plan. You should see output like (edited for readability):

      # module.thin_egress_app.aws_cloudformation_stack.thin_egress_app will be created
      + resource "aws_cloudformation_stack" "thin_egress_app" {

      # module.thin_egress_app.aws_s3_bucket.lambda_source will be created
      + resource "aws_s3_bucket" "lambda_source" {

      # module.thin_egress_app.aws_s3_bucket_object.cloudformation_template will be created
      + resource "aws_s3_bucket_object" "cloudformation_template" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive will be created
      + resource "aws_s3_bucket_object" "lambda_code_dependency_archive" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_source will be created
      + resource "aws_s3_bucket_object" "lambda_source" {

      # module.thin_egress_app.aws_security_group.egress_lambda[0] will be created
      + resource "aws_security_group" "egress_lambda" {

      ...

      # module.cumulus.module.distribution.module.thin_egress_app.aws_cloudformation_stack.thin_egress_app will be destroyed
      - resource "aws_cloudformation_stack" "thin_egress_app" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket.lambda_source will be destroyed
      - resource "aws_s3_bucket" "lambda_source" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.cloudformation_template will be destroyed
      - resource "aws_s3_bucket_object" "cloudformation_template" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive will be destroyed
      - resource "aws_s3_bucket_object" "lambda_code_dependency_archive" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_source will be destroyed
      - resource "aws_s3_bucket_object" "lambda_source" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_security_group.egress_lambda[0] will be destroyed
      - resource "aws_security_group" "egress_lambda" {
    4. Run the state modification commands. The commands must be run in exactly this order:

       # Move security group
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_security_group.egress_lambda module.thin_egress_app.aws_security_group.egress_lambda

      # Move TEA storage bucket
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket.lambda_source module.thin_egress_app.aws_s3_bucket.lambda_source

      # Move TEA lambda source code
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_source module.thin_egress_app.aws_s3_bucket_object.lambda_source

      # Move TEA lambda dependency code
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive

      # Move TEA Cloudformation template
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.cloudformation_template module.thin_egress_app.aws_s3_bucket_object.cloudformation_template

      # Move URS creds secret version
      terraform state mv module.cumulus.module.distribution.aws_secretsmanager_secret_version.thin_egress_urs_creds aws_secretsmanager_secret_version.thin_egress_urs_creds

      # Move URS creds secret
      terraform state mv module.cumulus.module.distribution.aws_secretsmanager_secret.thin_egress_urs_creds aws_secretsmanager_secret.thin_egress_urs_creds

      # Move TEA Cloudformation stack
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_cloudformation_stack.thin_egress_app module.thin_egress_app.aws_cloudformation_stack.thin_egress_app

      Depending on how you were supplying a bucket map to TEA, there may be an additional step. If you were specifying the bucket_map_key variable to the cumulus module to use a custom bucket map, then you can ignore this step and just ensure that the bucket_map_file variable to the TEA module uses that same S3 key. Otherwise, if you were letting Cumulus generate a bucket map for you, then you need to take this step to migrate that bucket map:

      # Move bucket map
      terraform state mv module.cumulus.module.distribution.aws_s3_bucket_object.bucket_map_yaml[0] aws_s3_bucket_object.bucket_map_yaml
    5. Run terraform plan again. You may still see a few additions/modifications pending like below, but you should not see any deletion of Thin Egress App resources pending:

      # module.thin_egress_app.aws_cloudformation_stack.thin_egress_app will be updated in-place
      ~ resource "aws_cloudformation_stack" "thin_egress_app" {

      # module.thin_egress_app.aws_s3_bucket_object.cloudformation_template will be updated in-place
      ~ resource "aws_s3_bucket_object" "cloudformation_template" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive will be updated in-place
      ~ resource "aws_s3_bucket_object" "lambda_code_dependency_archive" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_source will be updated in-place
      ~ resource "aws_s3_bucket_object" "lambda_source" {

      If you still see deletion of module.cumulus.module.distribution.module.thin_egress_app.aws_cloudformation_stack.thin_egress_app pending, then something went wrong and you should restore the previously downloaded state file version and start over from step 1. Otherwise, proceed to step 6.

    6. Once you have confirmed that everything looks as expected, run terraform apply.

    7. Visit the same API gateway from step 1 and confirm that it still works.

    Your TEA deployment has now been migrated to a standalone module, which gives you the ability to upgrade the deployed version of TEA independently of Cumulus releases.

    - + \ No newline at end of file diff --git a/docs/v14.1.0/upgrade-notes/update-cma-2.0.2/index.html b/docs/v14.1.0/upgrade-notes/update-cma-2.0.2/index.html index eb6db3a4c2d..24cc2bc50e1 100644 --- a/docs/v14.1.0/upgrade-notes/update-cma-2.0.2/index.html +++ b/docs/v14.1.0/upgrade-notes/update-cma-2.0.2/index.html @@ -5,13 +5,13 @@ Upgrade to CMA 2.0.2 | Cumulus Documentation - +
    Version: v14.1.0

    Upgrade to CMA 2.0.2

    Updating a Cumulus Deployment to CMA 2.0.2

    Background

The Cumulus Message Adapter has been updated in release 2.0.2 to no longer utilize the AWS Step Functions API to look up the defined name of a step function task for population in meta.workflow_tasks, but to instead use an incrementing integer field.

Additionally, a bugfix was released in the form of v2.0.1/v2.0.2 following the initial 2.0.0 release, so all users should update to release 2.0.2.

The update is not tied to a particular version of Core; however, the update should be done across all task components in order to ensure consistent execution records.

    Changes

    Execution Record Update

This update functionally means that Cumulus tasks/activities using the CMA will now record an entry that looks like the following in meta.workflow_tasks, and more importantly in the tasks column for an execution record:

    Original

          "DiscoverGranules": {
    "name": "jk-tf-DiscoverGranules",
    "version": "$LATEST",
    "arn": "arn:aws:lambda:us-east-1:xxxxx:function:jk-tf-DiscoverGranules"
    },
    "QueueGranules": {
    "name": "jk-tf-QueueGranules",
    "version": "$LATEST",
    "arn": "arn:aws:lambda:us-east-1:xxxx:function:jk-tf-QueueGranules"
    }

    New

          "0": {
    "name": "jk-tf-DiscoverGranules",
    "version": "$LATEST",
    "arn": "arn:aws:lambda:us-east-1:xxxxx:function:jk-tf-DiscoverGranules"
    },
    "1": {
    "name": "jk-tf-QueueGranules",
    "version": "$LATEST",
    "arn": "arn:aws:lambda:us-east-1:xxxx:function:jk-tf-QueueGranules"
    }

    Actions Required

    The following should be done as part of a Cumulus stack update to utilize cumulus message adapter > 2.0.2:

    • Python tasks that utilize cumulus-message-adapter-python should be updated to use > 2.0.0, their lambdas rebuilt and Cumulus workflows reconfigured to use the updated version.

    • Python activities that utilize cumulus-process-py should be rebuilt using > 1.0.0 with updated dependencies, and have their images deployed/Cumulus configured to use the new version.

    • The cumulus-message-adapter v2.0.2 lambda layer should be made available in the deployment account, and the Cumulus deployment should be reconfigured to use it (via the cumulus_message_adapter_lambda_layer_version_arn variable in the cumulus module). This should address all Core node.js tasks that utilize the CMA, and many contributed node.js/JAVA components.

    Once the above have been done, redeploy Cumulus to apply the configuration and the updates should be live.

    - + \ No newline at end of file diff --git a/docs/v14.1.0/upgrade-notes/update-task-file-schemas/index.html b/docs/v14.1.0/upgrade-notes/update-task-file-schemas/index.html index 82ebc47e292..43c7553dfba 100644 --- a/docs/v14.1.0/upgrade-notes/update-task-file-schemas/index.html +++ b/docs/v14.1.0/upgrade-notes/update-task-file-schemas/index.html @@ -5,13 +5,13 @@ Updates to task granule file schemas | Cumulus Documentation - +
    Version: v14.1.0

    Updates to task granule file schemas

    Background

    Most Cumulus workflow tasks expect as input a payload of granule(s) which contain the files for each granule. Most tasks also return this same granule structure as output.

    However, up to this point, there was inconsistency in the schemas for the granule files objects expected by each task. Furthermore, there was no guarantee of consistency between granule files objects as stored in the database and the expectations of any given workflow task.

    Thus, when performing bulk granule operations which pass granules from the database into a Cumulus workflow, it was possible for there to be schema validation failures depending on which task was used to start the workflow and its particular schema.

    In order to rectify this situation, CUMULUS-2388 was filed and addressed to create a common granule files schema between nearly all of the Cumulus tasks (exceptions discussed below) and the Cumulus database. The following documentation explains the manual changes you need to make to your deployment in order to be compatible with the updated files schema.

    Updated files schema

    The updated granule files schema can be found here.

    These former properties were deprecated (with notes about how to derive the same information from the updated schema, if possible):

    • filename - concatenate the bucket and key values with a directory separator (/)
    • name - use fileName property
    • etag - ETags are no longer provided as an individual file property. Instead, a separate etags object mapping S3 URIs to ETag values is provided as output from the following workflow tasks (guidance on how to integrate this output with your workflows is provided in the Upgrading your workflows section below):
      • update-granules-cmr-metadata-file-links
      • hyrax-metadata-updates
    • fileStagingDir - no longer supported
    • url_path - no longer supported
    • duplicate_found - This property is no longer supported, however sync-granule and move-granules now produce a separate granuleDuplicates object as part of their output. The granuleDuplicates object is a map of granules by granule ID which includes the files that encountered duplicates during processing. Guidance on how to integrate granuleDuplicates information into your workflow configuration is provided below.

    Exceptions

    These workflow tasks did not have their schema for granule files updated:

    • discover-granules - no updates
    • queue-granules - no updates
    • parse-pdr - no updates
    • sync-granule - input schema not updated, output schema was updated

    The reason that these task schemas were not updated is that all of these tasks start before the files have been ingested to S3, thus much of the information that is required in the updated files schema like bucket, key, or checksum is not yet known.

    Bulk granule operations

    Since the input schema for the above tasks was not updated, that means you cannot run bulk granule operations against workflows if they start with any of those tasks. Bulk granule operations work by loading the specified granules from the database and sending them as input to a specified workflow, so if the specified workflow begins with a task whose input schema does not conform to what is coming out of the database, there will be schema errors.

    Upgrading your deployment

    Upgrading your workflows

    For any workflows using the update-granules-cmr-metadata-file-links task before the hyrax-metadata-updates and/or post-to-cmr tasks, update the step definition for update-granules-cmr-metadata-file-links as follows:

        "UpdateGranulesCmrMetadataFileLinksStep": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "buckets": "{$.meta.buckets}",
    "distribution_endpoint": "{$.meta.distribution_endpoint}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.etags}",
    "destination": "{$.meta.file_etags}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    ...more configuration...

    hyrax-metadata-updates

    For any workflows using the hyrax-metadata-updates task before a post-to-cmr task, update the definition of the hyrax-metadata-updates step as follows:

        "HyraxMetadataUpdatesTask": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "bucket": "{$.meta.buckets.internal.name}",
    "stack": "{$.meta.stack}",
    "cmr": "{$.meta.cmr}",
    "launchpad": "{$.meta.launchpad}",
    "etags": "{$.meta.file_etags}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.etags}",
    "destination": "{$.meta.file_etags}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    ...more configuration...

    post-to-cmr

    For any workflows using post-to-cmr task after the update-granules-cmr-metadata-file-links or hyrax-metadata-updates tasks, update the post-to-cmr step definition as follows:

        "CmrStep": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "bucket": "{$.meta.buckets.internal.name}",
    "stack": "{$.meta.stack}",
    "cmr": "{$.meta.cmr}",
    "launchpad": "{$.meta.launchpad}",
    "etags": "{$.meta.file_etags}"
    }
    }
    },
    ...more configuration...

    Example workflow

    For an example workflow integrating all of these changes, please see our example ingest and publish workflow.

    Optional - Integrate granuleDuplicates information

    Please note that the granuleDuplicates output is purely informational and does not have any bearing on the separate configuration for how duplicates should be handled.

    You can include granuleDuplicates output from the sync-granule or move-granules tasks in your workflow messages like so:

        "SyncGranule": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    ...other config...
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.granuleDuplicates}",
    "destination": "{$.meta.sync_granule.granule_duplicates}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    }
    ...more configuration...

The result of this configuration is that the granuleDuplicates output from sync-granule would be placed in meta.sync_granule.granule_duplicates on the workflow message and remain there throughout the rest of the workflow. The same configuration could be replicated for the move-granules task, but be sure to use a different destination in the workflow message for the granuleDuplicates output.

    Updating collection URL path templates

    Collections can specify url_path templates to dynamically generate the final location of files. As part of url_path templates, file object properties can be interpolated to generate the file path. Thus, these url_path templates need to be updated to ensure that they are compatible with the updated files schema and the properties that will actually be available on file objects.

    See the notes on the updated files schema to know which properties are available and which previously existing properties were deprecated.

    As an example, you will want to update any url_path properties in your collections to remove references to file.name and replace them with references to file.fileName like so:

    - "url_path": "{cmrMetadata.CollectionReference.ShortName}___{cmrMetadata.CollectionReference.Version}/{substring(file.name, 0, 3)}",
    + "url_path": "{cmrMetadata.CollectionReference.ShortName}___{cmrMetadata.CollectionReference.Version}/{substring(file.fileName, 0, 3)}",
    - + \ No newline at end of file diff --git a/docs/v14.1.0/upgrade-notes/upgrade-rds/index.html b/docs/v14.1.0/upgrade-notes/upgrade-rds/index.html index f77a674ef08..d2c0ca7af6d 100644 --- a/docs/v14.1.0/upgrade-notes/upgrade-rds/index.html +++ b/docs/v14.1.0/upgrade-notes/upgrade-rds/index.html @@ -5,7 +5,7 @@ Upgrade to RDS release | Cumulus Documentation - + @@ -21,7 +21,7 @@ | cutoffSeconds | number | Number of seconds prior to this execution to 'cutoff' reconciliation queries. This allows in-progress/other in-flight operations time to complete and propagate to Elasticsearch/Dynamo/postgres. | 3600 | | dbConcurrency | number | Sets max number of parallel collections reports the script will run at a time. | 20 | | dbMaxPool | number | Sets the maximum number of connections the database pool has available. Modifying this may result in unexpected failures. | 20 |

    - + \ No newline at end of file diff --git a/docs/v14.1.0/upgrade-notes/upgrade_tf_version_0.13.6/index.html b/docs/v14.1.0/upgrade-notes/upgrade_tf_version_0.13.6/index.html index d5036994103..2f052a1b29f 100644 --- a/docs/v14.1.0/upgrade-notes/upgrade_tf_version_0.13.6/index.html +++ b/docs/v14.1.0/upgrade-notes/upgrade_tf_version_0.13.6/index.html @@ -5,13 +5,13 @@ Upgrade to TF version 0.13.6 | Cumulus Documentation - +
    Version: v14.1.0

    Upgrade to TF version 0.13.6

    Background

Cumulus pins its support to a specific version of Terraform (see the deployment documentation). The reason for only supporting one specific Terraform version at a time is to avoid deployment errors that can be caused by deploying to the same target with different Terraform versions.

    Cumulus is upgrading its supported version of Terraform from 0.12.12 to 0.13.6. This document contains instructions on how to perform the upgrade for your deployments.

    Prerequisites

    • Follow the Terraform guidance for what to do before upgrading, notably ensuring that you have no pending changes to your Cumulus deployments before proceeding.
      • You should do a terraform plan to see if you have any pending changes for your deployment (for both the data-persistence-tf and cumulus-tf modules), and if so, run a terraform apply before doing the upgrade to Terraform 0.13.6
    • Review the Terraform v0.13 release notes to prepare for any breaking changes that may affect your custom deployment code. Cumulus' deployment code has already been updated for compatibility with version 0.13.
• Install Terraform version 0.13.6. We recommend using Terraform Version Manager tfenv to manage your installed versions of Terraform, but this is not required.

    Upgrade your deployment code

    Terraform 0.13 does not support some of the syntax from previous Terraform versions, so you need to upgrade your deployment code for compatibility.

    Terraform provides a 0.13upgrade command as part of version 0.13 to handle automatically upgrading your code. Make sure to check out the documentation on batch usage of 0.13upgrade, which will allow you to upgrade all of your Terraform code with one command.

    Run the 0.13upgrade command until you have no more necessary updates to your deployment code.

    Upgrade your deployment

    1. Ensure that you are running Terraform 0.13.6 by running terraform --version. If you are using tfenv, you can switch versions by running tfenv use 0.13.6.

    2. For the data-persistence-tf and cumulus-tf directories, take the following steps:

      1. Run terraform init --reconfigure. The --reconfigure flag is required, otherwise you might see an error like:

        Error: Failed to decode current backend config

        The backend configuration created by the most recent run of "terraform init"
        could not be decoded: unsupported attribute "lock_table". The configuration
        may have been initialized by an earlier version that used an incompatible
        configuration structure. Run "terraform init -reconfigure" to force
        re-initialization of the backend.
      2. Run terraform apply to perform a deployment.

        WARNING: Even if Terraform says that no resource changes are pending, running the apply using Terraform version 0.13.6 will modify your backend state from version 0.12.12 to version 0.13.6 without requiring approval. Updating the backend state is a necessary part of the version 0.13.6 upgrade, but it is not completely transparent.

    - + \ No newline at end of file diff --git a/docs/v14.1.0/workflow_tasks/discover_granules/index.html b/docs/v14.1.0/workflow_tasks/discover_granules/index.html index 20d376f32e5..d6f032f2a88 100644 --- a/docs/v14.1.0/workflow_tasks/discover_granules/index.html +++ b/docs/v14.1.0/workflow_tasks/discover_granules/index.html @@ -5,7 +5,7 @@ Discover Granules | Cumulus Documentation - + @@ -21,7 +21,7 @@ included in a granule's file list. That is, no such filtering based on filename occurs as described above.

    When set on the task configuration, the value applies to all collections during discovery. Otherwise, this property may be set on individual collections.

    Concurrency

    A number property that determines the level of concurrency with which granule duplicate checks are performed when duplicateGranuleHandling is skip or error.

    Limiting concurrency helps to avoid throttling by the AWS Lambda API and helps to avoid encountering account Lambda concurrency limitations.

    We do not recommend increasing this value unless you are seeing Lambda.Timeout errors when discover-granules discovers a large number of granules with skip or error duplicate handling. However, as increasing the concurrency may lead to Lambda API or Lambda concurrency throttling errors, you may wish to consider converting the discover-granules task to an ECS activity, which does not face similar runtime constraints.

    The default value is 3.
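If you do need to change it, the value is set in the discover-granules task configuration for your workflow. A sketch, assuming the configuration key is concurrency (verify the exact key against the task's config schema on the Cumulus Tasks page):

"task_config": {
  ...other discover-granules configuration...
  "duplicateGranuleHandling": "skip",
  "concurrency": 10
}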

    Task Outputs

    This task outputs an assembled array of Cumulus granule objects as the payload for the next task, and returns only the expected payload for the next task.

    - + \ No newline at end of file diff --git a/docs/v14.1.0/workflow_tasks/files_to_granules/index.html b/docs/v14.1.0/workflow_tasks/files_to_granules/index.html index 31d71573d4e..dfcc005db09 100644 --- a/docs/v14.1.0/workflow_tasks/files_to_granules/index.html +++ b/docs/v14.1.0/workflow_tasks/files_to_granules/index.html @@ -5,13 +5,13 @@ Files To Granules | Cumulus Documentation - +
    Version: v14.1.0

    Files To Granules

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions and more can be found on the auto-generated Cumulus Tasks page.

    Summary

    This task utilizes the incoming config.inputGranules and the task input list of s3 URIs along with the rest of the configuration objects to take the list of incoming files and sort them into a list of granule objects.

Please note: files passed in without metadata previously defined in config.inputGranules will have the following keys added (see the example file object after this list):

    • size
    • bucket
    • key
    • fileName
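For instance, a staged S3 URI such as s3://example-staging-bucket/stage/GRANULE.A2017025.hdf would yield a file object along these lines (bucket, key, and size values are illustrative):

{
  "bucket": "example-staging-bucket",
  "key": "stage/GRANULE.A2017025.hdf",
  "fileName": "GRANULE.A2017025.hdf",
  "size": 1024
}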

    It is primarily intended to support compatibility with the standard output of a processing task, and convert that output into a granule object accepted as input by the majority of other Cumulus tasks.

    Task Inputs

    Input

    This task expects an incoming input that contains an array of 'staged' S3 URIs to move to their final archive location.

    For the specifics, see the Cumulus Tasks page entry for the schema.

    Configuration

    This task does expect values to be set in the workflow_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Below are expanded descriptions of selected config keys:

    inputGranules

    An array of Cumulus granule objects.

    This object will be used to define metadata values for the move granules task, and is the basis for the updated object that will be added to the output.

    Task Outputs

    This task outputs an assembled array of Cumulus granule objects as the payload for the next task, and returns only the expected payload for the next task.

    - + \ No newline at end of file diff --git a/docs/v14.1.0/workflow_tasks/lzards_backup/index.html b/docs/v14.1.0/workflow_tasks/lzards_backup/index.html index a671bdafbb7..84488a53918 100644 --- a/docs/v14.1.0/workflow_tasks/lzards_backup/index.html +++ b/docs/v14.1.0/workflow_tasks/lzards_backup/index.html @@ -5,13 +5,13 @@ LZARDS Backup | Cumulus Documentation - +
    Version: v14.1.0

    LZARDS Backup

    The LZARDS backup task takes an array of granules and initiates backup requests to the LZARDS API, which will be handled asynchronously by LZARDS.

    Deployment

    The LZARDS backup task is not automatically deployed with Cumulus. To deploy the task through the Cumulus module, first you must specify a lzards_launchpad_passphrase in your terraform variables (e.g. variables.tf) like so:

    variable "lzards_launchpad_passphrase" {
    type = string
    default = ""
    }

    Then you can specify a value for your lzards_launchpad_passphrase in terraform.tfvars like so:

lzards_launchpad_passphrase = "your-passphrase"

    Lastly, you need to make sure that the lzards_launchpad_passphrase is passed into the Cumulus module (in main.tf) like so:

    lzards_launchpad_passphrase  = var.lzards_launchpad_passphrase

    In short, deploying the LZARDS task requires configuring a passphrase variable and ensuring that your TF configuration passes that variable into the Cumulus module.

Additional terraform configuration for the LZARDS task can be found in the cumulus module's variables.tf file, where the relevant variables are prefixed with lzards_. You can add these variables to your deployment using the same process outlined above for lzards_launchpad_passphrase.
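As a rough sketch of that process (the lzards_api variable name is used here only as an illustration; confirm the actual lzards_-prefixed names and types in the module's variables.tf):

# variables.tf -- declare the variable in your deployment
# (illustrative name; match it to the cumulus module's variables.tf)
variable "lzards_api" {
  type    = string
  default = ""
}

# main.tf -- pass the value through to the cumulus module
lzards_api = var.lzards_api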

    Task Inputs

    Input

    This task expects an array of granules as input.

    For the specifics, see the Cumulus Tasks page entry for the schema.

    Configuration

    This task does expect values to be set in the workflow_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Task Outputs

    Output

    The LZARDS task outputs a composite object containing:

    • the input granules array, and
    • a backupResults object that describes the results of LZARDS backup attempts.

    For the specifics, see the Cumulus Tasks page entry for the schema.

diff --git a/docs/v14.1.0/workflow_tasks/move_granules/index.html b/docs/v14.1.0/workflow_tasks/move_granules/index.html
    Version: v14.1.0

    Move Granules

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions and more can be found on the auto-generated Cumulus Tasks page.

    Summary

    This task utilizes the incoming event.input array of Cumulus granule objects to do the following:

    • Move granules from their 'staging' location to the final location (as configured in the Sync Granules task)

    • Update the event.input object with the new file locations.

• If the granule has an ECHO10/UMM CMR file (.cmr.xml or .cmr.json) included in the event.input:

      • Update that file's access locations

  • Add it to the appropriate access URL category for the CMR filetype as defined by the granule's CNM filetype.

      • Set the CMR file to 'metadata' in the output granules object and add it to the granule files if it's not already present.

        Please note: Granules without a valid CNM type set in the granule file type field in event.input will be treated as "data" in the updated CMR metadata file

• The task then outputs an updated list of granule objects.

    Task Inputs

    Input

    This task expects an incoming input that contains a list of 'staged' S3 URIs to move to their final archive location. If CMR metadata is to be updated for a granule, it must also be included in the input.

    For the specifics, see the Cumulus Tasks page entry for the schema.

    Configuration

    This task does expect values to be set in the workflow_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Input

    This task expects event.input to provide an array of Cumulus granule objects. The files listed for each granule represent the files to be acted upon as described in summary.
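Purely as an illustration of that shape (granule and file values are placeholders; the task's input schema remains the authoritative definition), a single staged granule might look like:

{
  "granules": [
    {
      "granuleId": "MOD09GQ.A2017025.h21v00.006.2017034065104",
      "files": [
        {
          "bucket": "my-staging-bucket",
          "key": "file-staging/MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
          "fileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf"
        }
      ]
    }
  ]
}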

    Task Outputs

    This task outputs an assembled array of Cumulus granule objects with post-move file locations as the payload for the next task, and returns only the expected payload for the next task. If a CMR file has been specified for a granule object, the CMR resources related to the granule files will be updated according to the updated granule file metadata.

    Examples

    See the SIPS workflow cookbook for an example of this task in a workflow

diff --git a/docs/v14.1.0/workflow_tasks/parse_pdr/index.html b/docs/v14.1.0/workflow_tasks/parse_pdr/index.html
    Version: v14.1.0

    Parse PDR

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions and more can be found on the auto-generated Cumulus Tasks page.

    Summary

    The purpose of this task is to do the following with the incoming PDR object:

    • Stage it to an internal S3 bucket

    • Parse the PDR

    • Archive the PDR and remove the staged file if successful

• Output a payload object containing metadata about the parsed PDR (e.g. total size of all files, file counts, etc.) and a granules object

The constructed granules object is created using PDR metadata to determine values like data type and version, and collection definitions to determine the file storage location based on the extracted data type and version number.

    Granule file types are converted from the PDR spec types to CNM types according to the following translation table:

      HDF: 'data',
    HDF-EOS: 'data',
    SCIENCE: 'data',
    BROWSE: 'browse',
    METADATA: 'metadata',
    BROWSE_METADATA: 'metadata',
    QA_METADATA: 'metadata',
    PRODHIST: 'qa',
    QA: 'metadata',
    TGZ: 'data',
    LINKAGE: 'data'

Files missing file types will have none assigned; files with invalid types will result in a PDR parse failure.

    Task Inputs

    Input

    This task expects an incoming input that contains name and path information about the PDR to be parsed. For the specifics, see the Cumulus Tasks page entry for the schema.

    Configuration

    This task does expect values to be set in the workflow_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Below are expanded descriptions of selected config keys:

    Provider

    A Cumulus provider object. Used to define connection information for retrieving the PDR.

    Bucket

    Defines the bucket where the 'pdrs' folder for parsed PDRs will be stored.

    Collection

    A Cumulus collection object. Used to define granule file groupings and granule metadata for discovered files.
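A hedged sketch of how these keys might be wired into a ParsePdr step's task_config (the meta paths are assumptions based on the Cumulus message examples elsewhere in these docs):

{
  "ParsePdr": {
    "Parameters": {
      "cma": {
        "event.$": "$",
        "task_config": {
          "provider": "{$.meta.provider}",
          "bucket": "{$.meta.buckets.internal.name}",
          "collection": "{$.meta.collection}"
        }
      }
    }
  }
}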

    Task Outputs

This task outputs a single payload output object containing metadata about the parsed PDR (e.g. filesCount, totalSize, etc.), a pdr object with information for later steps, and the generated array of granule objects.

    Examples

    See the SIPS workflow cookbook for an example of this task in a workflow

diff --git a/docs/v14.1.0/workflow_tasks/queue_granules/index.html b/docs/v14.1.0/workflow_tasks/queue_granules/index.html
    Version: v14.1.0

    Queue Granules

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions, and more can be found on the auto-generated Cumulus Tasks page.

    Summary

    The purpose of this task is to schedule ingest of granules that were discovered on a remote host, whether via the DiscoverGranules task or the ParsePDR task.

The task utilizes a defined collection in concert with a defined provider, either set on each granule or passed in via config, to queue up ingest executions for each granule or for batches of granules.

The constructed granules object is defined by the collection passed in the configuration, and has impacts on other provided core Cumulus Tasks.

    Users of this task in a workflow are encouraged to carefully consider their configuration in context of downstream tasks and workflows.

    Task Inputs

Each of the following sections is a high-level discussion of the intent of the various input/output/config values.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Input

    This task expects an incoming input that contains granules and information about them and their files. For the specifics, see the Cumulus Tasks page entry for the schema.

    This input is most commonly the output from a preceding DiscoverGranules or ParsePDR task.

    Cumulus Configuration

    This task does expect values to be set in the task_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Below are expanded descriptions of selected config keys:

    provider

    A Cumulus provider object for the originating provider. Will be passed along to the ingest workflow. This will be overruled by more specific provider information that may exist on a granule.

    internalBucket

    The Cumulus internal system bucket.

    granuleIngestWorkflow

    A string property that denotes the name of the ingest workflow into which granules should be queued.

    queueUrl

    A string property that denotes the URL of the queue to which scheduled execution messages are sent.

    preferredQueueBatchSize

    A number property that sets an upper bound on the size of each batch of granules queued into the payload of an ingest execution. Setting this property to a value higher than 1 allows queueing of multiple granules per ingest workflow.

    As ingest executions typically expect granules in the payload to have a common collection and common provider, this property only sets an upper bound within which batches will be created based on common collection and provider information.

    This means batches may be smaller than the preferred size if collection or provider information diverge, but never larger.

    The default value if none is specified is 1, which will queue one ingest execution per granule.

    concurrency

    A number property that determines the level of concurrency with which ingest executions are scheduled. Granules or batches of granules will be queued up into executions at this level of concurrency.

    This property is also used to limit concurrency when updating granule status to queued.

    Limiting concurrency helps to avoid throttling by the AWS Lambda API and helps to avoid encountering account Lambda concurrency limitations.

    We do not recommend increasing this value unless you are seeing Lambda.Timeout errors when queue-granules receives a large number of granules as input. However, as increasing the concurrency may lead to Lambda API or Lambda concurrency throttling errors, you may wish to consider converting the queue-granules task to an ECS activity, which does not face similar runtime constraints.

    The default value is 3.

    executionNamePrefix

    A string property that will prefix the names of scheduled executions.

    childWorkflowMeta

    An object property that will be merged into the scheduled execution input's meta field.
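Putting the keys above together, a queue-granules task_config might look like the following sketch (step name, paths, and values are illustrative):

{
  "QueueGranules": {
    "Parameters": {
      "cma": {
        "event.$": "$",
        "task_config": {
          "provider": "{$.meta.provider}",
          "internalBucket": "{$.meta.buckets.internal.name}",
          "granuleIngestWorkflow": "IngestGranuleWorkflow",
          "queueUrl": "{$.meta.queues.startSF}",
          "preferredQueueBatchSize": 1,
          "concurrency": 3,
          "executionNamePrefix": "reingest",
          "childWorkflowMeta": {
            "staticValue": "aStaticValue"
          }
        }
      }
    }
  }
}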

    Task Outputs

    This task outputs an assembled array of workflow execution ARNs for all scheduled workflow executions within the payload's running object.
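A minimal sketch of that output payload (the execution ARN is a placeholder, and keys other than running are omitted; see the output schema for the full definition):

{
  "running": [
    "arn:aws:states:us-east-1:111111111111:execution:PrefixIngestGranuleWorkflow:a1b2c3d4-example"
  ]
}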

diff --git a/docs/v14.1.0/workflows/cumulus-task-message-flow/index.html b/docs/v14.1.0/workflows/cumulus-task-message-flow/index.html
    Version: v14.1.0

    Cumulus Tasks: Message Flow

Cumulus Tasks compose Cumulus Workflows and are either AWS Lambda tasks or AWS Elastic Container Service (ECS) activities. Cumulus Tasks permit a payload as input to the main task application code. The task payload is additionally wrapped by the Cumulus Message Adapter. The Cumulus Message Adapter supplies additional information supporting message templating and metadata management of these workflows.

    Diagram showing how incoming and outgoing Cumulus messages for workflow steps are handled by the Cumulus Message Adapter

    The steps in this flow are detailed in sections below.

    Cumulus Message Format

    A full Cumulus Message has the following keys:

    • cumulus_meta: System runtime information that should generally not be touched outside of Cumulus library code or the Cumulus Message Adapter. Stores meta information about the workflow such as the state machine name and the current workflow execution's name. This information is used to look up the current active task. The name of the current active task is used to look up the corresponding task's config in task_config.
    • meta: Runtime information captured by the workflow operators. Stores execution-agnostic variables.
    • payload: Payload is runtime information for the tasks.

    In addition to the above keys, it may contain the following keys:

    • replace: A key generated in conjunction with the Cumulus Message adapter. It contains the location on S3 for a message payload and a Target JSON path in the message to extract it to.
    • exception: A key used to track workflow exceptions, should not be modified outside of Cumulus library code.

    Here's a simple example of a Cumulus Message:

    {
    "task_config": {
    "inlinestr": "prefix{meta.foo}suffix",
    "array": "{[$.meta.foo]}",
    "object": "{$.meta}"
    },
    "cumulus_meta": {
    "message_source": "sfn",
    "state_machine": "arn:aws:states:us-east-1:1234:stateMachine:MySfn",
    "execution_name": "MyExecution__id-1234",
    "id": "id-1234"
    },
    "meta": {
    "foo": "bar"
    },
    "payload": {
    "anykey": "anyvalue"
    }
    }

    A message utilizing the Cumulus Remote message functionality must have at least the keys replace and cumulus_meta. Depending on configuration other portions of the message may be present, however the cumulus_meta, meta, and payload keys must be present once extraction is complete.

    {
    "replace": {
    "Bucket": "cumulus-bucket",
    "Key": "my-large-event.json",
    "TargetPath": "$"
    },
    "cumulus_meta": {}
    }

    Cumulus Message Preparation

    The event coming into a Cumulus Task is assumed to be a Cumulus Message and should first be handled by the functions described below before being passed to the task application code.

    Preparation Step 1: Fetch remote event

    Fetch remote event will fetch the full event from S3 if the cumulus message includes a replace key.

    Once "my-large-event.json" is fetched from S3, it's returned from the fetch remote event function. If no "replace" key is present, the event passed to the fetch remote event function is assumed to be a complete Cumulus Message and returned as-is.

    Preparation Step 2: Parse step function config from CMA configuration parameters

This step determines which task is currently being executed. Note this is different from which lambda or activity is being executed, because the same lambda or activity can be used for different tasks. The current task name is used to load the appropriate configuration from the Cumulus Message's 'task_config' configuration parameter.

    Preparation Step 3: Load nested event

    Using the config returned from the previous step, load nested event resolves templates for the final config and input to send to the task's application code.

    Task Application Code

    After message prep, the message passed to the task application code is of the form:

    {
    "input": {},
    "config": {}
    }

    Create Next Message functions

    Whatever comes out of the task application code is used to construct an outgoing Cumulus Message.

    Create Next Message Step 1: Assign outputs

The config loaded in the 'Parse step function config' step may have a cumulus_message key. This can be used to "dispatch" fields from the task's application output to a destination in the final event output (via URL templating). Here's an example where the value of input.anykey would be dispatched as the value of payload.out in the final cumulus message:

{
  "task_config": {
    "bar": "baz",
    "cumulus_message": {
      "input": "{$.payload.input}",
      "outputs": [
        {
          "source": "{$.input.anykey}",
          "destination": "{$.payload.out}"
        }
      ]
    }
  },
  "cumulus_meta": {
    "task": "Example",
    "message_source": "local",
    "id": "id-1234"
  },
  "meta": {
    "foo": "bar"
  },
  "payload": {
    "input": {
      "anykey": "anyvalue"
    }
  }
}

    Create Next Message Step 2: Store remote event

    If the ReplaceConfiguration parameter is set, the configured key's value will be stored in S3 and the final output of the task will include a replace key that contains configuration for a future step to extract the payload on S3 back into the Cumulus Message. The replace key identifies where the large event node has been stored in S3.

diff --git a/docs/v14.1.0/workflows/developing-a-cumulus-workflow/index.html b/docs/v14.1.0/workflows/developing-a-cumulus-workflow/index.html
    Version: v14.1.0

    Creating a Cumulus Workflow

    The Cumulus workflow module

To facilitate adding workflows to your deployment, Cumulus provides a workflow module.

    In combination with the Cumulus message, the workflow module provides a way to easily turn a Step Function definition into a Cumulus workflow, complete with:

    Using the module also ensures that your workflows will continue to be compatible with future versions of Cumulus.

    For more on the full set of current available options for the module, please consult the module README.

    Adding a new Cumulus workflow to your deployment

    To add a new Cumulus workflow to your deployment that is using the cumulus module, add a new workflow resource to your deployment directory, either in a new .tf file, or to an existing file.

    The workflow should follow a syntax similar to:

    module "my_workflow" {
    source = "https://github.com/nasa/cumulus/releases/download/vx.x.x/terraform-aws-cumulus-workflow.zip"

    prefix = "my-prefix"
    name = "MyWorkflowName"
    system_bucket = "my-internal-bucket"

    workflow_config = module.cumulus.workflow_config

    tags = { Deployment = var.prefix }

    state_machine_definition = <<JSON
    {}
    JSON
    }

In the above example, you would add your state_machine_definition using the Amazon States Language, referencing tasks you've developed and Cumulus core tasks that are made available as part of the cumulus terraform module.
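For instance, a minimal state_machine_definition for the heredoc above might look like the following sketch (the task resource reference is an assumption; substitute a task ARN output actually exposed by your deployment):

{
  "Comment": "Minimal single-step workflow",
  "StartAt": "HelloWorld",
  "States": {
    "HelloWorld": {
      "Type": "Task",
      "Resource": "${module.cumulus.hello_world_task.task_arn}",
      "End": true
    }
  }
}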

    Please note: Cumulus follows the convention of tagging resources with the prefix variable { Deployment = var.prefix } that you pass to the cumulus module. For resources defined outside of Core, it's recommended that you adopt this convention as it makes resources and/or deployment recovery scenarios much easier to manage.

    Examples

    For a functional example of a basic workflow, please take a look at the hello_world_workflow.

    For more complete/advanced examples, please read the following cookbook entries/topics:

diff --git a/docs/v14.1.0/workflows/developing-workflow-tasks/index.html b/docs/v14.1.0/workflows/developing-workflow-tasks/index.html
    Version: v14.1.0

    Developing Workflow Tasks

    Workflow tasks can be either AWS Lambda Functions or ECS Activities.

    Lambda functions

    The full set of available core Lambda functions can be found in the deployed cumulus module zipfile at /tasks, as well as reference documentation here. These Lambdas can be referenced in workflows via the outputs from that module (see the cumulus-template-deploy repo for an example).

    The tasks source is located in the Cumulus repository at cumulus/tasks.

    You can also develop your own Lambda function. See the Lambda Functions page to learn more.

    ECS Activities

    ECS activities are supported via the cumulus_ecs_module available from the Cumulus release page.

    Please read the module README for configuration details.

    For assistance in creating a task definition within the module read the AWS Task Definition Docs.

    For a step-by-step example of using the cumulus_ecs_module, please see the related cookbook entry.

    Cumulus Docker Image

ECS activities require a docker image. Cumulus provides a docker image (source) for Node 12.x+ Lambdas on Docker Hub: cumuluss/cumulus-ecs-task.

    Alternate Docker Images

    Custom docker images/runtimes are supported as are private registries. For details on configuring a private registry/image see the AWS documentation on Private Registry Authentication for Tasks.

diff --git a/docs/v14.1.0/workflows/docker/index.html b/docs/v14.1.0/workflows/docker/index.html

Version: v14.1.0

Dockerizing Data Processing

2) validate the output (in this case just check for existence)
3) use 'ncatted' to update the resulting file to be CF-compliant
4) write out metadata generated for this file

    Process Testing

It is important to have tests for data processing; however, in many cases data files can be large, so it is not practical to store the test data in the repository. Instead, test data is currently stored on AWS S3, and can be retrieved using the AWS CLI.

    aws s3 sync s3://cumulus-ghrc-logs/sample-data/collection-name data

    Where collection-name is the name of the data collection, such as 'avaps', or 'cpl'. For example, an abridged version of the data for CPL includes:

├── cpl
│   ├── input
│   │   ├── HS3_CPL_ATB_12203a_20120906.hdf5
│   │   ├── HS3_CPL_OP_12203a_20120906.hdf5
│   └── output
│       ├── HS3_CPL_ATB_12203a_20120906.nc
│       ├── HS3_CPL_ATB_12203a_20120906.nc.meta.xml
│       ├── HS3_CPL_OP_12203a_20120906.nc
│       ├── HS3_CPL_OP_12203a_20120906.nc.meta.xml

    Contained in the input directory are all possible sets of data files, while the output directory is the expected result of processing. In this case the hdf5 files are converted to NetCDF files and XML metadata files are generated.

    The docker image for a process can be used on the retrieved test data. First create a test-output directory in the newly created data directory.

    mkdir data/test-output

    Then run the docker image using docker-compose.

    docker-compose run test

This will process the data in the data/input directory and put the output into data/test-output. Repositories also include Python-based tests which will validate this newly created output against the contents of data/output. Use Python's Nose tool to run the included tests.

    nosetests

If the data/test-output directory validates against the contents of data/output, the tests will be successful; otherwise an error will be reported.

diff --git a/docs/v14.1.0/workflows/index.html b/docs/v14.1.0/workflows/index.html
    Version: v14.1.0

    Workflows

Workflows are composed of one or more AWS Lambda Functions and ECS Activities to discover, ingest, process, manage, and archive data.

    Provider data ingest and GIBS have a set of common needs in getting data from a source system and into the cloud where they can be distributed to end users. These common needs are:

    • Data Discovery - Crawling, polling, or detecting changes from a variety of sources.
    • Data Transformation - Taking data files in their original format and extracting and transforming them into another desired format such as visible browse images.
    • Archival - Storage of the files in a location that's accessible to end users.

    The high level view of the architecture and many of the individual steps are the same but the details of ingesting each type of collection differs. Different collection types and different providers have different needs. The individual boxes of a workflow are not only different. The branching, error handling, and multiplicity of the arrows connecting the boxes are also different. Some need visible images rendered from component data files from multiple collections. Some need to contact the CMR with updated metadata. Some will have different retry strategies to handle availability issues with source data systems.

    AWS and other cloud vendors provide an ideal solution for parts of these problems but there needs to be a higher level solution to allow the composition of AWS components into a full featured solution. The Ingest Workflow Architecture is designed to meet the needs for Earth Science data ingest and transformation.

    Goals

    Flexibility and Composability

The steps to ingest and process data are different for each collection within a provider. Ingest should be as flexible as possible in the rearranging of steps and configuration.

    We want to use lego-like individual steps that can be composed by an operator.

    Individual steps should ...

    • Be as ignorant as possible of the overall flow. They should not be aware of previous steps.
    • Be runnable on their own.
    • Define their input and output in simple data structures.
    • Be domain agnostic.
• Not make assumptions about the specifics of, for example, what goes into a granule.

    Scalable

The ingest architecture needs to be scalable, both to handle ingesting hundreds of millions of granules and to interpret dozens of different workflows.

    Data Provenance

    • We should have traceability for how data was produced and where it comes from.
    • Use immutable representations of data. Data once received is not overwritten. Data can be removed for cleanup.
    • All software is versioned. We can trace transformation of data by tracking the immutable source data and the versioned software applied to it.

    Operator Visibility and Control

    • Operators should be able to see and understand everything that is happening in the system.
    • It should be obvious why things are happening and straightforward to diagnose problems.
• We generally assume that the operators know best in terms of the limits on a provider's infrastructure, how often things need to be done, and details of a collection. The architecture should defer to their decisions and knowledge while providing safety nets to prevent problems.

    A Reconfigurable Workflow Architecture

    The Ingest Workflow Architecture is defined by two entity types, Workflows and Tasks. A Workflow is a set of composed Tasks to complete an objective such as ingesting a granule. Tasks are the individual steps of a Workflow that perform one job. The workflow is responsible for executing the right task based on the current state and response from the last task executed. Tasks are completely decoupled in that they don't call each other or even need to know about the presence of other tasks.

    Workflows and tasks are configured as Terraform resources, which are triggered via configured rules within Cumulus.

    Diagram showing the Step Function execution path through workflow tasks for a collection ingest

    See the Example GIBS Ingest Architecture showing how workflows and tasks are used to define the GIBS Ingest Architecture.

    Workflows

    A workflow is a provider-configured set of steps that describe the process to ingest data. Workflows are defined using AWS Step Functions.

    Benefits of AWS Step Functions

    AWS Step functions are described in detail in the AWS documentation but they provide several benefits which are applicable to AWS.

    • Prebuilt solution
    • Operations Visibility
      • Visual diagram
      • Every execution is recorded with both inputs and output for every step.
    • Composability
      • Allow composing AWS Lambdas and code running in other steps. Code can be run in EC2 to interface with it or even on premise if desired.
      • Step functions allow specifying when steps run in parallel or choices between steps based on data from the previous step.
    • Flexibility
  • Step functions are designed to make it easy to build new applications and reconfigure them. We're exposing that flexibility directly to the provider.
    • Reliability and Error Handling
      • Step functions allow configuration of retries and adding handling of error conditions.
    • Described via data
      • This makes it easy to save the step function in configuration management solutions.
      • We can build simple interfaces on top of the flexibility provided.

    Workflow Scheduler

    The scheduler is responsible for initiating a step function and passing in the relevant data for a collection. This is currently configured as an interval for each collection. The scheduler service creates the initial event by combining the collection configuration with the AWS execution context defined via the cumulus terraform module.

    Tasks

    A workflow is composed of tasks. Each task is responsible for performing a discrete step of the ingest process. These can be activities like:

    • Crawling a provider website for new data.
    • Uploading data from a provider to S3.
    • Executing a process to transform data.

    AWS Step Functions permit tasks to be code running anywhere, even on premise. We expect most tasks will be written as Lambda functions in order to take advantage of the easy deployment, scalability, and cost benefits provided by AWS Lambda.

    • Leverages Existing Work
      • The design leverages the existing work of Amazon by defining workflows using the AWS Step Function State Language. This is the language that was created for describing the state machines used in AWS Step Functions.
    • Open for Extension
      • Both meta and task_config which are used for configuring at the collection and task levels do not dictate the fields and structure of the configuration. Additional task specific JSON schemas can be used for extending the validation of individual steps.
    • Data-centric Configuration
      • The use of a single JSON configuration file allows this to be added to a workflow. We build additional support on top of the configuration file for simpler domain specific configuration or interactive GUIs.

    For more details on Task Messages and Configuration, visit Cumulus configuration and message protocol documentation.

    Ingest Deploy

    To view deployment documentation, please see the Cumulus deployment documentation.

    Tradeoffs, and Benefits

    This section documents various tradeoffs and benefits of the Ingest Workflow Architecture.

    Tradeoffs

    Workflow execution is handled completely by AWS

This means we can't add our own code into the orchestration of the workflow. We can't add new features not supported by Step Functions. We can't do things like enforce that the responses from tasks always conform to a schema or extract the configuration for a task ahead of its execution.

If we implemented our own orchestration we'd be able to add all of these. We save significant amounts of development effort and gain all the features of Step Functions for this trade-off. One workaround is to provide a library of common task capabilities. These would optionally be available to tasks that can be implemented with Node.js and are able to include the library.

    Workflow Configuration is specified in AWS Step Function States Language

    The current design combines the states language defined by AWS with Ingest specific configuration. This means our representation has a tight coupling with their standard. If they make backwards incompatible changes in the future we will have to deal with existing projects written against that.

We avoid having to develop our own standard and code to process it. The design can support new features in AWS Step Functions without needing to update the Ingest library code. It is unlikely they will make a backwards incompatible change at this point. One mitigation for this is writing data transformations to a new format if that were to happen.

    Collection Configuration Flexibility vs Complexity

The Collections Configuration File is very flexible but requires more knowledge of AWS Step Functions to configure. A person modifying this file directly would need to be comfortable editing a JSON file and configuring AWS Step Functions state transitions which address AWS resources.

The configuration file itself is not necessarily meant to be edited by a human directly. Since we are developing a reconfigurable, composable architecture that is specified entirely in data, additional tools can be developed on top of it. The existing recipes.json files can be mapped to this format. Operational tools like a GUI can be built that provide a usable interface for customizing workflows, but it will take time to develop these tools.

    Benefits

    This section describes benefits of the Ingest Workflow Architecture.

    Simplicity

    The concepts of Workflows and Tasks are simple ones that should make sense to providers. Additionally, the implementation will only consist of a few components because the design leverages existing services and capabilities of AWS. The Ingest implementation will only consist of some reusable task code to make task implementation easier, Ingest deployment, and the Workflow Scheduler.

    Composability

    The design aims to satisfy the needs for ingest integrating different workflows for providers. It's flexible in terms of the ability to arrange tasks to meet the needs of a collection. Providers have developed and incorporated open source tools over the years. All of these are easily integrable into the workflows as tasks.

    There is low coupling between task steps. Failures of one component don't bring the whole system down. Individual tasks can be deployed separately.

    Scalability

AWS Step Functions scale up as needed and aren't limited by a set number of servers. They also easily allow you to leverage the inherent scalability of serverless functions.

    Monitoring and Auditing

    • Every execution is captured.
    • Every task run has captured input and outputs.
    • CloudWatch Metrics can be used for monitoring many of the events with the StepFunctions. It can also generate alarms for the whole process.
    • Visual report of the entire configuration.
      • Errors and success states are highlighted visually in the flow.

    Data Provenance

    • Monitoring and auditing ensures we know the data that was given to a task.
    • Workflows are versioned and the state machines stored in AWS Step Functions are immutable. Once created they cannot change.
    • Versioning of data in S3 or using immutable records in S3 will mean we always know what data was created as the result of a step or fed into a step.

    Appendix

    Example GIBS Ingest Architecture

    This shows the GIBS Ingest Architecture as an example of the use of the Ingest Workflow Architecture.

    • The GIBS Ingest Architecture consists of two workflows per collection type. There is one for discovery and one for ingest. The final stage of discovery triggers multiple ingest workflows for each MRF granule that needs to be generated.
    • It demonstrates both lambdas as tasks and a container used for MRF generation.

    GIBS Ingest Workflows

    Diagram showing the AWS Step Function execution path for a GIBS ingest workflow

    GIBS Ingest Granules Workflow

This shows a visualization of an execution of the ingest granules workflow in step functions. The steps highlighted in green are the ones that executed and completed successfully.

    Diagram showing the AWS Step Function execution path for a GIBS ingest granules workflow

diff --git a/docs/v14.1.0/workflows/input_output/index.html b/docs/v14.1.0/workflows/input_output/index.html
    Version: v14.1.0

    Workflow Inputs & Outputs

    General Structure

    Cumulus uses a common format for all inputs and outputs to workflows. The same format is used for input and output from workflow steps. The common format consists of a JSON object which holds all necessary information about the task execution and AWS environment. Tasks return objects identical in format to their input with the exception of a task-specific payload field. Tasks may also augment their execution metadata.

    Cumulus Message Adapter

    The Cumulus Message Adapter and Cumulus Message Adapter libraries help task developers integrate their tasks into a Cumulus workflow. These libraries adapt input and outputs from tasks into the Cumulus Message format. The Scheduler service creates the initial event message by combining the collection configuration, external resource configuration, workflow configuration, and deployment environment settings. The subsequent workflow messages between tasks must conform to the message schema. By using the Cumulus Message Adapter, individual task Lambda functions only receive the input and output specifically configured for the task, and not non-task-related message fields.

    The Cumulus Message Adapter libraries are called by the tasks with a callback function containing the business logic of the task as a parameter. They first adapt the incoming message to a format more easily consumable by Cumulus tasks, then invoke the task, and then adapt the task response back to the Cumulus message protocol to be sent to the next task.

    A task's Lambda function can be configured to include a Cumulus Message Adapter library which constructs input/output messages and resolves task configurations. The CMA can then be included in one of several ways:

    Lambda Layer

In order to make use of this configuration, a Lambda layer must be uploaded to your account. Due to platform restrictions, Core cannot currently support sharable public layers; however, you can deploy the appropriate version from the release page in two ways:

    Once you've deployed the layer, integrate the CMA layer with your Lambdas:

    • If using the cumulus module, set the cumulus_message_adapter_lambda_layer_version_arn in your .tfvars file to integrate the CMA layer with all core Cumulus lambdas.
• If including your own Lambda or ECS task Terraform modules, specify the CMA layer ARN in the Terraform resource definitions, as sketched below. Also, make sure to set the CUMULUS_MESSAGE_ADAPTER_DIR environment variable for the task to /opt for the CMA integration to work properly.
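A hedged Terraform sketch of that second option (resource and variable names are placeholders; only the CMA-related settings are shown):

resource "aws_lambda_function" "my_cma_task" {
  # ... function_name, filename, handler, role, runtime, etc. ...

  # attach the deployed CMA layer (ARN supplied by your deployment)
  layers = [var.cumulus_message_adapter_lambda_layer_version_arn]

  environment {
    variables = {
      # required so the handler can locate the CMA at /opt
      CUMULUS_MESSAGE_ADAPTER_DIR = "/opt"
    }
  }
}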

    In the future if you wish to update/change the CMA version you will need to update the deployed CMA, and update the layer configuration for the impacted Lambdas as needed.

    Please Note: Updating/removing a layer does not change a deployed Lambda, so to update the CMA you should deploy a new version of the CMA layer, update the associated Lambda configuration to reference the new CMA version, and re-deploy your Lambdas.

    Manual Addition

You can include the CMA package in the Lambda code in the cumulus-message-adapter sub-directory in your lambda .zip, for any Lambda runtime that includes a Python runtime. Python 2 is included in Lambda runtimes that use Amazon Linux; however, Amazon Linux 2 will not support this directly.

    Please note: It is expected that upcoming Cumulus releases will update the CMA layer to include a python runtime.

    If you are manually adding the message adapter to your source and utilizing the CMA, you should set the Lambda's CUMULUS_MESSAGE_ADAPTER_DIR environment variable to target the installation path for the CMA.

    CMA Input/Output

    Input to the task application code is a json object with keys:

    • input: By default, the incoming payload is the payload output from the previous task, or it can be a portion of the payload as configured for the task in the corresponding .tf workflow definition file.
    • config: Task-specific configuration object with URL templates resolved.

Output from the task application code is returned and placed in the payload key by default, but the config key can also be used to return just a portion of the task output.

    CMA configuration

    As of Cumulus > 1.15 and CMA > v1.1.1, configuration of the CMA is expected to be driven by AWS Step Function Parameters.

    Using the CMA package with the Lambda by any of the above mentioned methods (Lambda Layers, manual) requires configuration for its various features via a specific Step Function Parameters configuration format (see sample workflows in the examples cumulus-tf source for more examples):

    {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": "{some config}",
    "task_config": "{some config}"
    }
    }

    The "event.$": "$" parameter is required as it passes the entire incoming message to the CMA client library for parsing, and the CMA itself to convert the incoming message into a Cumulus message for use in the function.

    The following are the CMA's current configuration settings:

    ReplaceConfig (Cumulus Remote Message)

    Because of the potential size of a Cumulus message, mainly the payload field, a task can be set via configuration to store a portion of its output on S3 with a message key Remote Message that defines how to retrieve it and an empty JSON object {} in its place. If the portion of the message targeted exceeds the configured MaxSize (defaults to 0 bytes) it will be written to S3.

    The CMA remote message functionality can be configured using parameters in several ways:

    Partial Message

Setting the Path/TargetPath in the ReplaceConfig parameter (and optionally a non-default MaxSize):

{
  "DiscoverGranules": {
    "Parameters": {
      "cma": {
        "event.$": "$",
        "ReplaceConfig": {
          "MaxSize": 1,
          "Path": "$.payload",
          "TargetPath": "$.payload"
        }
      }
    }
  }
}

will result in any payload output larger than the MaxSize (in bytes) being written to S3. The CMA will then mark that the key has been replaced via a replace key on the event. When the CMA picks up the replace key in future steps, it will attempt to retrieve the output from S3 and write it back to payload.

    Note that you can optionally use a different TargetPath than Path, however as the target is a JSON path there must be a key to target for replacement in the output of that step. Also note that the JSON path specified must target one node, otherwise the CMA will error, as it does not support multiple replacement targets.

    If TargetPath is omitted, it will default to the value for Path.

    Full Message

    Setting the following parameters for a lambda:

DiscoverGranules:
  Parameters:
    cma:
      event.$: '$'
      ReplaceConfig:
        FullMessage: true

    will result in the CMA assuming the entire inbound message should be stored to S3 if it exceeds the default max size.

    This is effectively the same as doing:

{
  "DiscoverGranules": {
    "Parameters": {
      "cma": {
        "event.$": "$",
        "ReplaceConfig": {
          "MaxSize": 0,
          "Path": "$",
          "TargetPath": "$"
        }
      }
    }
  }
}

    Cumulus Message example

    {
    "task_config": {
    "inlinestr": "prefix{meta.foo}suffix",
    "array": "{[$.meta.foo]}",
    "object": "{$.meta}"
    },
    "cumulus_meta": {
    "message_source": "sfn",
    "state_machine": "arn:aws:states:us-east-1:1234:stateMachine:MySfn",
    "execution_name": "MyExecution__id-1234",
    "id": "id-1234"
    },
    "meta": {
    "foo": "bar"
    },
    "payload": {
    "anykey": "anyvalue"
    }
    }

    Cumulus Remote Message example

    The message may contain a reference to an S3 Bucket, Key and TargetPath as follows:

    {
    "replace": {
    "Bucket": "cumulus-bucket",
    "Key": "my-large-event.json",
    "TargetPath": "$"
    },
    "cumulus_meta": {}
    }

    task_config

This configuration key contains the input/output configuration values for definition of inputs/outputs via URL paths. Important: These values are all relative to the JSON object configured for event.$.

    This configuration's behavior is outlined in the CMA step description below.

    The configuration should follow the format:

{
  "FunctionName": {
    "Parameters": {
      "cma": {
        "event.$": "$",
        "other_cma_configuration": "<config object>",
        "task_config": "<task config>"
      }
    }
  }
}

    Example:

{
  "StepFunction": {
    "Parameters": {
      "cma": {
        "event.$": "$",
        "task_config": {
          "sfnEnd": true,
          "stack": "{$.meta.stack}",
          "bucket": "{$.meta.buckets.internal.name}",
          "stateMachine": "{$.cumulus_meta.state_machine}",
          "executionName": "{$.cumulus_meta.execution_name}",
          "cumulus_message": {
            "input": "{$}"
          }
        }
      }
    }
  }
}

    Cumulus Message Adapter Steps

    1. Reformat AWS Step Function message into Cumulus Message

    Due to the way AWS handles Parameterized messages, when Parameters are used the CMA takes an inbound message:

{
  "resource": "arn:aws:lambda:us-east-1:<lambda arn values>",
  "input": {
    "Other Parameter": {},
    "cma": {
      "ConfigKey": {
        "config values": "some config values"
      },
      "event": {
        "cumulus_meta": {},
        "payload": {},
        "meta": {},
        "exception": {}
      }
    }
  }
}

    and takes the following actions:

    • Takes the object at input.cma.event and makes it the full input
    • Merges all of the keys except event under input.cma into the parent input object

    This results in the incoming message (presumably a Cumulus message) with any cma configuration parameters merged in being passed to the CMA. All other parameterized values defined outside of the cma key are ignored

    2. Resolve Remote Messages

If the incoming Cumulus message has a replace key value, the CMA will attempt to pull the payload from S3.

For example, if the incoming message contains the following:

      "meta": {
    "foo": {}
    },
    "replace": {
    "TargetPath": "$.meta.foo",
    "Bucket": "some_bucket",
    "Key": "events/some-event-id"
    }

    The CMA will attempt to pull the file stored at Bucket/Key and replace the value at TargetPath, then remove the replace object entirely and continue.

    3. Resolve URL templates in the task configuration

In the workflow configuration (defined under the task_config key), each task has its own configuration, and it can use URL templates as values to achieve simplicity or for values only available at execution time. The Cumulus Message Adapter resolves the URL templates (relative to the event configuration key) and then passes the message to the next task. For example, given a task which has the following configuration:

{
  "Parameters": {
    "cma": {
      "event.$": "$",
      "task_config": {
        "provider": "{$.meta.provider}",
        "inlinestr": "prefix{meta.foo}suffix",
        "array": "{[$.meta.foo]}",
        "object": "{$.meta}"
      }
    }
  }
}

and an incoming message that contains:

    {
    "meta": {
    "foo": "bar",
    "provider": {
    "id": "FOO_DAAC",
    "anykey": "anyvalue"
    }
    }
    }

    The corresponding Cumulus Message would contain:

    "meta": {
    "foo": "bar",
    "provider": {
    "id": "FOO_DAAC",
    "anykey": "anyvalue"
    }
    },
    "task_config": {
    "provider": "{$.meta.provider}",
    "inlinestr": "prefix{meta.foo}suffix",
    "array": "{[$.meta.foo]}",
    "object": "{$.meta}"
    }

    The message sent to the task would be:

    "config" : {
    "provider": {
    "id": "FOO_DAAC",
    "anykey": "anyvalue"
    },
    "inlinestr": "prefixbarsuffix",
    "array": ["bar"],
    "object": {
    "foo": "bar",
    "provider": {
    "id": "FOO_DAAC",
    "anykey": "anyvalue"
    }
    },
    },
    "input": "{...}"

    URL template variables replace dotted paths inside curly brackets with their corresponding value. If the Cumulus Message Adapter cannot resolve a value, it will ignore the template, leaving it verbatim in the string. While seemingly complex, this allows significant decoupling of Tasks from one another and the data that drives them. Tasks are able to easily receive runtime configuration produced by previously run tasks and domain data.

    4. Resolve task input

By default, the incoming payload is the payload from the previous task. The task can also be configured to use a portion of the payload as its input message. For example, if a task specifies cma.task_config.cumulus_message.input:

ExampleTask:
  Parameters:
    cma:
      event.$: '$'
      task_config:
        cumulus_message:
          input: '{$.payload.foo}'

    The task configuration in the message would be:

        {
    "task_config": {
    "cumulus_message": {
    "input": "{$.payload.foo}"
    }
    },
    "payload": {
    "foo": {
    "anykey": "anyvalue"
    }
    }
    }

The Cumulus Message Adapter will resolve the task input; instead of sending the whole payload as task input, the task input would be:

        {
    "input" : {
    "anykey": "anyvalue"
    },
    "config": {...}
    }

    5. Resolve task output

By default, the task's return value is the next payload. However, the workflow task configuration can specify a portion of the return value as the next payload, and can also augment values to other fields. Based on the task configuration under cma.task_config.cumulus_message.outputs, the Message Adapter uses a task's return value to output a message as configured by the task-specific config defined under cma.task_config. The Message Adapter dispatches a "source" to a "destination" as defined by URL templates stored in the task-specific cumulus_message.outputs. The value of the task's return value at the "source" URL is used to create or replace the value of the task's return value at the "destination" URL. For example, given a task that specifies cumulus_message.outputs in its workflow configuration as follows:

{
  "ExampleTask": {
    "Parameters": {
      "cma": {
        "event.$": "$",
        "task_config": {
          "cumulus_message": {
            "outputs": [
              {
                "source": "{$}",
                "destination": "{$.payload}"
              },
              {
                "source": "{$.output.anykey}",
                "destination": "{$.meta.baz}"
              }
            ]
          }
        }
      }
    }
  }
}

    The corresponding Cumulus Message would be:

{
  "task_config": {
    "cumulus_message": {
      "outputs": [
        {
          "source": "{$}",
          "destination": "{$.payload}"
        },
        {
          "source": "{$.output.anykey}",
          "destination": "{$.meta.baz}"
        }
      ]
    }
  },
  "meta": {
    "foo": "bar"
  },
  "payload": {
    "anykey": "anyvalue"
  }
}

    Given the response from the task is:

        {
    "output": {
    "anykey": "boo"
    }
    }

    The Cumulus Message Adapter would output the following Cumulus Message:

{
  "task_config": {
    "cumulus_message": {
      "outputs": [
        {
          "source": "{$}",
          "destination": "{$.payload}"
        },
        {
          "source": "{$.output.anykey}",
          "destination": "{$.meta.baz}"
        }
      ]
    }
  },
  "meta": {
    "foo": "bar",
    "baz": "boo"
  },
  "payload": {
    "output": {
      "anykey": "boo"
    }
  }
}

    6. Apply Remote Message Configuration

    If the ReplaceConfig configuration parameter is defined, the CMA will evaluate the configuration options provided, and if required write a portion of the Cumulus Message to S3, and add a replace key to the message for future steps to utilize.

Please Note: the non-user-modifiable field cumulus_meta will always be retained, regardless of the configuration.

    For example, if the output message (post output configuration) from a cumulus message looks like:

{
  "cumulus_meta": {
    "some_key": "some_value"
  },
  "ReplaceConfig": {
    "FullMessage": true
  },
  "task_config": {
    "cumulus_message": {
      "outputs": [
        {
          "source": "{$}",
          "destination": "{$.payload}"
        },
        {
          "source": "{$.output.anykey}",
          "destination": "{$.meta.baz}"
        }
      ]
    }
  },
  "meta": {
    "foo": "bar",
    "baz": "boo"
  },
  "payload": {
    "output": {
      "anykey": "boo"
    }
  }
}

    the resultant output would look like:

    {
    "cumulus_meta": {
    "some_key": "some_value"
    },
    "replace": {
    "TargetPath": "$",
    "Bucket": "some-internal-bucket",
    "Key": "events/some-event-id"
    }
    }

    Additional features

    Validate task input, output and configuration messages against the schemas provided

    The Cumulus Message Adapter has the capability to validate task input, output and configuration messages against their schemas. The default location of the schemas is the schemas folder in the top level of the task and the default filenames are input.json, output.json, and config.json. The task can also configure a different schema location. If no schema can be found, the Cumulus Message Adapter will not validate the messages.
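For a Node.js task, a hedged sketch of that default layout (the task directory and handler file names are illustrative):

tasks/my-task/
├── index.js
└── schemas/
    ├── config.json
    ├── input.json
    └── output.json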

diff --git a/docs/v14.1.0/workflows/lambda/index.html b/docs/v14.1.0/workflows/lambda/index.html
    Version: v14.1.0

    Develop Lambda Functions

    Develop a new Cumulus Lambda

AWS provides a great getting started guide for building Lambdas in the developer guide.

    Cumulus currently supports the following environments for Cumulus Message Adapter enabled functions:

Additionally, you may choose to include any of the other languages AWS supports as a resource, with reduced feature support.

    Deploy a Lambda

    Node.js Lambda

For a new Node.js Lambda, create a new function and add an aws_lambda_function resource to your Cumulus deployment (for examples, see example/lambdas.tf and ingest/lambda-functions.tf in the source), in either a new .tf file or an existing .tf file:

    resource "aws_lambda_function" "myfunction" {
    function_name = "${var.prefix}-function"
    filename = "/path/to/zip/lambda.zip"
    source_code_hash = filebase64sha256("/path/to/zip/lambda.zip")
    handler = "index.handler"
    role = module.cumulus.lambda_processing_role_arn
    runtime = "nodejs10.x"

    vpc_config {
    subnet_ids = var.subnet_ids
    security_group_ids = var.security_group_ids
    }
    }

    Please note: This example contains the minimum set of required configuration.

    Make sure to include a vpc_config that matches the information you've provided the cumulus module if intending to integrate the lambda with a Cumulus deployment.

    Java Lambda

    Java Lambdas are created in much the same way as the Node.js example above.

    The source points to a folder with the compiled .class files and dependency libraries in the Lambda Java zip folder structure (details here), not an uber-jar.

    The deploy folder referenced here would contain a folder 'test_task/task/' which contains Task.class and TaskLogic.class as well as a lib folder containing dependency jars.

    Python Lambda

    Python Lambdas are created the same way as the Node.js example above.
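    A hedged sketch of a minimal Python Lambda resource, assuming a zip containing a handler.py module with a handler function (the path, handler name, and runtime version are illustrative):

    resource "aws_lambda_function" "my_python_task" {
      function_name    = "${var.prefix}-pythonTask"
      filename         = "/path/to/zip/python_task.zip"
      source_code_hash = filebase64sha256("/path/to/zip/python_task.zip")
      handler          = "handler.handler"  # <module>.<function> inside the zip (assumed)
      role             = module.cumulus.lambda_processing_role_arn
      runtime          = "python3.8"

      vpc_config {
        subnet_ids         = var.subnet_ids
        security_group_ids = var.security_group_ids
      }
    }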

    Cumulus Message Adapter

    For Lambdas wishing to utilize the Cumulus Message Adapter (CMA), you should define a layers key on your Lambda resource with the CMA you wish to include. See the input_output docs for more on how to create/use the CMA.
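    As a rough sketch, assuming the CMA layer has already been deployed and its version ARN is exposed to your deployment (the variable name cumulus_message_adapter_lambda_layer_version_arn below is an assumption; use whatever your deployment defines):

    resource "aws_lambda_function" "cma_enabled_task" {
      function_name    = "${var.prefix}-cmaTask"
      filename         = "/path/to/zip/lambda.zip"
      source_code_hash = filebase64sha256("/path/to/zip/lambda.zip")
      handler          = "index.handler"
      role             = module.cumulus.lambda_processing_role_arn
      runtime          = "nodejs10.x"

      # Attach the Cumulus Message Adapter as a Lambda layer
      layers = [var.cumulus_message_adapter_lambda_layer_version_arn]

      vpc_config {
        subnet_ids         = var.subnet_ids
        security_group_ids = var.security_group_ids
      }
    }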

    Other Lambda Options

    Cumulus supports all of the options available to you via the aws_lambda_function Terraform resource. For more information on what's available, check out the Terraform resource docs.
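    For instance, a hypothetical variation of the earlier example that sets a few commonly-used optional arguments (timeout, memory_size, environment variables, and tags; the specific values are illustrative only):

    resource "aws_lambda_function" "myfunction" {
      function_name    = "${var.prefix}-function"
      filename         = "/path/to/zip/lambda.zip"
      source_code_hash = filebase64sha256("/path/to/zip/lambda.zip")
      handler          = "index.handler"
      role             = module.cumulus.lambda_processing_role_arn
      runtime          = "nodejs10.x"

      # Optional arguments supported by the aws_lambda_function resource
      timeout     = 300
      memory_size = 512

      environment {
        variables = {
          stackName = var.prefix
        }
      }

      tags = { Deployment = var.prefix }

      vpc_config {
        subnet_ids         = var.subnet_ids
        security_group_ids = var.security_group_ids
      }
    }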

    Cloudwatch log groups

    If you want to enable Cloudwatch logging for your Lambda resource, you'll need to add an aws_cloudwatch_log_group resource to your Lambda definition:

    resource "aws_cloudwatch_log_group" "myfunction_log_group" {
    name = "/aws/lambda/${aws_lambda_function.myfunction.function_name}"
    retention_in_days = 30
    tags = { Deployment = var.prefix }
    }
    - + \ No newline at end of file diff --git a/docs/v14.1.0/workflows/protocol/index.html b/docs/v14.1.0/workflows/protocol/index.html index eaa04d54802..8a420c11876 100644 --- a/docs/v14.1.0/workflows/protocol/index.html +++ b/docs/v14.1.0/workflows/protocol/index.html @@ -5,13 +5,13 @@ Workflow Protocol | Cumulus Documentation - +
    Version: v14.1.0

    Workflow Protocol

    Configuration and Message Use Diagram

    A diagram showing at which point in a workflow the Cumulus message is checked for conformity with the message schema and where the configuration is checked for conformity with the configuration schema

    • Configuration - The Cumulus workflow configuration defines everything needed to describe an instance of Cumulus.
    • Scheduler - This starts ingest of a collection on configured intervals.
    • Input to Step Functions - The Scheduler uses the Configuration as source data to construct the input to the Workflow.
    • AWS Step Functions - Run the workflows as kicked off by the scheduler or other processes.
    • Input to Task - The input for each task is a JSON document that conforms to the message schema.
    • Output from Task - The output of each task must conform to the message schemas as well and is used as the input for the subsequent task.
    - + \ No newline at end of file diff --git a/docs/v14.1.0/workflows/workflow-configuration-how-to/index.html b/docs/v14.1.0/workflows/workflow-configuration-how-to/index.html index c802f544930..69c244f8800 100644 --- a/docs/v14.1.0/workflows/workflow-configuration-how-to/index.html +++ b/docs/v14.1.0/workflows/workflow-configuration-how-to/index.html @@ -5,7 +5,7 @@ Workflow Configuration How To's | Cumulus Documentation - + @@ -24,7 +24,7 @@ To take a subset of any given metadata, use the option substring.

    "url_path": "{cmrMetadata.Granule.Collection.ShortName}/{substring(file.fileName, 0, 3)}"

    This example will populate the url_path as "MOD09GQ/MOD"

    In addition to substring, several datetime-specific functions are available, which can parse a datetime string in the metadata and extract a certain part of it:

    "url_path": "{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}"

    or

     "url_path": "{dateFormat(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime, YYYY-MM-DD[T]HH[:]mm[:]ss)}"

    The following functions are implemented:

    • extractYear - returns the year, formatted as YYYY
    • extractMonth - returns the month, formatted as MM
    • extractDate - returns the day of the month, formatted as DD
    • extractHour - returns the hour in 24-hour format, with no leading zero
    • dateFormat - takes a second argument describing how to format the date, and passes the metadata date string and the format argument to moment().format()

    Note: the move-granules step needs to be in the workflow for this template to be populated and the file moved. The cmrMetadata (CMR granule XML) needs to have been generated and stored on S3 beforehand; from there, any field can be retrieved and used in a url_path.

    Adding Metadata dates and times to the URL Path

    There are a number of options to pull dates from the CMR file metadata. With this metadata:

    <Granule>
      <Temporal>
        <RangeDateTime>
          <BeginningDateTime>2003-02-19T00:00:00Z</BeginningDateTime>
          <EndingDateTime>2003-02-19T23:59:59Z</EndingDateTime>
        </RangeDateTime>
      </Temporal>
    </Granule>

    The following examples of url_path could be used.

    {extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the year from the full date: 2003.

    {extractMonth(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the month: 2.

    {extractDate(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the day: 19.

    {extractHour(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the hour: 0.

    Different values can be combined to create the url_path. For example

    {
      "bucket": "sample-protected-bucket",
      "name": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
      "url_path": "{cmrMetadata.Granule.Collection.ShortName}/{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}/{extractDate(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}"
    }

    The final file location for the above would be s3://sample-protected-bucket/MOD09GQ/2003/19/MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.

    - + \ No newline at end of file diff --git a/docs/v14.1.0/workflows/workflow-triggers/index.html b/docs/v14.1.0/workflows/workflow-triggers/index.html index 073a259cc6e..7a73ff1871c 100644 --- a/docs/v14.1.0/workflows/workflow-triggers/index.html +++ b/docs/v14.1.0/workflows/workflow-triggers/index.html @@ -5,13 +5,13 @@ Workflow Triggers | Cumulus Documentation - +
    Version: v14.1.0

    Workflow Triggers

    For a workflow to run, it needs to be associated with a rule (see rule configuration). The rule configuration determines how and when a workflow execution is triggered. Rules can be triggered one time, on a schedule, or by new data written to a kinesis stream.

    There are three lambda functions in the API package responsible for scheduling and starting workflows: SF scheduler, message consumer, and SF starter. Each Cumulus instance comes with a Start SF SQS queue.

    The SF scheduler lambda puts a message onto the Start SF queue. This message is picked up by the Start SF lambda, and an execution is started with the body of the message as the input.

    When a one time rule is created, the schedule SF lambda is triggered. Rules that are not one time are associated with a CloudWatch event, which manages triggering the lambdas that start the workflows.

    For a scheduled rule, the Cloudwatch event is triggered on the given schedule which calls directly to the schedule SF lambda.

    For a kinesis rule, when data is added to the kinesis stream, the Cloudwatch event is triggered, which calls the message consumer lambda. The message consumer lambda parses the kinesis message and finds all of the rules associated with that message. For each rule (which corresponds to one workflow), the schedule SF lambda is triggered to queue a message to start the workflow.

    For an sns rule, when a message is published to the SNS topic, the message consumer receives the SNS message (JSON expected), parses it into an object, starts a new execution of the workflow associated with the rule and passes the object in the payload field of the Cumulus message.

    Diagram showing how workflows are scheduled via rules

    - + \ No newline at end of file diff --git a/docs/v9.0.0/adding-a-task/index.html b/docs/v9.0.0/adding-a-task/index.html index 87ff30a63a0..66c4e4ad9ce 100644 --- a/docs/v9.0.0/adding-a-task/index.html +++ b/docs/v9.0.0/adding-a-task/index.html @@ -5,13 +5,13 @@ Contributing a Task | Cumulus Documentation - +
    Version: v9.0.0

    Contributing a Task

    We're tracking reusable Cumulus tasks in this list and, if you've got one you'd like to share with others, you can add it!

    Right now we're focused on tasks distributed via npm, but are open to including others. For now the script that pulls all the data for each package only supports npm.

    The tasks.md file is generated in the build process

    The tasks list in docs/tasks.md is generated from the list of task package names from the tasks folder.

    Do not edit the docs/tasks.md file directly.

    - + \ No newline at end of file diff --git a/docs/v9.0.0/api/index.html b/docs/v9.0.0/api/index.html index 694a8b5ead2..c8b15e3fd0a 100644 --- a/docs/v9.0.0/api/index.html +++ b/docs/v9.0.0/api/index.html @@ -5,13 +5,13 @@ Cumulus API | Cumulus Documentation - + - + \ No newline at end of file diff --git a/docs/v9.0.0/architecture/index.html b/docs/v9.0.0/architecture/index.html index 447195a5c1a..10b1deea799 100644 --- a/docs/v9.0.0/architecture/index.html +++ b/docs/v9.0.0/architecture/index.html @@ -5,14 +5,14 @@ Architecture | Cumulus Documentation - +
    Version: v9.0.0

    Architecture

    Architecture

    Below, find a diagram with the components that comprise an instance of Cumulus.

    Architecture diagram of a Cumulus deployment

    This diagram details all of the major architectural components of a Cumulus deployment.

    While the diagram can feel complex, it can easily be broken down into several major components:

    Data Distribution

    End Users can access data via Cumulus's distribution submodule, which includes ASF's thin egress application; this provides authenticated data egress, temporary S3 links, and other statistics features.

    End user exposure of Cumulus's holdings is expected to be provided by an external service.

    For NASA use, this is assumed to be CMR in this diagram.

    Data ingest

    Workflows

    The core of the ingest and processing capabilities in Cumulus is built into the deployed AWS Step Function workflows. Cumulus rules trigger workflows via either CloudWatch rules, Kinesis streams, SNS topics, or SQS queues. The workflows then run with a configured Cumulus message, utilizing built-in processes to report status of granules, PDRs, executions, etc. to the Data Persistence components.

    Workflows can optionally report granule metadata to CMR, and workflow steps can report metrics information to a shared SNS topic, which could be subscribed to for near real time granule, execution, and PDR status. This could be used for metrics reporting using an external ELK stack, for example.

    Data persistence

    Cumulus entity state data is stored in a set of DynamoDB database tables, and is exported to an ElasticSearch instance for non-authoritative querying/state data for the API and other applications that require more complex queries.

    Data discovery

    Discovering data for ingest is handled via workflow step components using Cumulus provider and collection configurations and various triggers. Data can be ingested from AWS S3, FTP, HTTPS and more.

    Maintenance

    System maintenance personnel have access to manage ingest and various portions of Cumulus via an AWS API gateway, as well as the operator dashboard.

    Deployment Structure

    Cumulus is deployed via Terraform and is organized internally into two separate top-level modules, as well as several external modules.

    Cumulus

    The Cumulus module, which contains multiple internal submodules, deploys all of the Cumulus components that are not part of the Data Persistence portion of this diagram.

    Data persistence

    The data persistence module provides the Data Persistence portion of the diagram.

    Other modules

    Other modules are provided as artifacts on the release page for use by users configuring their own deployments; they contain extracted subcomponents of the cumulus module. For more on these components, see the components documentation.

    For more on the specific structure, examples of use, and how to deploy, please see the deployment docs as well as the cumulus-template-deploy repo.
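    As a rough sketch of how the two top-level modules are typically wired together (the module names follow cumulus-template-deploy conventions, but the source URLs, version placeholder, and variables below are illustrative rather than a complete configuration):

    module "data_persistence" {
      source = "https://github.com/nasa/cumulus/releases/download/<version>/terraform-aws-cumulus.zip//tf-modules/data-persistence"

      prefix     = var.prefix
      subnet_ids = var.subnet_ids
      # ... additional data persistence configuration
    }

    module "cumulus" {
      source = "https://github.com/nasa/cumulus/releases/download/<version>/terraform-aws-cumulus.zip//tf-modules/cumulus"

      prefix = var.prefix

      # Wire the Cumulus module to the tables created by the data persistence module
      dynamo_tables = module.data_persistence.dynamo_tables

      # ... providers, buckets, and other Cumulus configuration
    }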

    - + \ No newline at end of file diff --git a/docs/v9.0.0/configuration/cloudwatch-retention/index.html b/docs/v9.0.0/configuration/cloudwatch-retention/index.html index 38b21e62c5d..a971e048be9 100644 --- a/docs/v9.0.0/configuration/cloudwatch-retention/index.html +++ b/docs/v9.0.0/configuration/cloudwatch-retention/index.html @@ -5,13 +5,13 @@ Cloudwatch Retention | Cumulus Documentation - +
    Version: v9.0.0

    Cloudwatch Retention

    Our lambdas dump logs to AWS CloudWatch. By default, these logs exist indefinitely. However, there are ways to specify a duration for log retention.

    aws-cli

    In addition to getting your aws-cli set-up, there are two values you'll need to acquire.

    1. log-group-name: the name of the log group whose retention policy (retention time) you'd like to change. We'll use /aws/lambda/KinesisInboundLogger in our examples.
    2. retention-in-days: the number of days you'd like to retain the logs in the specified log group for. There is a list of possible values available in the aws logs documentation.

    For example, if we wanted to set log retention to 30 days on our KinesisInboundLogger lambda, we would write:

    aws logs put-retention-policy --log-group-name "/aws/lambda/KinesisInboundLogger" --retention-in-days 30

    Note: The aws-cli log command that we're using is explained in detail here.

    AWS Management Console

    Changing the log retention policy in the AWS Management Console is a fairly simple process:

    1. Navigate to the CloudWatch service in the AWS Management Console.
    2. Click on the Logs entry on the sidebar.
    3. Find the Log Group whose retention policy you're interested in changing.
    4. Click on the value in the Expire Events After column.
    5. Enter/Select the number of days you'd like to retain logs in that log group for.

    Screenshot of AWS console showing how to configure the retention period for Cloudwatch logs

    - + \ No newline at end of file diff --git a/docs/v9.0.0/configuration/collection-storage-best-practices/index.html b/docs/v9.0.0/configuration/collection-storage-best-practices/index.html index 1a7fff99580..e03caecc89d 100644 --- a/docs/v9.0.0/configuration/collection-storage-best-practices/index.html +++ b/docs/v9.0.0/configuration/collection-storage-best-practices/index.html @@ -5,13 +5,13 @@ Collection Cost Tracking and Storage Best Practices | Cumulus Documentation - +
    Version: v9.0.0

    Collection Cost Tracking and Storage Best Practices

    Organizing your data is important for metrics you may want to collect. AWS S3 storage and cost metrics are calculated at the bucket level, so it is easy to get metrics by bucket. You can get storage metrics at the key prefix level, but that is done through the CLI, which can be very slow for large buckets. It is very difficult to estimate costs at the prefix level.

    Calculating Storage By Collection

    By bucket

    Usage by bucket can be obtained in your AWS Billing Dashboard via an S3 Usage Report. You can download your usage report for a period of time and review your storage and requests at the bucket level.

    Bucket metrics can also be found in the AWS CloudWatch Metrics Console (also see Using Amazon CloudWatch Metrics).

    Navigate to Storage Metrics and select the BucketName for all buckets you are interested in. The available metrics are BucketSizeInBytes and NumberOfObjects.

    In the Graphed metrics tab, you can select the type of statistic (i.e. average, minimum, maximum) and the period for the stats. At the top, it's useful to select from the dropdown to view the metrics as a number. You can also select the time period for which you want to see stats.

    Alternatively you can query CloudWatch using the CLI.

    This command will return the average number of bytes in the bucket test-bucket for 7/31/2019:

    aws cloudwatch get-metric-statistics --namespace AWS/S3 --start-time 2019-07-31T00:00:00 --end-time 2019-08-01T00:00:00 --period 86400 --statistics Average --region us-east-1 --metric-name BucketSizeBytes --dimensions Name=BucketName,Value=test-bucket Name=StorageType,Value=StandardStorage

    The result looks like:

    {
      "Datapoints": [
        {
          "Timestamp": "2019-07-31T00:00:00Z",
          "Average": 150996467959.0,
          "Unit": "Bytes"
        }
      ],
      "Label": "BucketSizeBytes"
    }

    By key prefix

    AWS does not offer storage and usage statistics at a key prefix level. Via the AWS CLI, you can get the total storage for a bucket or folder. The following command would get the storage for folder example-folder in bucket sample-bucket:

    aws s3 ls --summarize --human-readable --recursive s3://sample-bucket/example-folder | grep 'Total'

    Note that this can be a long-running operation for large buckets.

    Calculating Cost By Collection

    NASA NGAP Environment

    If using an NGAP account, the cost per bucket can be found in your CloudTamer console, in the Financials section of your account information. This is calculated on a monthly basis.

    There is no easy way to get the cost by folder in the buckets. You could calculate an estimate using the storage per prefix vs. the storage of the bucket.

    Outside of NGAP

    You can enable S3 Cost Allocation Tags and tag your buckets. From there, you can view the cost breakdown in your AWS Billing Dashboard via the Cost Explorer. Cost Allocation Tagging is available at the bucket level.

    There is no easy way to get the cost by folder in the buckets. You could calculate an estimate using the storage per prefix vs. the storage of the bucket.

    Storage Configuration

    Cumulus allows for the configuration of many buckets for your files. Buckets are created and added to your deployment as part of the deployment process.

    In your Cumulus collection configuration, you specify where you want the files to be stored post-processing. This is done by matching a regular expression on the file with the configured bucket.

    Note that in the collection configuration, the bucket field is the key to the buckets variable in the deployment's .tfvars file.
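    For illustration, a hedged sketch of what that buckets variable might look like in terraform.tfvars (the bucket names are placeholders); a file config with "bucket": "protected" would then resolve to the bucket defined under the protected key:

    buckets = {
      internal = {
        name = "my-prefix-internal"
        type = "internal"
      }
      private = {
        name = "my-prefix-private"
        type = "private"
      }
      protected = {
        name = "my-prefix-protected"
        type = "protected"
      }
      public = {
        name = "my-prefix-public"
        type = "public"
      }
    }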

    Organizing By Bucket

    You can specify separate groups of buckets for each collection, which could look like the example below.

    {
      "name": "MOD09GQ",
      "version": "006",
      "granuleId": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$",
      "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
      "files": [
        {
          "bucket": "MOD09GQ-006-protected",
          "regex": "^.*\\.hdf$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf"
        },
        {
          "bucket": "MOD09GQ-006-private",
          "regex": "^.*\\.hdf\\.met$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.met"
        },
        {
          "bucket": "MOD09GQ-006-protected",
          "regex": "^.*\\.cmr\\.xml$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.cmr.xml"
        },
        {
          "bucket": "MOD09GQ-006-public",
          "regex": "^.*\\.jpg$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_ndvi.jpg"
        }
      ]
    }

    Additional collections would go to different buckets.

    Organizing by Key Prefix

    Different collections can be organized into different folders in the same bucket, using the key prefix, which is specified as the url_path in the collection configuration. In this simplified collection configuration example, the url_path field is set at the top level so that all files go to a path prefixed with the collection name and version.

    {
      "name": "MOD09GQ",
      "version": "006",
      "granuleId": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$",
      "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}",
      "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
      "files": [
        {
          "bucket": "protected",
          "regex": "^.*\\.hdf$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf"
        },
        {
          "bucket": "private",
          "regex": "^.*\\.hdf\\.met$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.met"
        },
        {
          "bucket": "protected",
          "regex": "^.*\\.cmr\\.xml$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.cmr.xml"
        },
        {
          "bucket": "public",
          "regex": "^.*\\.jpg$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_ndvi.jpg"
        }
      ]
    }

    In this case, the path to all the files would be: MOD09GQ___006/<filename> in their respective buckets.

    The url_path can be overridden directly on the file configuration. The example below produces the same result.

    {
      "name": "MOD09GQ",
      "version": "006",
      "granuleId": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$",
      "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
      "files": [
        {
          "bucket": "protected",
          "regex": "^.*\\.hdf$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
          "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
        },
        {
          "bucket": "private",
          "regex": "^.*\\.hdf\\.met$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.met",
          "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
        },
        {
          "bucket": "protected-2",
          "regex": "^.*\\.cmr\\.xml$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.cmr.xml",
          "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
        },
        {
          "bucket": "public",
          "regex": "^.*\\.jpg$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_ndvi.jpg",
          "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
        }
      ]
    }
    - + \ No newline at end of file diff --git a/docs/v9.0.0/configuration/data-management-types/index.html b/docs/v9.0.0/configuration/data-management-types/index.html index 2e6115c13f7..2dcec752a75 100644 --- a/docs/v9.0.0/configuration/data-management-types/index.html +++ b/docs/v9.0.0/configuration/data-management-types/index.html @@ -5,13 +5,13 @@ Cumulus Data Management Types | Cumulus Documentation - +
    Version: v9.0.0

    Cumulus Data Management Types

    What Are The Cumulus Data Management Types

    • Collections: Collections are logical sets of data objects of the same data type and version. They provide contextual information used by Cumulus ingest.
    • Granules: Granules are the smallest aggregation of data that can be independently managed. They are always associated with a collection, which is a grouping of granules.
    • Providers: Providers generate and distribute input data that Cumulus obtains and sends to workflows.
    • Rules: Rules tell Cumulus how to associate providers and collections and when/how to start processing a workflow.
    • Workflows: Workflows are composed of one or more AWS Lambda Functions and ECS Activities to discover, ingest, process, manage, and archive data.
    • Executions: Executions are records of a workflow.
    • Reconciliation Reports: Reports are a comparison of data sets to check to see if they are in agreement and to help Cumulus users detect conflicts.

    Interaction

    • Providers tell Cumulus where to get new data - i.e. S3, HTTPS
    • Collections tell Cumulus where to store the data files
    • Rules tell Cumulus when to trigger a workflow execution and tie providers and collections together

    Managing Data Management Types

    The following are created via the dashboard or API:

    • Providers
    • Collections
    • Rules
    • Reconciliation reports

    Granules are created by workflow executions and then can be managed via the dashboard or API.

    An execution record is created for each workflow execution triggered and can be viewed in the dashboard or data can be retrieved via the API.

    Workflows are created and managed via the Cumulus deployment.

    Configuration Fields

    Schemas

    Looking at our API schema definitions can provide us with some insight into collections, providers, rules, and their attributes (and whether those are required or not). The schemas for the different concepts will be referenced throughout this document.

    The schemas are extremely useful for understanding which attributes are configurable and which of those are required. Cumulus uses these schemas for validation.

    Providers

    Please note:

    • While connection configuration is defined here, things that are more specific to a particular ingest setup (e.g. 'What target directory should we be pulling from' or 'How is duplicate handling configured?') are generally defined in a Rule or Collection, not the Provider.
    • There is some provider behavior which is controlled by task-specific configuration and not the provider definition. This configuration has to be set on a per-workflow basis. For example, see the httpListTimeout configuration on the discover-granules task

    Provider Configuration

    The Provider configuration is defined by a JSON object that takes different configuration keys depending on the provider type. The following are definitions of typical configuration values relevant for the various providers:

    Configuration by provider type

    S3

    | Key | Type | Required | Description |
    | --- | --- | --- | --- |
    | id | string | Yes | Unique identifier for the provider |
    | globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited |
    | protocol | string | Yes | The protocol for this provider. Must be s3 for this provider type. |
    | host | string | Yes | S3 Bucket to pull data from |

    http

    | Key | Type | Required | Description |
    | --- | --- | --- | --- |
    | id | string | Yes | Unique identifier for the provider |
    | globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited |
    | protocol | string | Yes | The protocol for this provider. Must be http for this provider type |
    | host | string | Yes | The host to pull data from (e.g. nasa.gov) |
    | username | string | No | Configured username for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication |
    | password | string | Only if username is specified | Configured password for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication |
    | port | integer | No | Port to connect to the provider on. Defaults to 80 |

    https

    | Key | Type | Required | Description |
    | --- | --- | --- | --- |
    | id | string | Yes | Unique identifier for the provider |
    | globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited |
    | protocol | string | Yes | The protocol for this provider. Must be https for this provider type |
    | host | string | Yes | The host to pull data from (e.g. nasa.gov) |
    | username | string | No | Configured username for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication |
    | password | string | Only if username is specified | Configured password for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication |
    | port | integer | No | Port to connect to the provider on. Defaults to 443 |

    ftp

    | Key | Type | Required | Description |
    | --- | --- | --- | --- |
    | id | string | Yes | Unique identifier for the provider |
    | globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited |
    | protocol | string | Yes | The protocol for this provider. Must be ftp for this provider type |
    | host | string | Yes | The ftp host to pull data from (e.g. nasa.gov) |
    | username | string | No | Username to use to connect to the ftp server. Cumulus encrypts this using KMS. Defaults to anonymous if not defined |
    | password | string | No | Password to use to connect to the ftp server. Cumulus encrypts this using KMS. Defaults to password if not defined |
    | port | integer | No | Port to connect to the provider on. Defaults to 21 |

    sftp

    | Key | Type | Required | Description |
    | --- | --- | --- | --- |
    | id | string | Yes | Unique identifier for the provider |
    | globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited |
    | protocol | string | Yes | The protocol for this provider. Must be sftp for this provider type |
    | host | string | Yes | The sftp host to pull data from (e.g. nasa.gov) |
    | username | string | No | Username to use to connect to the sftp server. |
    | password | string | No | Password to use to connect to the sftp server. |
    | port | integer | No | Port to connect to the provider on. Defaults to 22 |

    Collections

    Break down of [s3_MOD09GQ_006.json](https://github.com/nasa/cumulus/blob/master/example/data/collections/s3_MOD09GQ_006/s3_MOD09GQ_006.json)
    | Key | Value | Required | Description |
    | --- | --- | --- | --- |
    | name | "MOD09GQ" | Yes | The name attribute designates the name of the collection. This is the name under which the collection will be displayed on the dashboard |
    | version | "006" | Yes | A version tag for the collection |
    | granuleId | "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$" | Yes | The regular expression used to validate the granule ID extracted from filenames according to the granuleIdExtraction |
    | granuleIdExtraction | "(MOD09GQ\..*)(\.hdf\|\.cmr\|_ndvi\.jpg)" | Yes | The regular expression used to extract the granule ID from filenames. The first capturing group extracted from the filename by the regex will be used as the granule ID. |
    | sampleFileName | "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf" | Yes | An example filename belonging to this collection |
    | files | <JSON Object> of files defined here | Yes | Describe the individual files that will exist for each granule in this collection (size, browse, meta, etc.) |
    | dataType | "MOD09GQ" | No | Can be specified, but this value will default to the collection_name if not |
    | duplicateHandling | "replace" | No | ("replace"\|"version"\|"skip") determines granule duplicate handling scheme |
    | ignoreFilesConfigForDiscovery | false (default) | No | By default, during discovery only files that match one of the regular expressions in this collection's files attribute (see above) are ingested. Setting this to true will ignore the files attribute during discovery, meaning that all files for a granule (i.e., all files with filenames matching granuleIdExtraction) will be ingested even when they don't match a regular expression in the files attribute at discovery time. (NOTE: this attribute does not appear in the example file, but is listed here for completeness.) |
    | process | "modis" | No | Example options for this are found in the ChooseProcess step definition in the IngestAndPublish workflow definition |
    | meta | <JSON Object> of MetaData for the collection | No | MetaData for the collection. This metadata will be available to workflows for this collection via the Cumulus Message Adapter. |
    | url_path | "{cmrMetadata.Granule.Collection.ShortName}/{substring(file.name, 0, 3)}" | No | Filename without extension |

    files-object

    | Key | Value | Required | Description |
    | --- | --- | --- | --- |
    | regex | "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\.hdf$" | Yes | Regular expression used to identify the file |
    | sampleFileName | "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf" | Yes | Filename used to validate the provided regex |
    | type | "data" | No | Value to be assigned to the Granule File Type. CNM types are used by Cumulus CMR steps, non-CNM values will be treated as 'data' type. Currently only utilized in DiscoverGranules task |
    | bucket | "internal" | Yes | Name of the bucket where the file will be stored |
    | url_path | "${collectionShortName}/{substring(file.name, 0, 3)}" | No | Folder used to save the granule in the bucket. Defaults to the collection url_path |
    | checksumFor | "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\.hdf$" | No | If this is a checksum file, set checksumFor to the regex of the target file. |

    Rules

    Rules are used to start processing workflows and the transformation process. Rules can be invoked manually, based on a schedule, or can be configured to be triggered by either events in Kinesis, SNS messages, or SQS messages.

    Rule configuration
    | Key | Value | Required | Description |
    | --- | --- | --- | --- |
    | name | "L2_HR_PIXC_kinesisRule" | Yes | Name of the rule. This is the name under which the rule will be listed on the dashboard |
    | workflow | "CNMExampleWorkflow" | Yes | Name of the workflow to be run. A list of available workflows can be found on the Workflows page |
    | provider | "PODAAC_SWOT" | No | Configured provider's ID. This can be found on the Providers dashboard page |
    | collection | <JSON Object> collection object shown below | Yes | Name and version of the collection this rule will moderate. Relates to a collection configured and found in the Collections page |
    | payload | <JSON Object or Array> | No | The payload to be passed to the workflow |
    | meta | <JSON Object> of MetaData for the rule | No | MetaData for the rule. This metadata will be available to workflows for this rule via the Cumulus Message Adapter. |
    | rule | <JSON Object> rule type and associated values - discussed below | Yes | Object defining the type and subsequent attributes of the rule |
    | state | "ENABLED" | No | ("ENABLED"\|"DISABLED") whether or not the rule will be active. Defaults to "ENABLED". |
    | queueUrl | https://sqs.us-east-1.amazonaws.com/1234567890/queue-name | No | URL for SQS queue that will be used to schedule workflows for this rule |
    | tags | ["kinesis", "podaac"] | No | An array of strings that can be used to simplify search |

    collection-object

    | Key | Value | Required | Description |
    | --- | --- | --- | --- |
    | name | "L2_HR_PIXC" | Yes | Name of a collection defined/configured in the Collections dashboard page |
    | version | "000" | Yes | Version number of a collection defined/configured in the Collections dashboard page |

    meta-object

    | Key | Value | Required | Description |
    | --- | --- | --- | --- |
    | retries | 3 | No | Number of retries on errors, for sqs-type rule only. Defaults to 3. |
    | visibilityTimeout | 900 | No | VisibilityTimeout in seconds for the inflight messages, for sqs-type rule only. Defaults to the visibility timeout of the SQS queue when the rule is created. |

    rule-object

    | Key | Value | Required | Description |
    | --- | --- | --- | --- |
    | type | "kinesis" | Yes | ("onetime"\|"scheduled"\|"kinesis"\|"sns"\|"sqs") type of scheduling/workflow kick-off desired |
    | value | <String> Object | Depends | Discussion of valid values is below |

    rule-value

    The rule - value entry depends on the type of run:

    • If this is a onetime rule this can be left blank. Example
    • If this is a scheduled rule this field must hold a valid cron-type expression or rate expression.
    • If this is a kinesis rule, this must be a configured ${Kinesis_stream_ARN}. Example
    • If this is an sns rule, this must be an existing ${SNS_Topic_Arn}. Example
    • If this is an sqs rule, this must be an existing ${SQS_QueueUrl} that your account has permissions to access, and also you must configure a dead-letter queue for this SQS queue. Example

    sqs-type rule features

    • When an SQS rule is triggered, the SQS message remains on the queue.
    • The SQS message is not processed multiple times in parallel when visibility timeout is properly set. You should set the visibility timeout to the maximum expected length of the workflow with padding. Longer is better to avoid parallel processing.
    • The SQS message visibility timeout can be overridden by the rule.
    • Upon successful workflow execution, the SQS message is removed from the queue.
    • Upon failed execution(s), the workflow is run 3 times by default, or the configured number of times.
    • Upon failed execution(s), the visibility timeout will be set to 5s to allow retries.
    • After the configured number of failed retries, the SQS message is moved to the dead-letter queue configured for the SQS queue (see the sketch below).
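    A hedged Terraform sketch of an SQS queue suitable for an sqs-type rule, wired to a dead-letter queue via a redrive policy (the queue names and the specific timeout/retry values are illustrative assumptions):

    resource "aws_sqs_queue" "rule_dead_letter_queue" {
      name = "${var.prefix}-rule-input-dead-letter-queue"
    }

    resource "aws_sqs_queue" "rule_input_queue" {
      name                       = "${var.prefix}-rule-input-queue"
      visibility_timeout_seconds = 900  # at least the maximum expected workflow length, with padding

      redrive_policy = jsonencode({
        deadLetterTargetArn = aws_sqs_queue.rule_dead_letter_queue.arn
        maxReceiveCount     = 3  # failed receives before the message moves to the dead-letter queue
      })
    }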

    Configuration Via Cumulus Dashboard

    Create A Provider

    • In the Cumulus dashboard, go to the Provider page.

    Screenshot of Create Provider form

    • Click on Add Provider.
    • Fill in the form and then submit it.

    Screenshot of Create Provider form

    Create A Collection

    • Go to the Collections page.

    Screenshot of the Collections page

    • Click on Add Collection.
    • Copy and paste or fill in the collection JSON object form.

    Screenshot of Add Collection form

    • Once you submit the form, you should be able to verify that your new collection is in the list.

    Create A Rule

    1. Go To Rules Page
    • Go to the Cumulus dashboard, click on Rules in the navigation.
    • Click Add Rule.

    Screenshot of Rules page

    2. Complete Form
    • Fill out the template form.

    Screenshot of a Rules template for adding a new rule

    For more details regarding the field definitions and required information go to Data Cookbooks.

    Note: If the state field is left blank, it defaults to false.

    Rule Examples

    • A rule form with completed required fields:

    Screenshot of a completed rule form

    • A successfully added Rule:

    Screenshot of created rule

    - + \ No newline at end of file diff --git a/docs/v9.0.0/configuration/lifecycle-policies/index.html b/docs/v9.0.0/configuration/lifecycle-policies/index.html index ec43a19f035..8d192fa8f52 100644 --- a/docs/v9.0.0/configuration/lifecycle-policies/index.html +++ b/docs/v9.0.0/configuration/lifecycle-policies/index.html @@ -5,13 +5,13 @@ Setting S3 Lifecycle Policies | Cumulus Documentation - +
    Version: v9.0.0

    Setting S3 Lifecycle Policies

    This document will outline, in brief, how to set data lifecycle policies so that you are more easily able to control data storage costs while keeping your data accessible. For more information on why you might want to do this, see the 'Additional Information' section at the end of the document.

    Requirements

    • The AWS CLI installed and configured (if you wish to run the CLI example). See AWS's guide to setting up the AWS CLI for more on this. Please ensure the AWS CLI is in your shell path.
    • You will need an S3 bucket on AWS. You are strongly encouraged to use a bucket without voluminous amounts of data in it for experimenting/learning.
    • An AWS user with the appropriate roles to access the target bucket as well as modify bucket policies.

    Examples

    Walkthrough on setting time-based S3 Infrequent Access (S3IA) bucket policy

    This example will give step-by-step instructions on updating a bucket's lifecycle policy to move all objects in the bucket from the default storage to S3 Infrequent Access (S3IA) after a period of 90 days. Below are instructions for walking through configuration via the command line and the management console.

    Command Line

    Please ensure you have the AWS CLI installed and configured for access prior to attempting this example.

    Create policy

    From any directory you choose, open an editor and add the following to a file named exampleRule.json:

    {
      "Rules": [
        {
          "Status": "Enabled",
          "Filter": {
            "Prefix": ""
          },
          "Transitions": [
            {
              "Days": 90,
              "StorageClass": "STANDARD_IA"
            }
          ],
          "NoncurrentVersionTransitions": [
            {
              "NoncurrentDays": 90,
              "StorageClass": "STANDARD_IA"
            }
          ],
          "ID": "90DayS3IAExample"
        }
      ]
    }

    Set policy

    On the command line run the following command (with the bucket you're working with substituted in place of yourBucketNameHere).

    aws s3api put-bucket-lifecycle-configuration --bucket yourBucketNameHere --lifecycle-configuration file://exampleRule.json

    Verify policy has been set

    To obtain all of the existing policies for a bucket, run the following command (again substituting the correct bucket name):

    $ aws s3api get-bucket-lifecycle-configuration --bucket yourBucketNameHere
    {
      "Rules": [
        {
          "Status": "Enabled",
          "Filter": {
            "Prefix": ""
          },
          "Transitions": [
            {
              "Days": 90,
              "StorageClass": "STANDARD_IA"
            }
          ],
          "NoncurrentVersionTransitions": [
            {
              "NoncurrentDays": 90,
              "StorageClass": "STANDARD_IA"
            }
          ],
          "ID": "90DayS3IAExample"
        }
      ]
    }

    You have set a policy that transitions any version of an object in the bucket to S3IA after each object version has not been modified for 90 days.

    Management Console

    Create Policy

    To create the example policy on a bucket via the management console, go to the following URL (replacing 'yourBucketHere' with the bucket you intend to update):

    https://s3.console.aws.amazon.com/s3/buckets/yourBucketHere/?tab=overview

    You should see a screen similar to:

    Screenshot of AWS console for an S3 bucket

    Click the "Management" Tab, then lifecycle button and press + Add lifecycle rule:

    Screenshot of &quot;Management&quot; tab of AWS console for an S3 bucket

    Give the rule a name (e.g. '90DayRule'), leaving the filter blank:

    Screenshot of window for configuring the name and scope of a lifecycle rule on an S3 bucket in the AWS console

    Click next, and mark Current Version and Previous Versions.

    Then for each, click + Add transition and select Transition to Standard-IA after for the Object creation field, and set 90 for the Days after creation/Days after objects become noncurrent field. Your screen should look similar to:

    Screenshot of window for configuring the storage class transitions of a lifecycle rule on an S3 bucket in the AWS console

    Click next, then next past the Configure expiration screen (we won't be setting this), and on the fourth page, click Save:

    Screenshot of window for reviewing the configuration of a lifecycle rule on an S3 bucket in the AWS console

    You should now see you have a rule configured for your bucket:

    Screenshot of lifecycle rule appearing in the &quot;Management&quot; tab of AWS console for an S3 bucket

    You have now set a policy that transitions any version of an object in the bucket to S3IA after each object has not been modified for 90 days.

    Additional Information

    This section lists information you may want prior to enacting lifecycle policies. It is not required content for working through the examples.

    Strategy Overview

    For a discussion of overall recommended strategy, please review the Methodology for Data Lifecycle Management on the EarthData wiki.

    AWS Documentation

    The examples shown in this document are obviously fairly basic cases. By using object tags, filters and other configuration options you can enact far more complicated policies for various scenarios. For more reading on the topics presented on this page see:

    - + \ No newline at end of file diff --git a/docs/v9.0.0/configuration/monitoring-readme/index.html b/docs/v9.0.0/configuration/monitoring-readme/index.html index 56447bd3d4e..95bd2ea6405 100644 --- a/docs/v9.0.0/configuration/monitoring-readme/index.html +++ b/docs/v9.0.0/configuration/monitoring-readme/index.html @@ -5,14 +5,14 @@ Monitoring Best Practices | Cumulus Documentation - +
    Version: v9.0.0

    Monitoring Best Practices

    This document intends to provide a set of recommendations and best practices for monitoring the state of a deployed Cumulus and diagnosing any issues.

    Cumulus-provided resources and integrations for monitoring

    Cumulus provides a number of resources that are useful for monitoring the system and its operation.

    Cumulus Dashboard

    The primary tool for monitoring the Cumulus system is the Cumulus Dashboard. The dashboard is hosted on Github and includes instructions on how to deploy and link it into your core Cumulus deployment.

    The dashboard displays workflow executions, their status, inputs, outputs, and some diagnostic information such as logs. For further information on the dashboard, its usage, and the information it provides, see the documentation.

    Cumulus-provided AWS resources

    Cumulus sets up CloudWatch log groups for all Core-provided tasks.

    Monitoring Lambda Functions

    Logging for each Lambda Function is available in Lambda-specific CloudWatch log groups.

    Monitoring ECS services

    Each deployed cumulus_ecs_service module also includes a CloudWatch log group for the processes running on ECS.

    Monitoring workflows

    For advanced debugging, we also configure dead letter queues on critical system functions. These will allow you to monitor and debug invalid inputs to the functions we use to start workflows, which can be helpful if you find that you are not seeing workflows being started as expected. More information on these can be found in the dead letter queue documentation

    AWS recommendations

    AWS has a number of recommendations on system monitoring. Rather than reproduce those here and risk providing outdated guidance, we've documented the following links which will take you to available AWS docs on monitoring recommendations and best practices for the services used in Cumulus:

    Example: Setting up email notifications for CloudWatch logs

    Cumulus does not provide out-of-the-box support for email notifications at this time. However, setting up email notifications on AWS is fairly straightforward in that the operative components are an AWS SNS topic and a subscribed email address.

    In terms of Cumulus integration, forwarding CloudWatch logs requires creating a mechanism, most likely a Lambda Function subscribed to the log group that will receive, filter and forward these messages to the SNS topic.

    As a very simple example, we could create a function that filters CloudWatch logs created by the @cumulus/logger package and sends email notifications for error and fatal log levels, adapting the example linked above:

    const zlib = require('zlib');
    const aws = require('aws-sdk');
    const { promisify } = require('util');

    const gunzip = promisify(zlib.gunzip);
    const sns = new aws.SNS();

    exports.handler = async (event) => {
      // CloudWatch Logs delivers subscription data as base64-encoded, gzipped JSON
      const payload = Buffer.from(event.awslogs.data, 'base64');
      const decompressedData = await gunzip(payload);
      const logData = JSON.parse(decompressedData.toString('ascii'));
      return Promise.all(logData.logEvents.map(async (logEvent) => {
        // @cumulus/logger writes structured JSON messages with a "level" field
        const logMessage = JSON.parse(logEvent.message);
        if (['error', 'fatal'].includes(logMessage.level)) {
          // Forward error/fatal messages to the SNS topic backing the email subscription
          return sns.publish({
            TopicArn: process.env.EmailReportingTopicArn,
            Message: logEvent.message
          }).promise();
        }
        return Promise.resolve();
      }));
    };

    After creating the SNS topic, we can deploy this code as a lambda function, following the setup steps from Amazon. Make sure to include your SNS topic ARN as an environment variable on the lambda function by using the --environment option on aws lambda create-function.

    You will need to create subscription filters for each log group you want to receive emails for. We recommend automating this as much as possible, and you could very well handle this via Terraform, such as using a module to deploy filters alongside log groups, or exporting the log group names to an all-in-one email notification module.
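    A hedged Terraform sketch of that approach, assuming the forwarding Lambda above is managed as aws_lambda_function.log_email_forwarder and the target log group as aws_cloudwatch_log_group.some_task (both names are hypothetical):

    # Allow CloudWatch Logs to invoke the forwarding Lambda
    resource "aws_lambda_permission" "allow_cloudwatch_logs" {
      statement_id  = "AllowExecutionFromCloudWatchLogs"
      action        = "lambda:InvokeFunction"
      function_name = aws_lambda_function.log_email_forwarder.function_name
      principal     = "logs.amazonaws.com"
      source_arn    = "${aws_cloudwatch_log_group.some_task.arn}:*"
    }

    # Forward the log group's events to the Lambda, which filters by log level
    resource "aws_cloudwatch_log_subscription_filter" "some_task_error_emails" {
      name            = "${var.prefix}-some-task-error-emails"
      log_group_name  = aws_cloudwatch_log_group.some_task.name
      filter_pattern  = ""  # empty pattern forwards everything
      destination_arn = aws_lambda_function.log_email_forwarder.arn
      depends_on      = [aws_lambda_permission.allow_cloudwatch_logs]
    }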

    - + \ No newline at end of file diff --git a/docs/v9.0.0/configuration/server_access_logging/index.html b/docs/v9.0.0/configuration/server_access_logging/index.html index 94d635c6a98..5fd35f8b800 100644 --- a/docs/v9.0.0/configuration/server_access_logging/index.html +++ b/docs/v9.0.0/configuration/server_access_logging/index.html @@ -5,13 +5,13 @@ S3 Server Access Logging | Cumulus Documentation - +
    Version: v9.0.0

    S3 Server Access Logging

    Note: To support EMS Reporting, you need to enable Amazon S3 server access logging on all protected and public buckets.

    Via AWS Console

    Enable server access logging for an S3 bucket

    Via AWS Command Line Interface

    1. Create a logging.json file with these contents, replacing <stack-internal-bucket> with your stack's internal bucket name, and <stack> with the name of your cumulus stack.

      {
        "LoggingEnabled": {
          "TargetBucket": "<stack-internal-bucket>",
          "TargetPrefix": "<stack>/ems-distribution/s3-server-access-logs/"
        }
      }
    2. Add the logging policy to each of your protected and public buckets by calling this command on each bucket.

      aws s3api put-bucket-logging --bucket <protected/public-bucket-name> --bucket-logging-status file://logging.json
    3. Verify the logging policy exists on your buckets.

      aws s3api get-bucket-logging --bucket <protected/public-bucket-name>
    - + \ No newline at end of file diff --git a/docs/v9.0.0/data-cookbooks/about-cookbooks/index.html b/docs/v9.0.0/data-cookbooks/about-cookbooks/index.html index 3e9dce3771e..5dc6d7d8d0b 100644 --- a/docs/v9.0.0/data-cookbooks/about-cookbooks/index.html +++ b/docs/v9.0.0/data-cookbooks/about-cookbooks/index.html @@ -5,13 +5,13 @@ About Cookbooks | Cumulus Documentation - +
    Version: v9.0.0

    About Cookbooks

    Introduction

    The following data cookbooks are documents containing examples and explanations of workflows in the Cumulus framework. Additionally, the following data cookbooks should serve to help unify an institution/user group on a set of terms.

    Setup

    The data cookbooks assume you can configure providers, collections, and rules to run workflows. Visit Cumulus data management types for information on how to configure Cumulus data management types.

    Adding a page

    As shown in detail in the "Add a New Page and Sidebars" section in Cumulus Docs: How To's, you can add a new page to the data cookbook by creating a markdown (.md) file in the docs/data-cookbooks directory. The new page can then be linked to the sidebar by adding it to the Data-Cookbooks object in the website/sidebar.json file as data-cookbooks/${id}.

    More about workflows

    Workflow general information

    Input & Output

    Developing Workflow Tasks

    Workflow Configuration How-to's

    - + \ No newline at end of file diff --git a/docs/v9.0.0/data-cookbooks/browse-generation/index.html b/docs/v9.0.0/data-cookbooks/browse-generation/index.html index d95dcbcda46..893bcaa1309 100644 --- a/docs/v9.0.0/data-cookbooks/browse-generation/index.html +++ b/docs/v9.0.0/data-cookbooks/browse-generation/index.html @@ -5,7 +5,7 @@ Ingest Browse Generation | Cumulus Documentation - + @@ -15,7 +15,7 @@ provider keys with the previously entered values) Note that you need to set the "provider_path" to the path on your bucket (e.g. "/data") that you've staged your mock/test data.:

    {
      "name": "TestBrowseGeneration",
      "workflow": "DiscoverGranulesBrowseExample",
      "provider": "{{provider_from_previous_step}}",
      "collection": {
        "name": "MOD09GQ",
        "version": "006"
      },
      "meta": {
        "provider_path": "{{path_to_data}}"
      },
      "rule": {
        "type": "onetime"
      },
      "state": "ENABLED",
      "updatedAt": 1553053438767
    }

    Run Workflows

    Once you've configured the Collection and Provider and added a onetime rule, you're ready to trigger your rule, and watch the ingest workflows process.

    Go to the Rules tab, click the rule you just created:

    Screenshot of the Rules overview page with a list of rules in the Cumulus dashboard

    Then click the gear in the upper right corner and click "Rerun":

    Screenshot of clicking the button to rerun a workflow rule from the rule edit page in the Cumulus dashboard

    Tab over to executions and you should see the DiscoverGranulesBrowseExample workflow run, succeed, and then moments later the CookbookBrowseExample should run and succeed.

    Screenshot of page listing executions in the Cumulus dashboard

    Results

    You can verify your data has ingested by clicking the successful workflow entry:

    Screenshot of individual entry from table listing executions in the Cumulus dashboard

    Select "Show Output" on the next page

    Screenshot of &quot;Show output&quot; button from individual execution page in the Cumulus dashboard

    and you should see in the payload from the workflow something similar to:

    "payload": {
    "process": "modis",
    "granules": [
    {
    "files": [
    {
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "filepath": "MOD09GQ___006/2017/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "type": "data",
    "bucket": "cumulus-test-sandbox-protected",
    "filename": "s3://cumulus-test-sandbox-protected/MOD09GQ___006/2017/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "time": 1553027415000,
    "path": "data",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}/{substring(file.name, 0, 3)}",
    "duplicate_found": true,
    "size": 1908635
    },
    {
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "filepath": "MOD09GQ___006/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "type": "metadata",
    "bucket": "cumulus-test-sandbox-private",
    "filename": "s3://cumulus-test-sandbox-private/MOD09GQ___006/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "time": 1553027412000,
    "path": "data",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{substring(file.name, 0, 3)}",
    "duplicate_found": true,
    "size": 21708
    },
    {
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
    "filepath": "MOD09GQ___006/2017/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
    "type": "browse",
    "bucket": "cumulus-test-sandbox-protected",
    "filename": "s3://cumulus-test-sandbox-protected/MOD09GQ___006/2017/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
    "time": 1553027415000,
    "path": "data",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}/{substring(file.name, 0, 3)}",
    "duplicate_found": true,
    "size": 1908635
    },
    {
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.cmr.xml",
    "filepath": "MOD09GQ___006/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.cmr.xml",
    "type": "metadata",
    "bucket": "cumulus-test-sandbox-protected-2",
    "filename": "s3://cumulus-test-sandbox-protected-2/MOD09GQ___006/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.cmr.xml",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{substring(file.name, 0, 3)}"
    }
    ],
    "cmrLink": "https://cmr.uat.earthdata.nasa.gov/search/granules.json?concept_id=G1222231611-CUMULUS",
    "cmrConceptId": "G1222231611-CUMULUS",
    "granuleId": "MOD09GQ.A2016358.h13v04.006.2016360104606",
    "cmrMetadataFormat": "echo10",
    "dataType": "MOD09GQ",
    "version": "006",
    "published": true
    }
    ]
    }

    You can verify the granules exist within your Cumulus instance (search using the Granules interface, check the S3 buckets, etc.) and validate the above CMR entry.


    Build Processing Lambda

    This section discusses the construction of a custom processing lambda to replace the contrived example from this entry for a real dataset processing task.

    To ingest your own data using this example, you will need to construct your own lambda to replace the source in ProcessingStep that will generate browse imagery and provide or update a CMR metadata export file.

    You will then need to add the lambda to your Cumulus deployment as an aws_lambda_function Terraform resource.

    The discussion below outlines requirements for this lambda.

    Inputs

    The incoming message to the task defined in the ProcessingStep as configured will have the following configuration values (accessible inside event.config courtesy of the message adapter):

    Configuration

    • event.config.bucket -- the name of the bucket configured in terraform.tfvars as your internal bucket.

    • event.config.collection -- The full collection object we will configure in the Configure Ingest section. You can view the expected collection schema in the docs here or in the source code on github. You need this as available input and output so you can update as needed.

    event.config.additionalUrls, generateFakeBrowse and event.config.cmrMetadataFormat from the example can be ignored as they're configuration flags for the provided example script.

    Payload

    The 'payload' from the previous task is accessible via event.input. The expected payload output schema from SyncGranules can be viewed here.

    In our example, the payload would look like the following. Note: The types are set per-file based on what we configured in our collection, and were initially added as part of the DiscoverGranules step in the DiscoverGranulesBrowseExample workflow.

     "payload": {
    "process": "modis",
    "granules": [
    {
    "granuleId": "MOD09GQ.A2016358.h13v04.006.2016360104606",
    "dataType": "MOD09GQ",
    "version": "006",
    "files": [
    {
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "type": "data",
    "bucket": "cumulus-test-sandbox-internal",
    "filename": "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "fileStagingDir": "file-staging/jk2/MOD09GQ___006",
    "time": 1553027415000,
    "path": "data",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}/{substring(file.name, 0, 3)}",
    "size": 1908635
    },
    {
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "type": "metadata",
    "bucket": "cumulus-test-sandbox-internal",
    "filename": "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "fileStagingDir": "file-staging/jk2/MOD09GQ___006",
    "time": 1553027412000,
    "path": "data",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{substring(file.name, 0, 3)}",
    "size": 21708
    }
    ]
    }
    ]
    }

    Generating Browse Imagery

The example script provided goes through all granules and adds a 'fake' .jpg browse file to the same staging location as the data staged by prior ingest tasks.

The processing lambda you construct will need to do the following (a rough sketch follows this list):

• Create a browse image file based on the input data, and stage it in an S3 bucket at a location accessible to this task as well as the downstream FilesToGranules and MoveGranules tasks.
    • Add the browse file to the input granule files, making sure to set the granule file's type to browse.
    • Update meta.input_granules with the updated granules list, as well as provide the files to be integrated by FilesToGranules as output from the task.
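A rough sketch of the browse-generation step in Python/boto3 is shown below; the helper name, staging conventions, and placeholder browse content are illustrative only, not part of any Cumulus API:

import boto3

s3 = boto3.client("s3")


def add_browse_file(granule, staging_bucket, staging_dir):
    """Create a placeholder browse image, stage it to S3, and append it
    to the granule's file list with type set to 'browse'."""
    browse_name = f"{granule['granuleId']}.jpg"
    browse_key = f"{staging_dir}/{browse_name}"

    # A real task would render this from the granule's data file.
    s3.put_object(Bucket=staging_bucket, Key=browse_key, Body=b"fake browse image")

    granule["files"].append({
        "name": browse_name,
        "type": "browse",  # required so downstream tasks treat the file as browse
        "bucket": staging_bucket,
        "filename": f"s3://{staging_bucket}/{browse_key}",
        "fileStagingDir": staging_dir,
    })
    return f"s3://{staging_bucket}/{browse_key}"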

    Generating/updating CMR metadata

If you do not already have a CMR file in the granules list, you will need to generate one for valid export. This example's processing script generates a CMR file and adds it to the FilesToGranules file list via the payload, but it can also be present in the InputGranules from the DiscoverGranules task if you'd prefer to pre-generate it.

The downstream tasks MoveGranules, UpdateGranulesCmrMetadataFileLinks, and PostToCmr all expect a valid CMR file to be available if you want to export to CMR.
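If you need to generate the CMR file yourself, a heavily simplified Python/boto3 sketch follows; the template is an illustrative fragment only, and a real task must produce metadata that validates against the ECHO10 (or UMM-G) schema for your collection:

import boto3

s3 = boto3.client("s3")

# Illustrative fragment only -- not a complete, valid ECHO10 record.
ECHO10_TEMPLATE = """<Granule>
  <GranuleUR>{granule_id}</GranuleUR>
  <Collection>
    <ShortName>{short_name}</ShortName>
    <VersionId>{version}</VersionId>
  </Collection>
</Granule>
"""


def stage_cmr_file(granule, collection, staging_bucket, staging_dir):
    cmr_name = f"{granule['granuleId']}.cmr.xml"
    cmr_key = f"{staging_dir}/{cmr_name}"
    body = ECHO10_TEMPLATE.format(
        granule_id=granule["granuleId"],
        short_name=collection["name"],
        version=collection["version"],
    )
    s3.put_object(Bucket=staging_bucket, Key=cmr_key, Body=body.encode())
    granule["files"].append({
        "name": cmr_name,
        "type": "metadata",
        "bucket": staging_bucket,
        "filename": f"s3://{staging_bucket}/{cmr_key}",
        "fileStagingDir": staging_dir,
    })
    return f"s3://{staging_bucket}/{cmr_key}"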

    Expected Outputs for processing task/tasks

In the above example, the critical portions of the output to FilesToGranules are the payload and meta.input_granules.

In the example provided, the processing task is set up to return an object with the keys "files" and "granules". In the cumulus_message configuration, "files" is mapped to the payload and "granules" to meta.input_granules:

              "task_config": {
    "inputGranules": "{$.meta.input_granules}",
    "granuleIdExtraction": "{$.meta.collection.granuleIdExtraction}"
    }

The expected values of these outputs in the example above may be useful in constructing a processing task:

    payload

The payload includes a full list of files to be 'moved' into the Cumulus archive. The FilesToGranules task will take this list, merge it with the information from InputGranules, then pass that list to the MoveGranules task. The MoveGranules task will then move the files to their targets. The UpdateGranulesCmrMetadataFileLinks task will update the CMR metadata file, if it exists, with the updated granule locations, and will update the CMR file etags.

    In the provided example, a payload being passed to the FilesToGranules task should be expected to look like:

      "payload": [
    "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
    "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.cmr.xml"
    ]

This is the list of files FilesToGranules will act upon to add/merge with the input_granules object.

The paths shown are generated by sync-granules, but in principle the files can be staged wherever you like, so long as the processing/MoveGranules tasks' roles have access and the filenames match the collection configuration.

    input_granules

The FilesToGranules task utilizes the incoming payload to choose which files to move, but pulls all other metadata from meta.input_granules. As such, the meta.input_granules output in the example would look like:

    "input_granules": [
    {
    "granuleId": "MOD09GQ.A2016358.h13v04.006.2016360104606",
    "dataType": "MOD09GQ",
    "version": "006",
    "files": [
    {
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "type": "data",
    "bucket": "cumulus-test-sandbox-internal",
    "filename": "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "fileStagingDir": "file-staging/jk2/MOD09GQ___006",
    "time": 1553027415000,
    "path": "data",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}/{substring(file.name, 0, 3)}",
    "duplicate_found": true,
    "size": 1908635
    },
    {
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "type": "metadata",
    "bucket": "cumulus-test-sandbox-internal",
    "filename": "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "fileStagingDir": "file-staging/jk2/MOD09GQ___006",
    "time": 1553027412000,
    "path": "data",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{substring(file.name, 0, 3)}",
    "duplicate_found": true,
    "size": 21708
    },
    {
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
    "type": "browse",
    "bucket": "cumulus-test-sandbox-internal",
    "filename": "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
    "fileStagingDir": "file-staging/jk2/MOD09GQ___006",
    "time": 1553027415000,
    "path": "data",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}/{substring(file.name, 0, 3)}",
    "duplicate_found": true,
    }
    ]
    }
    ],
    Version: v9.0.0

    Choice States

    Cumulus supports AWS Step Function Choice states. A Choice state enables branching logic in Cumulus workflows.

Choice state definitions include a list of Choice Rules. Each Choice Rule defines a logical operation that compares an input value against a specified value using a comparison operator. For available comparison operators, review the AWS docs.

    If the comparison evaluates to true, the Next state is followed.

    Example

    In examples/cumulus-tf/parse_pdr_workflow.tf the ParsePdr workflow uses a Choice state, CheckAgainChoice, to terminate the workflow once meta.isPdrFinished: true is returned by the CheckStatus state.

    The CheckAgainChoice state definition requires an input object of the following structure:

    {
    "meta": {
    "isPdrFinished": false
    }
    }

    Given the above input to the CheckAgainChoice state, the workflow would transition to the PdrStatusReport state.

    "CheckAgainChoice": {
    "Type": "Choice",
    "Choices": [
    {
    "Variable": "$.meta.isPdrFinished",
    "BooleanEquals": false,
    "Next": "PdrStatusReport"
    },
    {
    "Variable": "$.meta.isPdrFinished",
    "BooleanEquals": true,
    "Next": "WorkflowSucceeded"
    }
    ],
    "Default": "WorkflowSucceeded"
    }

    Advanced: Loops in Cumulus Workflows

    Understanding the complete ParsePdr workflow is not necessary to understanding how Choice states work, but ParsePdr provides an example of how Choice states can be used to create a loop in a Cumulus workflow.

In the complete ParsePdr workflow definition, the state QueueGranules is followed by CheckStatus. From CheckStatus a loop starts: while CheckStatus returns meta.isPdrFinished: false, CheckStatus is followed by CheckAgainChoice, which is followed by PdrStatusReport, which is followed by WaitForSomeTime, which returns to CheckStatus. Once CheckStatus returns meta.isPdrFinished: true, CheckAgainChoice proceeds to WorkflowSucceeded.

    Execution graph of SIPS ParsePdr workflow in AWS Step Functions console

    Further documentation

    For complete details on Choice state configuration options, see the Choice state documentation.

    Version: v9.0.0

    CNM Workflow

This entry documents how to set up a workflow that utilizes the built-in CNM/Kinesis functionality in Cumulus.

    Prior to working through this entry you should be familiar with the Cloud Notification Mechanism.

    Sections


    Prerequisites

    Cumulus

This entry assumes you have a deployed instance of Cumulus (version >= 1.16.0). It also assumes you are deploying Cumulus via the cumulus terraform module sourced from the release page.

    AWS CLI

    This entry assumes you have the AWS CLI installed and configured. If you do not, please take a moment to review the documentation - particularly the examples relevant to Kinesis - and install it now.

    Kinesis

This entry assumes you already have two Kinesis data streams created for use as the CNM notification and response data streams.

If you do not have two streams set up, please take a moment to review the Kinesis documentation and set up two basic single-shard streams for this example:

    Using the "Create Data Stream" button on the Kinesis Dashboard, work through the dialogue.

    You should be able to quickly use the "Create Data Stream" button on the Kinesis Dashboard, and setup streams that are similar to the following example:

    Screenshot of AWS console page for creating a Kinesis stream

    Please bear in mind that your {{prefix}}-lambda-processing IAM role will need permissions to write to the response stream for this workflow to succeed if you create the Kinesis stream with a dashboard user. If you are using the cumulus top-level module for your deployment this should be set properly.

If not, the most straightforward approach is to attach the AmazonKinesisFullAccess policy for the stream resource to whatever role your Lambdas are using; however, your environment/security policies may require an approach specific to your deployment environment.

In operational environments, science data providers would typically be responsible for providing a Kinesis stream with the appropriate permissions.

    For more information on how this process works and how to develop a process that will add records to a stream, read the Kinesis documentation and the developer guide.

    Source Data

    This entry will run the SyncGranule task against a single target data file. To that end it will require a single data file to be present in an S3 bucket matching the Provider configured in the next section.
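For example, one quick way to stage a test file is with boto3; the bucket, key, and local filename below are placeholders and must match the provider and collection configuration you create next:

import boto3

s3 = boto3.client("s3")

s3.upload_file(
    Filename="MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    Bucket="your-provider-bucket",
    Key="cumulus-test-data/pdrs/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
)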

    Collection and Provider

    Cumulus will need to be configured with a Collection and Provider entry of your choosing. The provider should match the location of the source data from the Ingest Source Data section.

This can be done via the Cumulus Dashboard, if installed, or via the API. It is strongly recommended to use the dashboard if possible.


    Configure the Workflow

Provided the prerequisites have been fulfilled, you can begin adding the needed values to your Cumulus configuration to set up the example workflow.

The following steps are required to configure your Cumulus instance to run the example workflow:

    Example CNM Workflow

    In this example, we're going to trigger a workflow by creating a Kinesis rule and sending a record to a Kinesis stream.

    The following workflow definition should be added to a new .tf workflow resource (e.g. cnm_workflow.tf) in your deployment directory. For the complete CNM workflow example, see examples/cumulus-tf/kinesis_trigger_test_workflow.tf.

Add the workflow definition below to the new terraform file in your deployment directory, updating the following:

    • Set the response-endpoint key in the CnmResponse task in the workflow JSON to match the name of the Kinesis response stream you configured in the prerequisites section
    • Update the source key to the workflow module to match the Cumulus release associated with your deployment.
    module "cnm_workflow" {
    source = "https://github.com/nasa/cumulus/releases/download/{version}/terraform-aws-cumulus-workflow.zip"

    prefix = var.prefix
    name = "CNMExampleWorkflow"
    workflow_config = module.cumulus.workflow_config
    system_bucket = var.system_bucket

state_machine_definition = <<JSON
{
    "CNMExampleWorkflow": {
    "Comment": "CNMExampleWorkflow",
    "StartAt": "TranslateMessage",
    "States": {
    "TranslateMessage": {
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_to_cma_task.arn}",
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "collection": "{$.meta.collection}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.cnm}",
    "destination": "{$.meta.cnm}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "CnmResponse"
    }
    ],
    "Next": "SyncGranule"
    },
    "SyncGranule": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "provider": "{$.meta.provider}",
    "buckets": "{$.meta.buckets}",
    "collection": "{$.meta.collection}",
    "downloadBucket": "{$.meta.buckets.private.name}",
    "stack": "{$.meta.stack}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.granules}",
    "destination": "{$.meta.input_granules}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${module.cumulus.sync_granule_task.task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "IntervalSeconds": 10,
    "MaxAttempts": 3
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "CnmResponse"
    }
    ],
    "Next": "CnmResponse"
    },
    "CnmResponse": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "OriginalCNM": "{$.meta.cnm}",
    "distribution_endpoint": "{$.meta.distribution_endpoint}",
    "response-endpoint": "ADD YOUR RESPONSE STREAM NAME HERE",
    "region": "us-east-1",
    "type": "kinesis",
    "WorkflowException": "{$.exception}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.cnm}",
    "destination": "{$.meta.cnmResponse}"
    },
    {
    "source": "{$.input.input}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_response_task.arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "IntervalSeconds": 5,
    "MaxAttempts": 3
    }
    ],
    "End": true
    }
    }
    }
    }
JSON
}

    Again, please make sure to modify the value response-endpoint to match the stream name (not ARN) for your Kinesis response stream.

    Lambda Configuration

    To execute this workflow, you're required to include several Lambda resources in your deployment. To do this, add the following task (Lambda) definitions to your deployment along with the workflow you created above:

    Please note: To utilize these tasks you need to ensure you have a compatible CMA layer. See the deployment instructions for more details on how to deploy a CMA layer.

    Below is a description of each of these tasks:

    CNMToCMA

    CNMToCMA is meant for the beginning of a workflow: it maps CNM granule information to a payload for downstream tasks. For other CNM workflows, you would need to ensure that downstream tasks in your workflow either understand the CNM message or include a translation task like this one.

    You can also manipulate the data sent to downstream tasks using task_config for various states in your workflow resource configuration. Read more about how to configure data on the Workflow Input & Output page.

    CnmResponse

    The CnmResponse Lambda generates a CNM response message and puts it on the response-endpoint Kinesis stream.

    You can read more about the expected schema of a CnmResponse record in the Cloud Notification Mechanism schema repository.

    Additional Tasks

    Lastly, this entry also makes use of the SyncGranule task from the cumulus module.

    Redeploy

    Once the above configuration changes have been made, redeploy your stack.

    Please refer to Update Cumulus resources in the deployment documentation if you are unfamiliar with redeployment.

    Rule Configuration

    Cumulus includes a messageConsumer Lambda function (message-consumer). Cumulus kinesis-type rules create the event source mappings between Kinesis streams and the messageConsumer Lambda. The messageConsumer Lambda consumes records from one or more Kinesis streams, as defined by enabled kinesis-type rules. When new records are pushed to one of these streams, the messageConsumer triggers workflows associated with the enabled kinesis-type rules.

    To add a rule via the dashboard (if you'd like to use the API, see the docs here), navigate to the Rules page and click Add a rule, then configure the new rule using the following template (substituting correct values for parameters denoted by ${}):

    {
    "collection": {
    "name": "L2_HR_PIXC",
    "version": "000"
    },
    "name": "L2_HR_PIXC_kinesisRule",
    "provider": "PODAAC_SWOT",
    "rule": {
    "type": "kinesis",
    "value": "arn:aws:kinesis:{{awsRegion}}:{{awsAccountId}}:stream/{{streamName}}"
    },
    "state": "ENABLED",
    "workflow": "CNMExampleWorkflow"
    }

    Please Note:

• The rule's value attribute must match the Amazon Resource Name (ARN) for the Kinesis data stream you've preconfigured. You should be able to obtain this ARN from the Kinesis Dashboard entry for the selected stream.
• The collection and provider should match the collection and provider you set up in the Prerequisites section.

Once you've clicked 'submit', a new rule should appear in the dashboard's Rule Overview.


    Execute the Workflow

    Once Cumulus has been redeployed and a rule has been added, we're ready to trigger the workflow and watch it execute.

    How to Trigger the Workflow

    To trigger matching workflows, you will need to put a record on the Kinesis stream that the message-consumer Lambda will recognize as a matching event. Most importantly, it should include a collection name that matches a valid collection.

    For the purpose of this example, the easiest way to accomplish this is using the AWS CLI.

    Create Record JSON

Construct a JSON file containing an object that matches the values that have been previously set up. This JSON object should be a valid Cloud Notification Mechanism message.

    Please note: this example is somewhat contrived, as the downstream tasks don't care about most of these fields. A 'real' data ingest workflow would.

    The following values (denoted by ${} in the sample below) should be replaced to match values we've previously configured:

    • TEST_DATA_FILE_NAME: The filename of the test data that is available in the S3 (or other) provider we created earlier.
    • TEST_DATA_URI: The full S3 path to the test data (e.g. s3://bucket-name/path/granule)
    • COLLECTION: The collection name defined in the prerequisites for this product
    {
    "product": {
    "files": [
    {
    "checksumType": "md5",
    "name": "${TEST_DATA_FILE_NAME}",
    "checksum": "bogus_checksum_value",
    "uri": "${TEST_DATA_URI}",
    "type": "data",
    "size": 12345678
    }
    ],
    "name": "${TEST_DATA_FILE_NAME}",
    "dataVersion": "006"
    },
    "identifier ": "testIdentifier123456",
    "collection": "${COLLECTION}",
    "provider": "TestProvider",
    "version": "001",
    "submissionTime": "2017-09-30T03:42:29.791198"
    }

    Add Record to Kinesis Data Stream

    Using the JSON file you created, push it to the Kinesis notification stream:

    aws kinesis put-record --stream-name YOUR_KINESIS_NOTIFICATION_STREAM_NAME_HERE --partition-key 1 --data file:///path/to/file.json

    Please note: The above command uses the stream name, not the ARN.

    The command should return output similar to:

    {
    "ShardId": "shardId-000000000000",
    "SequenceNumber": "42356659532578640215890215117033555573986830588739321858"
    }

    This command will put a record containing the JSON from the --data flag onto the Kinesis data stream. The messageConsumer Lambda will consume the record and construct a valid CMA payload to trigger workflows. For this example, the record will trigger the CNMExampleWorkflow workflow as defined by the rule previously configured.

    You can view the current running executions on the Executions dashboard page which presents a list of all executions, their status (running, failed, or completed), to which workflow the execution belongs, along with other information.
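If you would rather trigger the workflow programmatically than via the CLI, an equivalent boto3 sketch (the stream name and file path are placeholders) might look like:

import boto3

kinesis = boto3.client("kinesis")

with open("/path/to/file.json", "rb") as f:
    record_data = f.read()

response = kinesis.put_record(
    StreamName="YOUR_KINESIS_NOTIFICATION_STREAM_NAME_HERE",
    PartitionKey="1",
    Data=record_data,
)
print(response["ShardId"], response["SequenceNumber"])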

    Verify Workflow Execution

As detailed above, once the record is added to the Kinesis data stream, the messageConsumer Lambda will trigger the CNMExampleWorkflow.

    TranslateMessage

    TranslateMessage (which corresponds to the CNMToCMA Lambda) will take the CNM object payload and add a granules object to the CMA payload that's consistent with other Cumulus ingest tasks, and add a meta.cnm key (as well as the payload) to store the original message.

    For more on the Message Adapter, please see the Message Flow documentation.

    An example of what is happening in the CNMToCMA Lambda is as follows:

    Example Input Payload:

    "payload": {
    "identifier ": "testIdentifier123456",
    "product": {
    "files": [
    {
    "checksumType": "md5",
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "checksum": "bogus_checksum_value",
    "uri": "s3://some_bucket/cumulus-test-data/pdrs/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "type": "data",
    "size": 12345678
    }
    ],
    "name": "TestGranuleUR",
    "dataVersion": "006"
    },
    "version": "123456",
    "collection": "MOD09GQ",
    "provider": "TestProvider",
    "submissionTime": "2017-09-30T03:42:29.791198"
    }

    Example Output Payload:

      "payload": {
    "cnm": {
    "identifier ": "testIdentifier123456",
    "product": {
    "files": [
    {
    "checksumType": "md5",
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "checksum": "bogus_checksum_value",
    "uri": "s3://some-bucket/cumulus-test-data/data/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "type": "data",
    "size": 12345678
    }
    ],
    "name": "TestGranuleUR",
    "dataVersion": "006"
    },
    "version": "123456",
    "collection": "MOD09GQ",
    "provider": "TestProvider",
    "submissionTime": "2017-09-30T03:42:29.791198",
    "receivedTime": "2017-09-30T03:42:31.634552"
    },
    "output": {
    "granules": [
    {
    "granuleId": "TestGranuleUR",
    "files": [
    {
    "path": "some-bucket/data",
    "url_path": "s3://some-bucket/cumulus-test-data/data/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "bucket": "some-bucket",
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "size": 12345678
    }
    ]
    }
    ]
    }
    }

    SyncGranules

    This Lambda will take the files listed in the payload and move them to s3://{deployment-private-bucket}/file-staging/{deployment-name}/{COLLECTION}/{file_name}.

    CnmResponse

    Assuming a successful execution of the workflow, this task will recover the meta.cnm key from the CMA output, and add a "SUCCESS" record to the notification Kinesis stream.

    If a prior step in the workflow has failed, this will add a "FAILURE" record to the stream instead.

    The data written to the response-endpoint should adhere to the Response Message Fields schema.

    Example CNM Success Response:

    {
    "provider": "PODAAC_SWOT",
    "collection": "SWOT_Prod_l2:1",
    "processCompleteTime": "2017-09-30T03:45:29.791198",
    "submissionTime": "2017-09-30T03:42:29.791198",
    "receivedTime": "2017-09-30T03:42:31.634552",
    "identifier": "1234-abcd-efg0-9876",
    "response": {
    "status": "SUCCESS"
    }
    }

    Example CNM Error Response:

    {
    "provider": "PODAAC_SWOT",
    "collection": "SWOT_Prod_l2:1",
    "processCompleteTime": "2017-09-30T03:45:29.791198",
    "submissionTime": "2017-09-30T03:42:29.791198",
    "receivedTime": "2017-09-30T03:42:31.634552",
    "identifier": "1234-abcd-efg0-9876",
    "response": {
    "status": "FAILURE",
    "errorCode": "PROCESSING_ERROR",
    "errorMessage": "File [cumulus-dev-a4d38f59-5e57-590c-a2be-58640db02d91/prod_20170926T11:30:36/production_file.nc] did not match gve checksum value."
    }
    }

    Note the CnmResponse state defined in the .tf workflow definition above configures $.exception to be passed to the CnmResponse Lambda keyed under config.WorkflowException. This is required for the CnmResponse code to deliver a failure response.

    To test the failure scenario, send a record missing the product.name key.


    Verify results

    Check for successful execution on the dashboard

Following the execution of this workflow, you should expect to see it complete successfully on the dashboard:

    Screenshot of a successful CNM workflow appearing on the executions page of the Cumulus dashboard

    Check the test granule has been delivered to S3 staging

    The test granule identified in the Kinesis record should be moved to the deployment's private staging area.

    Check for Kinesis records

    A SUCCESS notification should be present on the response-endpoint Kinesis stream.

You should be able to validate that the notification and response streams have the expected records with the following steps (the AWS CLI Kinesis Basic Stream Operations documentation is useful to review before proceeding):

    Get a shard iterator (substituting your stream name as appropriate):

    aws kinesis get-shard-iterator \
    --shard-id shardId-000000000000 \
    --shard-iterator-type LATEST \
    --stream-name NOTIFICATION_OR_RESPONSE_STREAM_NAME

which should return output similar to:

    {
    "ShardIterator": "VeryLongString=="
    }
• Re-trigger the workflow by using the put-record command from the Add Record to Kinesis Data Stream step above.
    • As the workflow completes, use the output from the get-shard-iterator command to request data from the stream:
    aws kinesis get-records --shard-iterator SHARD_ITERATOR_VALUE

    This should result in output similar to:

    {
    "Records": [
    {
    "SequenceNumber": "49586720336541656798369548102057798835250389930873978882",
    "ApproximateArrivalTimestamp": 1532664689.128,
    "Data": "eyJpZGVudGlmaWVyICI6InRlc3RJZGVudGlmaWVyMTIzNDU2IiwidmVyc2lvbiI6IjAwNiIsImNvbGxlY3Rpb24iOiJNT0QwOUdRIiwicHJvdmlkZXIiOiJUZXN0UHJvdmlkZXIiLCJwcm9kdWN0U2l6ZSI6MTkwODYzNS4wLCJyZXNwb25zZSI6eyJzdGF0dXMiOiJTVUNDRVNTIn0sInByb2Nlc3NDb21wbGV0ZVRpbWUiOiIyMDE4LTA3LTI3VDA0OjExOjI4LjkxOSJ9",
    "PartitionKey": "1"
    },
    {
    "SequenceNumber": "49586720336541656798369548102059007761070005796999266306",
    "ApproximateArrivalTimestamp": 1532664707.149,
    "Data": "eyJpZGVudGlmaWVyICI6InRlc3RJZGVudGlmaWVyMTIzNDU2IiwidmVyc2lvbiI6IjAwNiIsImNvbGxlY3Rpb24iOiJNT0QwOUdRIiwicHJvdmlkZXIiOiJUZXN0UHJvdmlkZXIiLCJwcm9kdWN0U2l6ZSI6MTkwODYzNS4wLCJyZXNwb25zZSI6eyJzdGF0dXMiOiJTVUNDRVNTIn0sInByb2Nlc3NDb21wbGV0ZVRpbWUiOiIyMDE4LTA3LTI3VDA0OjExOjQ2Ljk1OCJ9",
    "PartitionKey": "1"
    }
    ],
    "NextShardIterator": "AAAAAAAAAAFo9SkF8RzVYIEmIsTN+1PYuyRRdlj4Gmy3dBzsLEBxLo4OU+2Xj1AFYr8DVBodtAiXbs3KD7tGkOFsilD9R5tA+5w9SkGJZ+DRRXWWCywh+yDPVE0KtzeI0andAXDh9yTvs7fLfHH6R4MN9Gutb82k3lD8ugFUCeBVo0xwJULVqFZEFh3KXWruo6KOG79cz2EF7vFApx+skanQPveIMz/80V72KQvb6XNmg6WBhdjqAA==",
    "MillisBehindLatest": 0
    }

Note the data encoding is not human readable and would need to be parsed/converted to be interpretable. There are many options to build a Kinesis consumer, such as the KCL.
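If you'd prefer to script the check, a small Python/boto3 sketch that pulls and decodes records (the stream and shard names are assumed to match the examples above) could look like:

import base64
import json

import boto3

kinesis = boto3.client("kinesis")

iterator = kinesis.get_shard_iterator(
    StreamName="NOTIFICATION_OR_RESPONSE_STREAM_NAME",
    ShardId="shardId-000000000000",
    ShardIteratorType="TRIM_HORIZON",
)["ShardIterator"]

for record in kinesis.get_records(ShardIterator=iterator)["Records"]:
    # boto3 returns Data as raw bytes (already base64-decoded); if you are
    # parsing AWS CLI output instead, base64-decode the Data field first.
    payload = record["Data"]
    if isinstance(payload, str):
        payload = base64.b64decode(payload)
    print(json.dumps(json.loads(payload), indent=2))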

For purposes of validating the workflow, it may be simpler to locate the workflow in the Step Function Management Console and assert the expected output is similar to the example below.

    Successful CNM Response Object Example:

    {
    "cnmResponse": {
    "provider": "TestProvider",
    "collection": "MOD09GQ",
    "version": "123456",
    "processCompleteTime": "2017-09-30T03:45:29.791198",
    "submissionTime": "2017-09-30T03:42:29.791198",
    "receivedTime": "2017-09-30T03:42:31.634552",
    "identifier ": "testIdentifier123456",
    "response": {
    "status": "SUCCESS"
    }
    }
    }

    Kinesis Record Error Handling

    messageConsumer

    The default Kinesis stream processing in the Cumulus system is configured for record error tolerance.

    When the messageConsumer fails to process a record, the failure is captured and the record is published to the kinesisFallback SNS Topic. The kinesisFallback SNS topic broadcasts the record and a subscribed copy of the messageConsumer Lambda named kinesisFallback consumes these failures.

At this point, the normal Lambda asynchronous invocation retry behavior will attempt to process the record 3 more times. After this, if the record cannot successfully be processed, it is written to a dead letter queue. Cumulus' dead letter queue is an SQS Queue named kinesisFailure. Operators can use this queue to inspect failed records.

This system ensures that when messageConsumer fails to process a record and trigger a workflow, the record is retried 3 times. This retry behavior improves system reliability in case of any external service failure outside of Cumulus control.

    The Kinesis error handling system - the kinesisFallback SNS topic, messageConsumer Lambda, and kinesisFailure SQS queue - come with the API package and do not need to be configured by the operator.

To examine records that could not be processed at any step, inspect the dead letter queue {{prefix}}-kinesisFailure in the Simple Queue Service (SQS) console. Select your queue, and under the Queue Actions tab, choose View/Delete Messages. Start polling for messages and you will see records that failed to process through the messageConsumer.
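Alternatively, a short Python/boto3 sketch can poll the dead letter queue (the queue name assumes your deployment prefix):

import boto3

sqs = boto3.client("sqs")

queue_url = sqs.get_queue_url(QueueName="<prefix>-kinesisFailure")["QueueUrl"]

messages = sqs.receive_message(
    QueueUrl=queue_url,
    MaxNumberOfMessages=10,
    WaitTimeSeconds=10,
).get("Messages", [])

for message in messages:
    # Each body is a record the messageConsumer could not process.
    print(message["Body"])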

    Note, these are only records that occurred when processing records from Kinesis streams. Workflow failures are handled differently.

    Kinesis Stream logging

    Notification Stream messages

    Cumulus includes two Lambdas (KinesisInboundEventLogger and KinesisOutboundEventLogger) that utilize the same code to take a Kinesis record event as input, deserialize the data field and output the modified event to the logs.

    When a kinesis rule is created, in addition to the messageConsumer event mapping, an event mapping is created to trigger KinesisInboundEventLogger to record a log of the inbound record, to allow for analysis in case of unexpected failure.

    Response Stream messages

    Cumulus also supports this feature for all outbound messages. To take advantage of this feature, you will need to set an event mapping on the KinesisOutboundEventLogger Lambda that targets your response-endpoint. You can do this in the Lambda management page for KinesisOutboundEventLogger. Add a Kinesis trigger, and configure it to target the cnmResponseStream for your workflow:

    Screenshot of the AWS console showing configuration for Kinesis stream trigger on KinesisOutboundEventLogger Lambda

    Once this is done, all records sent to the response-endpoint will also be logged in CloudWatch. For more on configuring Lambdas to trigger on Kinesis events, please see creating an event source mapping.

Error Handling in Workflows

...Service Exception. See this documentation on configuring your workflow to handle transient lambda errors.

    Example state machine definition:

    {
    "Comment": "Tests Workflow from Kinesis Stream",
    "StartAt": "TranslateMessage",
    "States": {
    "TranslateMessage": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.cnm}",
    "destination": "{$.meta.cnm}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_to_cma_task.arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": ["States.ALL"],
    "ResultPath": "$.exception",
    "Next": "CnmResponseFail"
    }
    ],
    "Next": "SyncGranule"
    },
    "SyncGranule": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "Path": "$.payload",
    "TargetPath": "$.payload"
    },
    "task_config": {
    "provider": "{$.meta.provider}",
    "buckets": "{$.meta.buckets}",
    "collection": "{$.meta.collection}",
    "downloadBucket": "{$.meta.buckets.private.name}",
    "stack": "{$.meta.stack}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.granules}",
    "destination": "{$.meta.input_granules}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${module.cumulus.sync_granule_task.task_arn}",
    "Retry": [
    {
    "ErrorEquals": ["States.ALL"],
    "IntervalSeconds": 10,
    "MaxAttempts": 3
    }
    ],
    "Catch": [
    {
    "ErrorEquals": ["States.ALL"],
    "ResultPath": "$.exception",
    "Next": "CnmResponseFail"
    }
    ],
    "Next": "CnmResponse"
    },
    "CnmResponse": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "OriginalCNM": "{$.meta.cnm}",
    "CNMResponseStream": "{$.meta.cnmResponseStream}",
    "region": "us-east-1",
    "WorkflowException": "{$.exception}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$}",
    "destination": "{$.meta.cnmResponse}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_response_task.arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": ["States.ALL"],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "Next": "WorkflowSucceeded"
    },
    "CnmResponseFail": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "OriginalCNM": "{$.meta.cnm}",
    "CNMResponseStream": "{$.meta.cnmResponseStream}",
    "region": "us-east-1",
    "WorkflowException": "{$.exception}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$}",
    "destination": "{$.meta.cnmResponse}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_response_task.arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": ["States.ALL"],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "Next": "WorkflowFailed"
    },
    "WorkflowSucceeded": {
    "Type": "Succeed"
    },
    "WorkflowFailed": {
    "Type": "Fail",
    "Cause": "Workflow failed"
    }
    }
    }

    The above results in a workflow which is visualized in the diagram below:

    Screenshot of a visualization of an AWS Step Function workflow definition with branching logic for failures

    Summary

    Error handling should (mostly) be the domain of workflow configuration.

    Version: v9.0.0

    HelloWorld Workflow

This example task is meant to be a sanity check/introduction to Cumulus workflows.

    Pre-Deployment Configuration

    Workflow Configuration

    A workflow definition can be found in the template repository hello_world_workflow module.

    {
    "Comment": "Returns Hello World",
    "StartAt": "HelloWorld",
    "States": {
    "HelloWorld": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "buckets": "{$.meta.buckets}",
    "provider": "{$.meta.provider}",
    "collection": "{$.meta.collection}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${module.cumulus.hello_world_task.task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "End": true
    }
    }
    }

    Workflow error-handling can be configured as discussed in the Error-Handling cookbook.

    Task Configuration

The HelloWorld task is provided for you as part of the cumulus terraform module, so no configuration is needed.

    If you want to manually deploy your own version of this Lambda for testing, you can copy the Lambda resource definition located in the Cumulus source code at cumulus/tf-modules/ingest/hello-world-task.tf. The Lambda source code is located in the Cumulus source code at 'cumulus/tasks/hello-world'.

    Execution

    We will focus on using the Cumulus dashboard to schedule the execution of a HelloWorld workflow.

    Our goal here is to create a rule through the Cumulus dashboard that will define the scheduling and execution of our HelloWorld workflow. Let's navigate to the Rules page and click Add a rule.

    {
    "collection": { # collection values can be configured and found on the Collections page
    "name": "${collection_name}",
    "version": "${collection_version}"
    },
    "name": "helloworld_rule",
    "provider": "${provider}", # found on the Providers page
    "rule": {
    "type": "onetime"
    },
    "state": "ENABLED",
    "workflow": "HelloWorldWorkflow" # This can be found on the Workflows page
    }

    Screenshot of AWS Step Function execution graph for the HelloWorld workflow Executed workflow as seen in AWS Console

    Output/Results

    The Executions page presents a list of all executions, their status (running, failed, or completed), to which workflow the execution belongs, along with other information. The rule defined in the previous section should start an execution of its own accord, and the status of that execution can be tracked here.

    To get some deeper information on the execution, click on the value in the Name column of your execution of interest. This should bring up a visual representation of the workflow similar to that shown above, execution details, and a list of events.

    Summary

    Setting up the HelloWorld workflow on the Cumulus dashboard is the tip of the iceberg, so to speak. The task and step-function need to be configured before Cumulus deployment. A compatible collection and provider must be configured and applied to the rule. Finally, workflow execution status can be viewed via the workflows tab on the dashboard.

    Version: v9.0.0

    Ingest Notification in Workflows

    On deployment, an SQS queue and three SNS topics are created and used for handling notification messages related to the workflow.

    The sfEventSqsToDbRecords Lambda function reads from the sfEventSqsToDbRecordsInputQueue queue and updates DynamoDB. The DynamoDB events for the ExecutionsTable, GranulesTable and PdrsTable are streamed on DynamoDBStreams, which are read by the publishExecutions, publishGranules and publishPdrs Lambda functions, respectively.

    These Lambda functions publish to the three SNS topics both when the workflow starts and when it reaches a terminal state (completion or failure). The following describes how many message(s) each topic receives both on workflow start and workflow completion/failure:

    • reportExecutions - Receives 1 message per workflow execution
    • reportGranules - Receives 1 message per granule in a workflow execution
    • reportPdrs - Receives 1 message per PDR

    Diagram of architecture for reporting workflow ingest notifications from AWS Step Functions

    The ingest notification reporting SQS queue is populated via a Cloudwatch rule for any Step Function execution state transitions. The sfEventSqsToDbRecords Lambda consumes this queue. The queue and Lambda are included in the cumulus module and the Cloudwatch rule in the workflow module and are included by default in a Cumulus deployment.

    Sending SQS messages to report status

    Publishing granule/PDR reports directly to the SQS queue

If you have a non-Cumulus workflow or process ingesting data and would like to update the status of your granules or PDRs, you can publish directly to the reporting SQS queue. Publishing messages to this queue will result in those messages being stored as granule/PDR records in the Cumulus database and the status of those granules/PDRs being visible on the Cumulus dashboard. The queue does have certain expectations of the message format: it expects a Cumulus Message nested within a Cloudwatch Step Function Event object.

Posting directly to the queue will require knowing the queue URL. Assuming that you are using the cumulus module for your deployment, you can get the queue URL (along with the topic ARNs) by adding them to outputs.tf for your Terraform deployment, as in our example deployment:

    output "stepfunction_event_reporter_queue_url" {
    value = module.cumulus.stepfunction_event_reporter_queue_url
    }

    output "report_executions_sns_topic_arn" {
    value = module.cumulus.report_executions_sns_topic_arn
    }
    output "report_granules_sns_topic_arn" {
value = module.cumulus.report_granules_sns_topic_arn
    }
    output "report_pdrs_sns_topic_arn" {
    value = module.cumulus.report_pdrs_sns_topic_arn
    }

Then, when you run terraform apply, you should see the queue URL and topic ARNs printed to your console:

    Outputs:
    ...
    stepfunction_event_reporter_queue_url = https://sqs.us-east-1.amazonaws.com/xxxxxxxxx/<prefix>-sfEventSqsToDbRecordsInputQueue
    report_executions_sns_topic_arn = arn:aws:sns:us-east-1:xxxxxxxxx:<prefix>-report-executions-topic
report_granules_sns_topic_arn = arn:aws:sns:us-east-1:xxxxxxxxx:<prefix>-report-granules-topic
    report_pdrs_sns_topic_arn = arn:aws:sns:us-east-1:xxxxxxxxx:<prefix>-report-pdrs-topic

Once you have the queue URL, you can use the AWS SDK for your language of choice to publish messages to the queue. The expected format of these messages is that of a Cloudwatch Step Function event containing a Cumulus message. For SUCCEEDED events, the Cumulus message is expected to be in detail.output. For all other event statuses, a Cumulus message is expected in detail.input. The Cumulus message populating these fields MUST be a JSON string, not an object. Messages that do not conform to the schemas will fail to be created as records.
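As a rough Python/boto3 sketch of publishing a status update for a running execution (the event shape shown is illustrative; consult the Cumulus message schemas for the exact fields your records require):

import json

import boto3

sqs = boto3.client("sqs")

# Queue URL from the stepfunction_event_reporter_queue_url Terraform output.
queue_url = "https://sqs.us-east-1.amazonaws.com/xxxxxxxxx/<prefix>-sfEventSqsToDbRecordsInputQueue"

cumulus_message = {
    # A full Cumulus message (cumulus_meta, meta, payload, ...) goes here.
}

# A Cloudwatch Step Function event wrapping the Cumulus message. For
# non-SUCCEEDED statuses the JSON-stringified message goes in detail.input;
# for SUCCEEDED events it goes in detail.output.
event = {
    "source": "aws.states",
    "detail-type": "Step Functions Execution Status Change",
    "detail": {
        "status": "RUNNING",
        "input": json.dumps(cumulus_message),
    },
}

sqs.send_message(QueueUrl=queue_url, MessageBody=json.dumps(event))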

    If you are not seeing records persist to the database or show up in the Cumulus dashboard, you can investigate the Cloudwatch logs of the SQS consumer Lambda:

    • /aws/lambda/<prefix>-sfEventSqsToDbRecords

    In a workflow

    As described above, ingest notifications will automatically be published to the SNS topics on workflow start and completion/failure, so you should not include a workflow step to publish the initial or final status of your workflows.

    However, if you want to report your ingest status at any point during a workflow execution, you can add a workflow step using the SfSqsReport Lambda. In the following example from cumulus-tf/parse_pdr_workflow.tf, the ParsePdr workflow is configured to use the SfSqsReport Lambda, primarily to update the PDR ingestion status.

    Note: ${sf_sqs_report_task_arn} is an interpolated value referring to a Terraform resource. See the example deployment code for the ParsePdr workflow.

      "PdrStatusReport": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "cumulus_message": {
    "input": "{$}"
    }
    }
    }
    },
    "ResultPath": null,
    "Type": "Task",
    "Resource": "${sf_sqs_report_task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "Next": "WaitForSomeTime"
    },

    Subscribing additional listeners to SNS topics

    Additional listeners to SNS topics can be configured in a .tf file for your Cumulus deployment. Shown below is configuration that subscribes an additional Lambda function (test_lambda) to receive messages from the report_executions SNS topic. To subscribe to the report_granules or report_pdrs SNS topics instead, simply replace report_executions in the code block below with either of those values.

    resource "aws_lambda_function" "test_lambda" {
    function_name = "${var.prefix}-testLambda"
    filename = "./testLambda.zip"
    source_code_hash = filebase64sha256("./testLambda.zip")
    handler = "index.handler"
    role = module.cumulus.lambda_processing_role_arn
    runtime = "nodejs10.x"
    }

    resource "aws_sns_topic_subscription" "test_lambda" {
    topic_arn = module.cumulus.report_executions_sns_topic_arn
    protocol = "lambda"
    endpoint = aws_lambda_function.test_lambda.arn
    }

    resource "aws_lambda_permission" "test_lambda" {
    action = "lambda:InvokeFunction"
    function_name = aws_lambda_function.test_lambda.arn
    principal = "sns.amazonaws.com"
    source_arn = module.cumulus.report_executions_sns_topic_arn
    }

    SNS message format

    Subscribers to the SNS topics can expect to find the published message in the SNS event at Records[0].Sns.Message. The message will be a JSON stringified version of the ingest notification record for an execution or a PDR. For granules, the message will be a JSON stringified object with ingest notification record in the record property and the event type as the event property.

    The ingest notification record of the execution, granule, or PDR should conform to the data model schema for the given record type.
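As a minimal sketch, a subscribed Lambda (Python here, though the Terraform example above packages a Node.js handler) might unpack the event like this:

import json


def handler(event, context):
    # The published record is a JSON string at Records[0].Sns.Message.
    message = json.loads(event["Records"][0]["Sns"]["Message"])

    # Granule topic messages nest the record under "record" with an "event"
    # type; execution and PDR topics publish the record itself.
    record = message.get("record", message)
    print(message.get("event"), record.get("status"))
    return record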

    Summary

    Workflows can be configured to send SQS messages at any point using the sf-sqs-report task.

    Additional listeners can be easily configured to trigger when messages are sent to the SNS topics.

    Version: v9.0.0

    Queue PostToCmr

In this document, we walk through handling CMR errors in workflows by queueing PostToCmr. We assume that the user already has an ingest workflow set up.

    Overview

    The general concept is that the last task of the ingest workflow will be QueueWorkflow, which queues the publish workflow. The publish workflow contains the PostToCmr task and if a CMR error occurs during PostToCmr, the publish workflow will add itself back onto the queue so that it can be executed when CMR is back online. This is achieved by leveraging the QueueWorkflow task again in the publish workflow. The following diagram demonstrates this queueing process.

    Diagram of workflow queueing

    Ingest Workflow

    The last step should be the QueuePublishWorkflow step. It should be configured with a queueUrl and workflow. In this case, the queueUrl is a throttled queue. Any queueUrl can be specified here which is useful if you would like to use a lower priority queue. The workflow is the unprefixed workflow name that you would like to queue (e.g. PublishWorkflow).

      "QueuePublishWorkflowStep": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "workflow": "{$.meta.workflow}",
    "queueUrl": "${start_sf_queue_url}",
    "provider": "{$.meta.provider}",
    "collection": "{$.meta.collection}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${queue_workflow_task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "End": true
    },

    Publish Workflow

    Configure the Catch section of your PostToCmr task to proceed to QueueWorkflow if a CMRInternalError is caught. Any other error will cause the workflow to fail.

      "Catch": [
    {
    "ErrorEquals": [
    "CMRInternalError"
    ],
    "Next": "RequeueWorkflow"
    },
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "Next": "WorkflowFailed",
    "ResultPath": "$.exception"
    }
    ],

    Then, configure the QueueWorkflow task similarly to its configuration in the ingest workflow. This time, pass the current publish workflow to the task config. This allows for the publish workflow to be requeued when there is a CMR error.

    {
    "RequeueWorkflow": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "buckets": "{$.meta.buckets}",
    "distribution_endpoint": "{$.meta.distribution_endpoint}",
    "workflow": "PublishGranuleQueue",
    "queueUrl": "${start_sf_queue_url}",
    "provider": "{$.meta.provider}",
    "collection": "{$.meta.collection}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${queue_workflow_task_arn}",
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "Next": "WorkflowFailed",
    "ResultPath": "$.exception"
    }
    ],
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "End": true
    }
    }
    Version: v9.0.0

    Run Step Function Tasks in AWS Lambda or Docker

    Overview

    AWS Step Function Tasks can run tasks on AWS Lambda or on AWS Elastic Container Service (ECS) as a Docker container.

Lambda provides a serverless architecture and is the best option for minimizing cost and server management. ECS provides the fullest extent of AWS EC2 resources, via the flexibility to execute arbitrary code on any AWS EC2 instance type.

    When to use Lambda

    You should use AWS Lambda whenever all of the following are true:

• The task runs on one of the supported Lambda Runtimes. At the time of this writing, supported runtimes include versions of Python, Java, Ruby, Node.js, Go, and .NET.
    • The lambda package is less than 50 MB in size, zipped.
    • The task consumes less than each of the following resources:
      • 3008 MB memory allocation
      • 512 MB disk storage (must be written to /tmp)
      • 15 minutes of execution time

    See this page for a complete and up-to-date list of AWS Lambda limits.

    If your task requires more than any of these resources or an unsupported runtime, creating a Docker image which can be run on ECS is the way to go. Cumulus supports running any lambda package (and its configured layers) as a Docker container with cumulus-ecs-task.

    Step Function Activities and cumulus-ecs-task

    Step Function Activities enable a state machine task to "publish" an activity task which can be picked up by any activity worker. Activity workers can run pretty much anywhere, but Cumulus workflows support the cumulus-ecs-task activity worker. The cumulus-ecs-task worker runs as a Docker container on the Cumulus ECS cluster.

    The cumulus-ecs-task container takes an AWS Lambda Amazon Resource Name (ARN) as an argument (see --lambdaArn in the example below). This ARN argument is defined at deployment time. The cumulus-ecs-task worker polls for new Step Function Activity Tasks. When a Step Function executes, the worker (container) picks up the activity task and runs the code contained in the lambda package defined on deployment.

    Example: Replacing AWS Lambda with a Docker container run on ECS

    This example will use an already-defined workflow from the cumulus module that includes the QueueGranules task in its configuration.

    The following example is an excerpt from the Discover Granules workflow containing the step definition for the QueueGranules step:

    Note: ${ingest_granule_workflow_name} and ${queue_granules_task_arn} are interpolated values that refer to Terraform resources. See the example deployment code for the Discover Granules workflow.

      "QueueGranules": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "provider": "{$.meta.provider}",
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "granuleIngestWorkflow": "${ingest_granule_workflow_name}",
    "queueUrl": "{$.meta.queues.startSF}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${queue_granules_task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "End": true
    },

    If it has been determined that this task can no longer run in AWS Lambda, you can instead run it on the Cumulus ECS cluster by adding the following resources to your Terraform deployment (either in a new .tf file or by updating an existing one):

    • An aws_sfn_activity resource:
    resource "aws_sfn_activity" "queue_granules" {
      name = "${var.prefix}-QueueGranules"
    }
    • An instance of the cumulus_ecs_service module (found on the Cumulus releases page) configured to provide the QueueGranules task:

    module "queue_granules_service" {
    source = "https://github.com/nasa/cumulus/releases/download/{version}/terraform-aws-cumulus-ecs-service.zip"

    prefix = var.prefix
    name = "QueueGranules"

    cluster_arn = module.cumulus.ecs_cluster_arn
    desired_count = 1
    image = "cumuluss/cumulus-ecs-task:1.7.0"

    cpu = 400
    memory_reservation = 700

    environment = {
    AWS_DEFAULT_REGION = data.aws_region.current.name
    }
    command = [
    "cumulus-ecs-task",
    "--activityArn",
    aws_sfn_activity.queue_granules.id,
    "--lambdaArn",
    module.cumulus.queue_granules_task.task_arn
    ]
    alarms = {
    TaskCountHigh = {
    comparison_operator = "GreaterThanThreshold"
    evaluation_periods = 1
    metric_name = "MemoryUtilization"
    statistic = "SampleCount"
    threshold = 1
    }
    }
    }

    Please note: If you have updated the code for the Lambda specified by --lambdaArn, you will need to manually restart the tasks in your ECS service before Step Function activity invocations will use the updated Lambda code.
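    One way to restart those tasks is to force a new deployment of the ECS service. The cluster and service names below are placeholders; check the ECS console for the names generated by your deployment.

    # Placeholder names -- substitute your actual cluster and service names
    aws ecs update-service \
      --cluster <prefix>-CumulusECSCluster \
      --service <prefix>-QueueGranules-service \
      --force-new-deployment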

    • An updated Discover Granules workflow to utilize the new resource (the Resource key in the QueueGranules step has been updated to:

    "Resource": "${aws_sfn_activity.queue_granules.id}")

    If you then run this workflow in place of the DiscoverGranules workflow, the QueueGranules step will run as an ECS task instead of a Lambda function.

    Final note

    Step Function Activities and AWS Lambda are not the only ways to run tasks in an AWS Step Function. Learn more about other service integrations, including direct ECS integration via the AWS Service Integrations page.

    - + \ No newline at end of file diff --git a/docs/v9.0.0/data-cookbooks/sips-workflow/index.html b/docs/v9.0.0/data-cookbooks/sips-workflow/index.html index 84a69a34844..de6a4a55d4b 100644 --- a/docs/v9.0.0/data-cookbooks/sips-workflow/index.html +++ b/docs/v9.0.0/data-cookbooks/sips-workflow/index.html @@ -5,7 +5,7 @@ Science Investigator-led Processing Systems (SIPS) | Cumulus Documentation - + @@ -16,7 +16,7 @@ we're just going to create a onetime throw-away rule that will be easy to test with. This rule will kick off the DiscoverAndQueuePdrs workflow, which is the beginning of a Cumulus SIPS workflow:

    Screenshot of a Cumulus rule configuration

    Note: A list of configured workflows can be found under the "Workflows" tab in the navigation bar of the Cumulus dashboard. Additionally, a list of executions and their respective statuses can be found under the "Executions" tab in the navigation bar.
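    For reference, a onetime rule that kicks off the DiscoverAndQueuePdrs workflow might look roughly like the following when created from the dashboard or API. The provider and collection values here are illustrative and must match ones configured in your own deployment.

    {
    "name": "discover_pdrs_test_rule",
    "workflow": "DiscoverAndQueuePdrs",
    "provider": "s3_provider", // must match a provider configured in your deployment
    "collection": {
    "name": "MOD09GQ",
    "version": "006"
    },
    "rule": {
    "type": "onetime"
    },
    "state": "ENABLED"
    }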

    DiscoverAndQueuePdrs Workflow

    This workflow will discover PDRs and queue them to be processed. Duplicate PDRs will be dealt with according to the configured duplicate handling setting in the collection. The lambdas below are included in the cumulus terraform module for use in your workflows:

    1. DiscoverPdrs - source
    2. QueuePdrs - source

    Screenshot of execution graph for discover and queue PDRs workflow in the AWS Step Functions console

    An example workflow module configuration can be viewed in the Cumulus source for the discover_and_queue_pdrs_workflow.

    Please note: To use this example workflow module as a template for a new workflow in your deployment the source key for the workflow module would need to point to a release of the cumulus-workflow (terraform-aws-cumulus-workflow.zip) module on our release page, as all of the provided Cumulus workflows are internally self-referential.

    ParsePdr Workflow

    The ParsePdr workflow will parse a PDR, queue the specified granules (duplicates are handled according to the duplicate handling setting) and periodically check the status of those queued granules. This workflow will not succeed until all the granules included in the PDR are successfully ingested. If one of those fails, the ParsePdr workflow will fail. NOTE that ParsePdr may spin up multiple IngestGranule workflows in parallel, depending on the granules included in the PDR.

    The lambdas below are included in the cumulus terraform module for use in your workflows:

    1. ParsePdr - source
    2. QueueGranules - source
    3. CheckStatus - source

    Screenshot of execution graph for SIPS Parse PDR workflow in AWS Step Functions console

    An example workflow module configuration can be viewed in the Cumulus source for the parse_pdr_workflow.

    Please note: To use this example workflow module as a template for a new workflow in your deployment the source key for the workflow module would need to point to a release of the cumulus-workflow (terraform-aws-cumulus-workflow.zip) module on our release page, as all of the provided Cumulus workflows are internally self-referential.

    IngestGranule Workflow

    The IngestGranule workflow processes and ingests a granule and posts the granule metadata to CMR.

    The lambdas below are included in the cumulus terraform module for use in your workflows:

    1. SyncGranule - source.
    2. CmrStep - source

    Additionally, this workflow requires a processing step that you must provide. The ProcessingStep step in the workflow pictured below is an example of a custom processing step.

    Note: Using the CmrStep is not required and can be left out of the processing trajectory if desired (for example, in testing situations).

    Screenshot of execution graph for SIPS IngestGranule workflow in AWS Step Functions console

    An example workflow module configuration can be viewed in the Cumulus source for the ingest_and_publish_granule_workflow.

    Please note: To use this example workflow module as a template for a new workflow in your deployment the source key for the workflow module would need to point to a release of the cumulus-workflow (terraform-aws-cumulus-workflow.zip) module on our release page, as all of the provided Cumulus workflows are internally self-referential.

    Summary

    In this cookbook we went over setting up a collection, rule, and provider for a SIPS workflow. Once we had the setup completed, we looked over the Cumulus workflows that participate in parsing PDRs, ingesting and processing granules, and updating CMR.

    - + \ No newline at end of file diff --git a/docs/v9.0.0/data-cookbooks/throttling-queued-executions/index.html b/docs/v9.0.0/data-cookbooks/throttling-queued-executions/index.html index c1265043c22..83779725c66 100644 --- a/docs/v9.0.0/data-cookbooks/throttling-queued-executions/index.html +++ b/docs/v9.0.0/data-cookbooks/throttling-queued-executions/index.html @@ -5,13 +5,13 @@ Throttling queued executions | Cumulus Documentation - +
    Version: v9.0.0

    Throttling queued executions

    In this entry, we will walk through how to create an SQS queue for scheduling executions, which will be used to limit those executions to a maximum concurrency, and how to configure our Cumulus workflows/rules to use this queue.

    We will also review the architecture of this feature and highlight some implementation notes.

    Limiting the number of executions that can be running from a given queue is useful for controlling the cloud resource usage of workflows that may be lower priority, such as granule reingestion or reprocessing campaigns. It could also be useful for preventing workflows from exceeding known resource limits, such as a maximum number of open connections to a data provider.

    Implementing the queue

    Create and deploy the queue

    Add a new queue

    In a .tf file for your Cumulus deployment, add a new SQS queue:

    resource "aws_sqs_queue" "background_job_queue" {
    name = "${var.prefix}-backgroundJobQueue"
    receive_wait_time_seconds = 20
    visibility_timeout_seconds = 60
    }

    Set maximum executions for the queue

    Define the throttled_queues variable for the cumulus module in your Cumulus deployment to specify the maximum concurrent executions for the queue.

    module "cumulus" {
    # ... other variables

    throttled_queues = [{
    url = aws_sqs_queue.background_job_queue.id,
    execution_limit = 5
    }]
    }

    Setup consumer for the queue

    Add the sqs2sfThrottle Lambda as the consumer for the queue and add a Cloudwatch event rule/target to read from the queue on a scheduled basis.

    Please note: You must use the sqs2sfThrottle Lambda as the consumer for any queue with a queue execution limit or else the execution throttling will not work correctly. Additionally, please allow at least 60 seconds after creation before using the queue while associated infrastructure and triggers are set up and made ready.

    aws_sqs_queue.background_job_queue.id refers to the queue resource defined above.

    resource "aws_cloudwatch_event_rule" "background_job_queue_watcher" {
    schedule_expression = "rate(1 minute)"
    }

    resource "aws_cloudwatch_event_target" "background_job_queue_watcher" {
    rule = aws_cloudwatch_event_rule.background_job_queue_watcher.name
    arn = module.cumulus.sqs2sfThrottle_lambda_function_arn
    input = jsonencode({
    messageLimit = 500
    queueUrl = aws_sqs_queue.background_job_queue.id
    timeLimit = 60
    })
    }

    resource "aws_lambda_permission" "background_job_queue_watcher" {
    action = "lambda:InvokeFunction"
    function_name = module.cumulus.sqs2sfThrottle_lambda_function_arn
    principal = "events.amazonaws.com"
    source_arn = aws_cloudwatch_event_rule.background_job_queue_watcher.arn
    }

    Re-deploy your Cumulus application

    Follow the instructions to re-deploy your Cumulus application. After you have re-deployed, your workflow template will be updated to include information about the queue (the output below is partial output from an expected workflow template):

    {
      "cumulus_meta": {
        "queueExecutionLimits": {
          "<backgroundJobQueue_SQS_URL>": 5
        }
      }
    }

    Integrate your queue with workflows and/or rules

    Integrate queue with queuing steps in workflows

    For any workflows using QueueGranules or QueuePdrs that you want to use your new queue, update the Cumulus configuration of those steps in your workflows.

    As seen in this partial configuration for a QueueGranules step, update the queueUrl to reference the new throttled queue:

    Note: ${ingest_granule_workflow_name} is an interpolated value referring to a Terraform resource. See the example deployment code for the DiscoverGranules workflow.

    {
      "QueueGranules": {
        "Parameters": {
          "cma": {
            "event.$": "$",
            "ReplaceConfig": {
              "FullMessage": true
            },
            "task_config": {
              "queueUrl": "${aws_sqs_queue.background_job_queue.id}",
              "provider": "{$.meta.provider}",
              "internalBucket": "{$.meta.buckets.internal.name}",
              "stackName": "{$.meta.stack}",
              "granuleIngestWorkflow": "${ingest_granule_workflow_name}"
            }
          }
        }
      }
    }

    Similarly, for a QueuePdrs step:

    Note: ${parse_pdr_workflow_name} is an interpolated value referring to a Terraform resource. See the example deployment code for the DiscoverPdrs workflow.

    {
      "QueuePdrs": {
        "Parameters": {
          "cma": {
            "event.$": "$",
            "ReplaceConfig": {
              "FullMessage": true
            },
            "task_config": {
              "queueUrl": "${aws_sqs_queue.background_job_queue.id}",
              "provider": "{$.meta.provider}",
              "collection": "{$.meta.collection}",
              "internalBucket": "{$.meta.buckets.internal.name}",
              "stackName": "{$.meta.stack}",
              "parsePdrWorkflow": "${parse_pdr_workflow_name}"
            }
          }
        }
      }
    }

    After making these changes, re-deploy your Cumulus application for the execution throttling to take effect on workflow executions queued by these workflows.

    Create/update a rule to use your new queue

    Create or update a rule definition to include a queueUrl property that refers to your new queue:

    {
      "name": "s3_provider_rule",
      "workflow": "DiscoverAndQueuePdrs",
      "provider": "s3_provider",
      "collection": {
        "name": "MOD09GQ",
        "version": "006"
      },
      "rule": {
        "type": "onetime"
      },
      "state": "ENABLED",
      "queueUrl": "<backgroundJobQueue_SQS_URL>" // configure rule to use your queue URL
    }

    After creating/updating the rule, any subsequent invocations of the rule should respect the maximum number of executions when starting workflows from the queue.

    Architecture

    Architecture diagram showing how executions started from a queue are throttled to a maximum concurrent limit

    Execution throttling based on the queue works by manually keeping a count (semaphore) of how many executions are running for the queue at a time. The key operation that prevents the number of executions from exceeding the maximum for the queue is that, before starting new executions, the sqs2sfThrottle Lambda attempts to increment the semaphore and responds as follows (a sketch of this pattern follows the list):

    • If the increment operation is successful, then the count was not at the maximum and an execution is started
    • If the increment operation fails, then the count was already at the maximum so no execution is started
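    The conditional-increment pattern can be sketched with the AWS CLI against a DynamoDB table. The table, key, and attribute names below are hypothetical and are not necessarily those used internally by Cumulus; the point is that the update only succeeds while the stored count is below the configured limit.

    # Hypothetical table/key/attribute names -- shown only to illustrate the pattern
    aws dynamodb update-item \
      --table-name <prefix>-SemaphoresTable \
      --key '{"key": {"S": "<backgroundJobQueue_SQS_URL>"}}' \
      --update-expression "ADD semvalue :one" \
      --condition-expression "attribute_not_exists(semvalue) OR semvalue < :max" \
      --expression-attribute-values '{":one": {"N": "1"}, ":max": {"N": "5"}}'

    If the condition fails, DynamoDB rejects the update with a ConditionalCheckFailedException, which corresponds to the "no execution is started" case above.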

    Final notes

    Limiting the number of concurrent executions for work scheduled via a queue has several consequences worth noting:

    • The number of executions that are running for a given queue will be limited to the maximum for that queue regardless of which workflow(s) are started.
    • If you use the same queue to schedule executions across multiple workflows/rules, then the limit on the total number of executions running concurrently will be applied to all of the executions scheduled across all of those workflows/rules.
    • If you are scheduling the same workflow both via a queue with a maxExecutions value and a queue without a maxExecutions value, only the executions scheduled via the queue with the maxExecutions value will be limited to the maximum.
    - + \ No newline at end of file diff --git a/docs/v9.0.0/data-cookbooks/tracking-files/index.html b/docs/v9.0.0/data-cookbooks/tracking-files/index.html index af88d6baf1d..cc94fb69f30 100644 --- a/docs/v9.0.0/data-cookbooks/tracking-files/index.html +++ b/docs/v9.0.0/data-cookbooks/tracking-files/index.html @@ -5,7 +5,7 @@ Tracking Ancillary Files | Cumulus Documentation - + @@ -19,7 +19,7 @@ The UMM-G column reflects the RelatedURL's Type derived from the CNM type, whereas the ECHO10 column shows how the CNM type affects the destination element.

    CNM Type     UMM-G RelatedUrl.Type          ECHO10 Location
    ancillary    'VIEW RELATED INFORMATION'     OnlineResource
    data         'GET DATA'                     OnlineAccessURL
    browse       'GET RELATED VISUALIZATION'    AssociatedBrowseImage
    linkage      'EXTENDED METADATA'            OnlineResource
    metadata     'EXTENDED METADATA'            OnlineResource
    qa           'EXTENDED METADATA'            OnlineResource

    Common Use Cases

    This section briefly documents some common use cases and the recommended configuration for the file. The examples shown here are for the DiscoverGranules use case, which allows configuration at the Cumulus dashboard level. The other two cases covered in the ancillary metadata documentation require configuration at the provider notification level (either CNM message or PDR) and are not covered here.

    Configuring browse imagery:

    {
      "bucket": "public",
      "regex": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\_[\\d]{1}.jpg$",
      "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_1.jpg",
      "type": "browse"
    }

    Configuring a documentation entry:

    {
      "bucket": "protected",
      "regex": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\_README.pdf$",
      "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_README.pdf",
      "type": "metadata"
    }

    Configuring other associated files (use types metadata or qa as appropriate):

    {
      "bucket": "protected",
      "regex": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\_QA.txt$",
      "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_QA.txt",
      "type": "qa"
    }
    - + \ No newline at end of file diff --git a/docs/v9.0.0/deployment/api-gateway-logging/index.html b/docs/v9.0.0/deployment/api-gateway-logging/index.html index 1955d4ebc13..c3c9d2d21ab 100644 --- a/docs/v9.0.0/deployment/api-gateway-logging/index.html +++ b/docs/v9.0.0/deployment/api-gateway-logging/index.html @@ -5,13 +5,13 @@ API Gateway Logging | Cumulus Documentation - +
    Version: v9.0.0

    API Gateway Logging

    Enabling API Gateway logging

    In order to enable distribution API Access and execution logging, configure the TEA deployment by setting log_api_gateway_to_cloudwatch on the thin_egress_app module:

    log_api_gateway_to_cloudwatch = true

    This enables the distribution API to send its logs to the default CloudWatch location: API-Gateway-Execution-Logs_<RESTAPI_ID>/<STAGE>
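    Once logging is enabled, one way to inspect these logs is with the AWS CLI (v2), substituting your distribution API's ID and stage for the placeholders:

    # <RESTAPI_ID> and <STAGE> are placeholders for your distribution API
    aws logs tail "API-Gateway-Execution-Logs_<RESTAPI_ID>/<STAGE>" --follow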

    Configure Permissions for API Gateway Logging to CloudWatch

    Instructions for enabling account level logging from API Gateway to CloudWatch

    This is a one-time operation that must be performed on each AWS account to allow API Gateway to push logs to CloudWatch.

    Create a policy document

    The AmazonAPIGatewayPushToCloudWatchLogs managed policy, with an ARN of arn:aws:iam::aws:policy/service-role/AmazonAPIGatewayPushToCloudWatchLogs, has all the required permissions to enable API Gateway logging to CloudWatch. To grant these permissions to your account, first create an IAM role with apigateway.amazonaws.com as its trusted entity.

    Save this snippet as apigateway-policy.json.

    {
      "Version": "2012-10-17",
      "Statement": [
        {
          "Sid": "",
          "Effect": "Allow",
          "Principal": {
            "Service": "apigateway.amazonaws.com"
          },
          "Action": "sts:AssumeRole"
        }
      ]
    }

    Create an account role to act as ApiGateway and write to CloudWatchLogs

    NASA users in NGAP: be sure to use your account's permission boundary.

    aws iam create-role \
    --role-name ApiGatewayToCloudWatchLogs \
    [--permissions-boundary <permissionBoundaryArn>] \
    --assume-role-policy-document file://apigateway-policy.json

    Note the ARN of the returned role for the last step.

    Attach correct permissions to role

    Next attach the AmazonAPIGatewayPushToCloudWatchLogs policy to the IAM role.

    aws iam attach-role-policy \
    --role-name ApiGatewayToCloudWatchLogs \
    --policy-arn "arn:aws:iam::aws:policy/service-role/AmazonAPIGatewayPushToCloudWatchLogs"

    Update Account API Gateway settings with correct permissions

    Finally, set the IAM role ARN on the cloudWatchRoleArn property on your API Gateway Account settings.

    aws apigateway update-account \
    --patch-operations op='replace',path='/cloudwatchRoleArn',value='<ApiGatewayToCloudWatchLogs ARN>'

    Configure API Gateway CloudWatch Logs Delivery

    See Configure Cloudwatch Logs Delivery

    - + \ No newline at end of file diff --git a/docs/v9.0.0/deployment/cloudwatch-logs-delivery/index.html b/docs/v9.0.0/deployment/cloudwatch-logs-delivery/index.html index 365ac3e258b..289c0b79321 100644 --- a/docs/v9.0.0/deployment/cloudwatch-logs-delivery/index.html +++ b/docs/v9.0.0/deployment/cloudwatch-logs-delivery/index.html @@ -5,13 +5,13 @@ Configure Cloudwatch Logs Delivery | Cumulus Documentation - +
    Version: v9.0.0

    Configure Cloudwatch Logs Delivery

    As an optional configuration step, it is possible to deliver CloudWatch logs to a cross-account shared AWS::Logs::Destination. An operator does this by configuring the cumulus module for your deployment as shown below. The value of the log_destination_arn variable is the ARN of a writeable log destination.

    The value can be either an AWS::Logs::Destination or a Kinesis Stream ARN to which your account can write.

    log_destination_arn           = arn:aws:[kinesis|logs]:us-east-1:123456789012:[streamName|destination:logDestinationName]

    Logs Sent

    By default, the following logs will be sent to the destination when one is given.

    • Ingest logs
    • Async Operation logs
    • Thin Egress App API Gateway logs (if configured)

    Additional Logs

    If additional logs are needed, you can configure additional_log_groups_to_elk with the Cloudwatch log groups you want to send to the destination. additional_log_groups_to_elk is a map with the key as a descriptor and the value with the Cloudwatch log group name.

    additional_log_groups_to_elk = {
      "HelloWorldTask" = "/aws/lambda/cumulus-example-HelloWorld"
      "MyCustomTask"   = "my-custom-task-log-group"
    }
    - + \ No newline at end of file diff --git a/docs/v9.0.0/deployment/components/index.html b/docs/v9.0.0/deployment/components/index.html index 784d89426e5..d1aa6f2396c 100644 --- a/docs/v9.0.0/deployment/components/index.html +++ b/docs/v9.0.0/deployment/components/index.html @@ -5,7 +5,7 @@ Component-based Cumulus Deployment | Cumulus Documentation - + @@ -39,7 +39,7 @@ Terraform at the same time.

    With remote state, Terraform writes the state data to a remote data store, which can then be shared between all members of a team.

    The recommended approach for handling remote state with Cumulus is to use the S3 backend. This backend stores state in S3 and uses a DynamoDB table for locking.

    See the deployment documentation for a walkthrough of creating resources for your remote state using an S3 backend.

    - + \ No newline at end of file diff --git a/docs/v9.0.0/deployment/create_bucket/index.html b/docs/v9.0.0/deployment/create_bucket/index.html index 510e2d9cef9..cb011023adc 100644 --- a/docs/v9.0.0/deployment/create_bucket/index.html +++ b/docs/v9.0.0/deployment/create_bucket/index.html @@ -5,13 +5,13 @@ Creating an S3 Bucket | Cumulus Documentation - +
    Version: v9.0.0

    Creating an S3 Bucket

    Buckets can be created on the command line with AWS CLI or via the web interface on the AWS console.

    When creating a protected bucket (a bucket containing data which will be served through the distribution API), make sure to enable S3 server access logging. See S3 Server Access Logging for more details.
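    If you prefer the command line, server access logging can be enabled with a call along these lines. The bucket names and target prefix here are placeholders, and the S3 Server Access Logging documentation referenced above remains the authoritative guide:

    # Placeholder bucket names/prefix -- adjust to your deployment
    aws s3api put-bucket-logging \
      --bucket <prefix>-protected \
      --bucket-logging-status '{"LoggingEnabled": {"TargetBucket": "<prefix>-internal", "TargetPrefix": "<prefix>/s3-server-access-logs/"}}'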

    Command line

    Using the AWS command line tool create-bucket s3api subcommand:

    $ aws s3api create-bucket \
        --bucket foobar-internal \
        --region us-west-2 \
        --create-bucket-configuration LocationConstraint=us-west-2
    {
      "Location": "/foobar-internal"
    }

    Note: The region and create-bucket-configuration arguments are only necessary if you are creating a bucket outside of the us-east-1 region.

    Please note security settings and other bucket options can be set via the options listed in the s3api documentation.

    Repeat the above step for each bucket to be created.

    Web interface

    See: AWS "Creating a Bucket" documentation

    - + \ No newline at end of file diff --git a/docs/v9.0.0/deployment/index.html b/docs/v9.0.0/deployment/index.html index 3ffdfe6d376..76274c5bcbe 100644 --- a/docs/v9.0.0/deployment/index.html +++ b/docs/v9.0.0/deployment/index.html @@ -5,7 +5,7 @@ How to Deploy Cumulus | Cumulus Documentation - + @@ -18,7 +18,7 @@ Terraform root modules: data-persistence and cumulus.

    The data-persistence module should be deployed first, and creates the Elasticsearch domain and DynamoDB tables. The cumulus module deploys the rest of Cumulus: distribution, API, ingest, workflows, etc. The cumulus module depends on the resources created in the data-persistence deployment.

    Each of these modules has to be deployed independently and requires its own Terraform backend, variable, and output settings. The template deploy repo that was cloned previously already contains the scaffolding of the necessary files for the deployment of each module: data-persistence-tf deploys the data-persistence module and cumulus-tf deploys the cumulus module. For reference on the files that are included, see the documentation on adding components to a Terraform deployment.

    Troubleshooting

    Please see our troubleshooting documentation for any issues with your deployment when performing the upcoming steps.

    Configure and deploy the data-persistence-tf root module

    These steps should be executed in the data-persistence-tf directory of the template deploy repo that you previously cloned. Run the following to copy the example files.

    cd data-persistence-tf/
    cp terraform.tf.example terraform.tf
    cp terraform.tfvars.example terraform.tfvars

    In terraform.tf, configure the remote state settings by substituting the appropriate values for:

    • bucket
    • dynamodb_table
    • PREFIX (whatever prefix you've chosen for your deployment)

    Fill in the appropriate values in terraform.tfvars. See the data-persistence module variable definitions for more detail on each variable.

    Consider the size of your Elasticsearch cluster when configuring data-persistence.

    Reminder: Elasticsearch is optional and can be disabled using include_elasticsearch = false in your terraform.tfvars. Note, however, that your Cumulus dashboard will not work without Elasticsearch.

    Reminder: If you are including subnet_ids in your terraform.tfvars, Elasticsearch will need a service-linked role to deploy successfully. Follow the instructions above to create the service-linked role if you haven't already.
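    If your account does not already have the Elasticsearch service-linked role, one way to create it is with the AWS CLI (this assumes your credentials are permitted to create service-linked roles):

    # Creates the service-linked role used by Amazon Elasticsearch Service
    aws iam create-service-linked-role --aws-service-name es.amazonaws.com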

    Initialize Terraform

    Run terraform init (see footnote 1)

    You should see output like:

    * provider.aws: version = "~> 2.32"

    Terraform has been successfully initialized!
    Optional: Import existing AWS resources to Terraform

    Import existing resources

    If you have an existing Cumulus deployment, you can import your existing DynamoDB tables and Elasticsearch instance to be used with your new Terraform deployment.

    To import a DynamoDB table from your existing deployment:

    terraform import module.data_persistence.aws_dynamodb_table.access_tokens_table PREFIX-AccessTokensTable

    Repeat this command for every DynamoDB table included in the data-persistence module, replacing PREFIX with the correct value for your existing deployment.

    To import the Elasticsearch instance from your existing deployment, run this command and replace PREFIX-es5vpc with the existing domain name:

    terraform import module.data_persistence.aws_elasticsearch_domain.es_vpc PREFIX-es5vpc

    You will also need to make sure to set these variables in your terraform.tfvars file:

    prefix = "PREFIX"     # must match prefix of existing deployment
    custom_domain_name = "PREFIX-es5vpc" # must match existing Elasticsearch domain name

    Note: If you are importing data resources from a previous version of Cumulus deployed using Cloudformation, then make sure DeletionPolicy: Retain is set on the data resources in the Cloudformation stack before deleting that stack. Otherwise, the imported data resources will be destroyed when you delete that stack. As of Cumulus version 1.15.0, DeletionPolicy: Retain is set by default for the data resources in the Cloudformation stack.

    Deploy

    Run terraform apply to deploy your data persistence resources. Type yes when prompted to confirm that you want to create the resources. Assuming the operation is successful, you should see output like:

    Apply complete! Resources: 16 added, 0 changed, 0 destroyed.

    Outputs:

    dynamo_tables = {
      "access_tokens" = {
        "arn"  = "arn:aws:dynamodb:us-east-1:12345:table/prefix-AccessTokensTable"
        "name" = "prefix-AccessTokensTable"
      }
      # ... more tables ...
    }
    elasticsearch_alarms = [
      {
        "arn"  = "arn:aws:cloudwatch:us-east-1:12345:alarm:prefix-es-vpc-NodesLowAlarm"
        "name" = "prefix-es-vpc-NodesLowAlarm"
      },
      # ... more alarms ...
    ]
    elasticsearch_domain_arn = arn:aws:es:us-east-1:12345:domain/prefix-es-vpc
    elasticsearch_hostname = vpc-prefix-es-vpc-abcdef.us-east-1.es.amazonaws.com
    elasticsearch_security_group_id = sg-12345

    Your data persistence resources are now deployed.

    Deploy the Cumulus Message Adapter layer

    The Cumulus Message Adapter (CMA) is necessary for interpreting the input and output of Cumulus workflow steps. The CMA is now integrated with Cumulus workflow steps as a Lambda layer.

    To deploy a CMA layer to your account:

    1. Go to the CMA releases page and download the cumulus-message-adapter.zip for the desired release
    2. Use the AWS CLI to publish your layer:
    $ aws lambda publish-layer-version \
        --layer-name prefix-CMA-layer \
        --region us-east-1 \
        --zip-file fileb:///path/to/cumulus-message-adapter.zip

    {
      ... more output ...
      "LayerVersionArn": "arn:aws:lambda:us-east-1:1234567890:layer:prefix-CMA-layer:1",
      ... more output ...
    }

    Make sure to copy the LayerVersionArn of the deployed layer, as it will be used to configure the cumulus-tf deployment in the next step.

    Configure and deploy the cumulus-tf root module

    These steps should be executed in the cumulus-tf directory of the template repo that was cloned previously.

    cd cumulus-tf/
    cp terraform.tf.example terraform.tf
    cp terraform.tfvars.example terraform.tfvars

    In terraform.tf, configure the remote state settings by substituting the appropriate values for:

    • bucket
    • dynamodb_table
    • PREFIX (whatever prefix you've chosen for your deployment)

    Fill in the appropriate values in terraform.tfvars. See the Cumulus module variable definitions for more detail on each variable.

    Notes on specific variables:

    • deploy_to_ngap: This variable controls the provisioning of certain resources and policies that are specific to an NGAP environment. If you are deploying to NGAP, you must set this variable to true.
    • prefix: The value should be the same as the prefix from the data-persistence deployment.
    • token_secret: A string value used for signing and verifying JSON Web Tokens (JWTs) issued by the API. For security purposes, it is strongly recommended that this value be a 32-character string.
    • data_persistence_remote_state_config: This object should contain the remote state values that you configured in data-persistence-tf/terraform.tf. These settings allow cumulus-tf to determine the names of the resources created in data-persistence-tf.
    • key_name (optional): The name of your key pair from setting up your key pair
    • rds_security_group: The ID of the security group used to allow access to the PostgreSQL database
    • rds_user_access_secret_arn: The ARN for the Secrets Manager secret that provides database access information
    • rds_connection_heartbeat: When using RDS/Aurora Serverless as a database backend, this should be set to true; it tells Core to always use a 'heartbeat' query when establishing a database connection to avoid spin-up timeout failures.

    Consider the sizing of your Cumulus instance when configuring your variables.

    Configure the Thin Egress App

    The Thin Egress App is used for Cumulus distribution. Follow the steps in the documentation to configure distribution in your cumulus-tf deployment.

    Initialize Terraform

    Follow the above instructions to initialize Terraform by running terraform init (see footnote 1).

    Deploy

    Run terraform apply to deploy the resources. Type yes when prompted to confirm that you want to create the resources. Assuming the operation is successful, you should see output like this:

    Apply complete! Resources: 292 added, 0 changed, 0 destroyed.

    Outputs:

    archive_api_redirect_uri = https://abc123.execute-api.us-east-1.amazonaws.com/dev/token
    archive_api_uri = https://abc123.execute-api.us-east-1.amazonaws.com/dev/
    distribution_redirect_uri = https://abc123.execute-api.us-east-1.amazonaws.com/DEV/login
    distribution_url = https://abc123.execute-api.us-east-1.amazonaws.com/DEV/

    Note: Be sure to copy the redirect URLs, as you will use them to update your Earthdata application.

    Update Earthdata Application

    You will need to add two redirect URLs to your EarthData login application.

    1. Login to URS.
    2. Under My Applications -> Application Administration -> use the edit icon of your application.
    3. Under Manage -> redirect URIs, add the Archive API url returned from the stack deployment
      • e.g. archive_api_redirect_uri = https://<czbbkscuy6>.execute-api.us-east-1.amazonaws.com/dev/token.
    4. Also add the Distribution url
      • e.g. distribution_redirect_uri = https://<kido2r7kji>.execute-api.us-east-1.amazonaws.com/dev/login (see footnote 2).
    5. You may delete the placeholder url you used to create the application.

    If you've lost track of the needed redirect URIs, they can be located in the API Gateway console. Once there, select <prefix>-archive and/or <prefix>-thin-egress-app-EgressGateway, then Dashboard, and use the base URL at the top of the page that is accompanied by the text Invoke this API at:. Make sure to append /token for the archive URL and /login for the Thin Egress App URL.


    Deploy Cumulus dashboard

    Dashboard Requirements

    Please note that the requirements are similar to the Cumulus stack deployment requirements. The installation instructions below include a step that will install/use the required node version referenced in the .nvmrc file in the dashboard repository.

    Prepare AWS

    Create S3 bucket for dashboard:

    • Create it, e.g. <prefix>-dashboard. Use the command line or console as you did when preparing AWS configuration.
    • Configure the bucket to host a website:
      • AWS S3 console: Select <prefix>-dashboard bucket then, "Properties" -> "Static Website Hosting", point to index.html
      • CLI: aws s3 website s3://<prefix>-dashboard --index-document index.html
    • The bucket's url will be http://<prefix>-dashboard.s3-website-<region>.amazonaws.com or you can find it on the AWS console via "Properties" -> "Static website hosting" -> "Endpoint"
    • Ensure the bucket's access permissions allow your deployment user access to write to the bucket

    Install dashboard

    To install the dashboard, clone the Cumulus dashboard repository into the root deploy directory and install dependencies with npm install:

    git clone https://github.com/nasa/cumulus-dashboard
    cd cumulus-dashboard
    nvm use
    npm install

    If you do not have the correct version of node installed, replace nvm use with nvm install $(cat .nvmrc) in the above example.

    Dashboard versioning

    By default, the master branch will be used for dashboard deployments. The master branch of the dashboard repo contains the most recent stable release of the dashboard.

    If you want to test unreleased changes to the dashboard, use the develop branch.

    Each release/version of the dashboard will have a tag in the dashboard repo. Release/version numbers will use semantic versioning (major/minor/patch).

    To checkout and install a specific version of the dashboard:

    git fetch --tags
    git checkout <version-number> # e.g. v1.2.0
    nvm use
    npm install

    If you do not have the correct version of node installed, replace nvm use with nvm install $(cat .nvmrc) in the above example.

    Building the dashboard

    Note: These environment variables are available during the build: APIROOT, DAAC_NAME, STAGE, HIDE_PDR. Any of these can be set on the command line to override the values contained in config.js when running the build below.

    To configure your dashboard for deployment, set the APIROOT environment variable to your app's API root (see footnote 3).

    Build the dashboard from the dashboard repository root directory, cumulus-dashboard:

      APIROOT=<your_api_root> npm run build

    Dashboard deployment

    Deploy dashboard to s3 bucket from the cumulus-dashboard directory:

    Using AWS CLI:

      aws s3 sync dist s3://<prefix>-dashboard --acl public-read

    From the S3 Console:

    • Open the <prefix>-dashboard bucket, click 'upload'. Add the contents of the 'dist' subdirectory to the upload. Then select 'Next'. On the permissions window allow the public to view. Select 'Upload'.

    You should be able to visit the dashboard website at http://<prefix>-dashboard.s3-website-<region>.amazonaws.com, or find the URL via <prefix>-dashboard -> "Properties" -> "Static website hosting" -> "Endpoint", and log in with a user that you configured for access in the Configure and Deploy the Cumulus Stack step.


    Cumulus Instance Sizing

    The Cumulus deployment's default sizing for Elasticsearch instances, EC2 instances, and Autoscaling Groups is small and designed for testing and cost savings. The default settings are likely not suitable for production workloads. Sizing is highly individual and dependent on expected load and archive size.

    Please be cognizant of costs as any change in size will affect your AWS bill. AWS provides a pricing calculator for estimating costs.

    Elasticsearch

    The mappings file contains all of the data types that will be indexed into Elasticsearch. Elasticsearch sizing is tied to your archive size, including your collections, granules, and workflow executions that will be stored.

    AWS provides documentation on calculating and configuring for sizing.

    In addition to size you'll want to consider the number of nodes which determine how the system reacts in the event of a failure.

    Configuration can be done in the data persistence module in elasticsearch_config and the cumulus module in es_index_shards.

    If you make changes to your Elasticsearch configuration you will need to reindex for those changes to take effect.

    EC2 instances and autoscaling groups

    EC2 instances are used for long-running operations (e.g. generating a reconciliation report) and long-running workflow tasks. Configuration for your ECS cluster is achieved via Cumulus deployment variables.

    When configuring your ECS cluster, consider the following (an illustrative set of values is sketched after this list):

    • The EC2 instance type and EBS volume size needed to accommodate your workloads. Configured as ecs_cluster_instance_type and ecs_cluster_instance_docker_volume_size.
    • The minimum and desired number of instances on hand to accommodate your workloads. Configured as ecs_cluster_min_size and ecs_cluster_desired_size.
    • The maximum number of instances you will need and are willing to pay for to accommodate your heaviest workloads. Configured as ecs_cluster_max_size.
    • Your autoscaling parameters: ecs_cluster_scale_in_adjustment_percent, ecs_cluster_scale_out_adjustment_percent, ecs_cluster_scale_in_threshold_percent, and ecs_cluster_scale_out_threshold_percent.
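    As a rough illustration only, such a configuration might look like the following in your cumulus-tf/terraform.tfvars. The variable names come from the list above; the values shown are placeholders to be tuned for your workloads, not recommendations:

    # Illustrative values only -- tune for your expected workloads
    ecs_cluster_instance_type                = "t3.medium"
    ecs_cluster_instance_docker_volume_size  = 100
    ecs_cluster_min_size                     = 1
    ecs_cluster_desired_size                 = 2
    ecs_cluster_max_size                     = 4
    ecs_cluster_scale_in_adjustment_percent  = -5
    ecs_cluster_scale_out_adjustment_percent = 10
    ecs_cluster_scale_in_threshold_percent   = 25
    ecs_cluster_scale_out_threshold_percent  = 75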

    Footnotes


    1. Run terraform init if:

      • This is the first time deploying the module
      • You have added any additional child modules, including Cumulus components
      • You have updated the source for any of the child modules

    2. To add another redirect URI to your application: on the Earthdata home page, select "My Applications", scroll down to "Application Administration", and use the edit icon for your application. Then go to Manage -> Redirect URIs.

    3. The API root can be found in a number of ways. The easiest is to note it in the output of the app deployment step. But you can also find it from the AWS console -> Amazon API Gateway -> APIs -> <prefix>-archive -> Dashboard, reading the URL at the top after "Invoke this API at"

    - + \ No newline at end of file diff --git a/docs/v9.0.0/deployment/postgres_database_deployment/index.html b/docs/v9.0.0/deployment/postgres_database_deployment/index.html index 03abf8a42c3..112693869ac 100644 --- a/docs/v9.0.0/deployment/postgres_database_deployment/index.html +++ b/docs/v9.0.0/deployment/postgres_database_deployment/index.html @@ -5,7 +5,7 @@ PostgreSQL Database Deployment | Cumulus Documentation - + @@ -16,7 +16,7 @@ cumulus-rds-tf that will deploy an AWS RDS Aurora Serverless PostgreSQL 10.2 compatible database cluster, and optionally provision a single deployment database with credentialed secrets for use with Cumulus.

    We have provided an example terraform deployment using this module in the Cumulus template-deploy repository on github.

    Use of this example involves:

    • Creating/configuring a Terraform module directory
    • Using Terraform to deploy resources to AWS

    Requirements

    Configuration/installation of this module requires the following:

    • Terraform
    • git
    • A VPC configured for use with Cumulus Core. This should match the subnets you provide when Deploying Cumulus to allow Core's lambdas to properly access the database.
    • At least two subnets across multiple AZs. These should match the subnets you provide as configuration when Deploying Cumulus, and should be within the same VPC.

    Needed Git Repositories

    Assumptions

    OS/Environment

    The instructions in this module require Linux/MacOS. While deployment via Windows is possible, it is unsupported.

    Terraform

    This document assumes knowledge of Terraform. If you are not comfortable working with Terraform, the following links should bring you up to speed:

    For Cumulus specific instructions on installation of Terraform, refer to the main Cumulus Installation Documentation

    Aurora/RDS

    This document also assumes some basic familiarity with PostgreSQL databases, and Amazon Aurora/RDS. If you're unfamiliar consider perusing the AWS docs, and the Aurora Serverless V1 docs.

    Prepare deployment repository

    If you are already working with an existing repository that has a configured rds-cluster-tf deployment for the version of Cumulus you intend to deploy or update, or you just need to configure this module for your repository, skip to Prepare AWS configuration.

    Clone the cumulus-template-deploy repo and name appropriately for your organization:

      git clone https://github.com/nasa/cumulus-template-deploy <repository-name>

    We will return to configuring this repo and using it for deployment below.

    Optional: Create a new repository

    Create a new repository on Github so that you can add your workflows and other modules to source control:

      git remote set-url origin https://github.com/<org>/<repository-name>
    git push origin master

    You can then add/commit changes as needed.

    Note: If you are pushing your deployment code to a git repo, make sure to add terraform.tf and terraform.tfvars to .gitignore, as these files will contain sensitive data related to your AWS account.


    Prepare AWS configuration

    To deploy this module, you need to make sure that you have completed the following steps from the Cumulus deployment instructions, in similar fashion, for this module:

    --

    Configure and deploy the module

    When configuring this module, please keep in mind that unlike Cumulus deployment, this module should be deployed once to create the database cluster and only thereafter to make changes to that configuration/upgrade/etc. This module does not need to be re-deployed for each Core update.

    These steps should be executed in the rds-cluster-tf directory of the template deploy repo that you previously cloned. Run the following to copy the example files:

    cd rds-cluster-tf/
    cp terraform.tf.example terraform.tf
    cp terraform.tfvars.example terraform.tfvars

    In terraform.tf, configure the remote state settings by substituting the appropriate values for:

    • bucket
    • dynamodb_table
    • PREFIX (whatever prefix you've chosen for your deployment)

    Fill in the appropriate values in terraform.tfvars. See the rds-cluster-tf module variable definitions for more detail on all of the configuration options. A few notable configuration options are documented in the next section.

    Configuration Options

    • deletion_protection -- defaults to true. Set it to false if you want to be able to delete your cluster with a terraform destroy without manually updating the cluster.
    • db_admin_username -- cluster database administration username. Defaults to postgres.
    • db_admin_password -- required variable that specifies the admin user password for the cluster. To randomize this on each deployment, consider using a random_string resource as input (see the sketch after this list).
    • region -- defaults to us-east-1.
    • subnets -- requires at least 2 across different AZs. For use with Cumulus, these AZs should match the values you configure for your lambda_subnet_ids.
    • max_capacity -- the max ACUs the cluster is allowed to use. Carefully consider cost/performance concerns when setting this value.
    • min_capacity -- the minimum ACUs the cluster will scale to
    • provision_user_database -- Optional flag to allow module to provision a user database in addition to creating the cluster. Described in the next section.
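    As a minimal sketch of the random_string approach mentioned above (the resource name here is arbitrary, and this assumes the hashicorp/random provider is available in your configuration):

    # Hypothetical resource name; requires the hashicorp/random provider
    resource "random_string" "db_admin_password" {
      length  = 32
      special = false
    }

    # Then, in the rds_cluster module configuration:
    #   db_admin_password = random_string.db_admin_password.result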

    Provision user and user database

    If you wish for the module to provision a PostgreSQL database on your new cluster and provide a secret for access in the module output, in addition to managing the cluster itself, the following configuration keys are required:

    • provision_user_database -- must be set to true, this configures the module to deploy a lambda that will create the user database, and update the provided configuration on deploy.
    • permissions_boundary_arn -- the permissions boundary to use when creating the roles that the provisioning lambda will need for access. In most use cases this should be the same one used for the Cumulus Core deployment.
    • rds_user_password -- the value to set the user password to
    • prefix -- this value will be used to set a unique identifier for the ProvisionDatabase lambda, as well as to name the provisioned user/database.

    Once configured, the module will deploy the lambda, and run it on each provision, creating the configured database if it does not exist, updating the user password if that value has been changed, and updating the output user database secret.

    Setting provision_user_database to false after provisioning will not result in removal of the configured database, as the lambda is non-destructive as configured in this module.

    Please Note: This functionality is limited in that it will only provision a single database/user and configure a basic database, and should not be used in scenarios where more complex configuration is required.

    Initialize Terraform

    Run terraform init

    You should see output like:

    * provider.aws: version = "~> 2.32"

    Terraform has been successfully initialized!

    Deploy

    Run terraform apply to deploy the resources.

    If re-applying this module, variables (e.g. engine_version, snapshot_identifier) that force a recreation of the database cluster may result in data loss if deletion protection is disabled. Examine the changeset carefully for resources that will be re-created/destroyed before applying.

    Review the changeset, and assuming it looks correct, type yes when prompted to confirm that you want to create all of the resources.

    Assuming the operation is successful, you should see output similar to the following (this example omits the creation of a user database/lambdas/security groups):

    terraform apply

    An execution plan has been generated and is shown below.
    Resource actions are indicated with the following symbols:
    + create

    Terraform will perform the following actions:

    # module.rds_cluster.aws_db_subnet_group.default will be created
    + resource "aws_db_subnet_group" "default" {
    + arn = (known after apply)
    + description = "Managed by Terraform"
    + id = (known after apply)
    + name = (known after apply)
    + name_prefix = "xxxxxxxxx"
    + subnet_ids = [
    + "subnet-xxxxxxxxx",
    + "subnet-xxxxxxxxx",
    ]
    + tags = {
    + "Deployment" = "xxxxxxxxx"
    }
    }

    # module.rds_cluster.aws_rds_cluster.cumulus will be created
    + resource "aws_rds_cluster" "cumulus" {
    + apply_immediately = true
    + arn = (known after apply)
    + availability_zones = (known after apply)
    + backup_retention_period = 1
    + cluster_identifier = "xxxxxxxxx"
    + cluster_identifier_prefix = (known after apply)
    + cluster_members = (known after apply)
    + cluster_resource_id = (known after apply)
    + copy_tags_to_snapshot = false
    + database_name = "xxxxxxxxx"
    + db_cluster_parameter_group_name = (known after apply)
    + db_subnet_group_name = (known after apply)
    + deletion_protection = true
    + enable_http_endpoint = true
    + endpoint = (known after apply)
    + engine = "aurora-postgresql"
    + engine_mode = "serverless"
    + engine_version = "10.12"
    + final_snapshot_identifier = "xxxxxxxxx"
    + hosted_zone_id = (known after apply)
    + id = (known after apply)
    + kms_key_id = (known after apply)
    + master_password = (sensitive value)
    + master_username = "xxxxxxxxx"
    + port = (known after apply)
    + preferred_backup_window = "07:00-09:00"
    + preferred_maintenance_window = (known after apply)
    + reader_endpoint = (known after apply)
    + skip_final_snapshot = false
    + storage_encrypted = (known after apply)
    + tags = {
    + "Deployment" = "xxxxxxxxx"
    }
    + vpc_security_group_ids = (known after apply)

    + scaling_configuration {
    + auto_pause = true
    + max_capacity = 4
    + min_capacity = 2
    + seconds_until_auto_pause = 300
    + timeout_action = "RollbackCapacityChange"
    }
    }

    # module.rds_cluster.aws_secretsmanager_secret.rds_login will be created
    + resource "aws_secretsmanager_secret" "rds_login" {
    + arn = (known after apply)
    + id = (known after apply)
    + name = (known after apply)
    + name_prefix = "xxxxxxxxx"
    + policy = (known after apply)
    + recovery_window_in_days = 30
    + rotation_enabled = (known after apply)
    + rotation_lambda_arn = (known after apply)
    + tags = {
    + "Deployment" = "xxxxxxxxx"
    }

    + rotation_rules {
    + automatically_after_days = (known after apply)
    }
    }

    # module.rds_cluster.aws_secretsmanager_secret_version.rds_login will be created
    + resource "aws_secretsmanager_secret_version" "rds_login" {
    + arn = (known after apply)
    + id = (known after apply)
    + secret_id = (known after apply)
    + secret_string = (sensitive value)
    + version_id = (known after apply)
    + version_stages = (known after apply)
    }

    # module.rds_cluster.aws_security_group.rds_cluster_access will be created
    + resource "aws_security_group" "rds_cluster_access" {
    + arn = (known after apply)
    + description = "Managed by Terraform"
    + egress = (known after apply)
    + id = (known after apply)
    + ingress = (known after apply)
    + name = (known after apply)
    + name_prefix = "cumulus_rds_cluster_access_ingress"
    + owner_id = (known after apply)
    + revoke_rules_on_delete = false
    + tags = {
    + "Deployment" = "xxxxxxxxx"
    }
    + vpc_id = "vpc-xxxxxxxxx"
    }

    # module.rds_cluster.aws_security_group_rule.rds_security_group_allow_PostgreSQL will be created
    + resource "aws_security_group_rule" "rds_security_group_allow_postgres" {
    + from_port = 5432
    + id = (known after apply)
    + protocol = "tcp"
    + security_group_id = (known after apply)
    + self = true
    + source_security_group_id = (known after apply)
    + to_port = 5432
    + type = "ingress"
    }

    Plan: 6 to add, 0 to change, 0 to destroy.

    Do you want to perform these actions?
    Terraform will perform the actions described above.
    Only 'yes' will be accepted to approve.

    Enter a value: yes

    module.rds_cluster.aws_db_subnet_group.default: Creating...
    module.rds_cluster.aws_security_group.rds_cluster_access: Creating...
    module.rds_cluster.aws_secretsmanager_secret.rds_login: Creating...

    Then, after the resources are created:

    Apply complete! Resources: X added, 0 changed, 0 destroyed.
    Releasing state lock. This may take a few moments...

    Outputs:

    admin_db_login_secret_arn = arn:aws:secretsmanager:us-east-1:xxxxxxxxx:secret:xxxxxxxxxx20210407182709367700000002-dpmdR
    admin_db_login_secret_version = xxxxxxxxx
    rds_endpoint = xxxxxxxxx.us-east-1.rds.amazonaws.com
    security_group_id = xxxxxxxxx
    user_credentials_secret_arn = arn:aws:secretsmanager:us-east-1:xxxxx:secret:xxxxxxxxxx20210407182709367700000002-dpmpXA

    Note the output values for admin_db_login_secret_arn (and optionally user_credentials_secret_arn) as these provide the AWS Secrets Manager secret required to access the database as the administrative user and, optionally, the user database credentials Cumulus requires as well.

    The content of each of these secrets is of the form:

    {
      "database": "postgres",
      "dbClusterIdentifier": "clusterName",
      "engine": "postgres",
      "host": "xxx",
      "password": "defaultPassword",
      "port": 5432,
      "username": "xxx"
    }
    • database -- the PostgreSQL database used by the configured user
    • dbClusterIdentifier -- the value set by the cluster_identifier variable in the terraform module
    • engine -- the Aurora/RDS database engine
    • host -- the RDS service host for the database in the form (dbClusterIdentifier)-(AWS ID string).(region).rds.amazonaws.com
    • password -- the database password
    • username -- the account username
    • port -- The database connection port, should always be 5432

    Next Steps

    The database cluster has been created/updated! From here you can continue to add additional user accounts, databases and other database configuration.

    - + \ No newline at end of file diff --git a/docs/v9.0.0/deployment/share-s3-access-logs/index.html b/docs/v9.0.0/deployment/share-s3-access-logs/index.html index f6f91fb0ecf..e6a655ddb5e 100644 --- a/docs/v9.0.0/deployment/share-s3-access-logs/index.html +++ b/docs/v9.0.0/deployment/share-s3-access-logs/index.html @@ -5,14 +5,14 @@ Share S3 Access Logs | Cumulus Documentation - +
    Version: v9.0.0

    Share S3 Access Logs

    It is possible through Cumulus to share S3 access logs with external systems (such as the ESDIS Metrics system described below) using the S3 replicator package.

    S3 Replicator

    The S3 Replicator is a node package that contains a simple lambda function, associated permissions, and the Terraform instructions to replicate create-object events from one S3 bucket to another.

    First ensure that you have enabled S3 Server Access Logging.

    Next configure your config.tfvars as described in the s3-replicator/README.md to correspond to your deployment. The source_bucket and source_prefix are determined by how you enabled the S3 Server Access Logging.

    In order to deploy the s3-replicator with Cumulus, you will need to add the module to your Terraform main.tf definition, e.g.:

    module "s3-replicator" {
    source = "<path to s3-replicator.zip>"
    prefix = var.prefix
    vpc_id = var.vpc_id
    subnet_ids = var.subnet_ids
    permissions_boundary = var.permissions_boundary_arn
    source_bucket = var.s3_replicator_config.source_bucket
    source_prefix = var.s3_replicator_config.source_prefix
    target_bucket = var.s3_replicator_config.target_bucket
    target_prefix = var.s3_replicator_config.target_prefix
    }

    The Terraform source package can be found on the Cumulus GitHub release page, under the Assets tab, as terraform-aws-cumulus-s3-replicator.zip.

    ESDIS Metrics

    In the NGAP environment, the ESDIS Metrics team has set up an ELK stack to process logs from Cumulus instances. To use this system, you must deliver any S3 Server Access logs that Cumulus creates.

    Configure the S3 replicator as described above using the target_bucket and target_prefix provided by the metrics team.

    The metrics team has taken care of setting up Logstash to ingest the files that get delivered to their bucket into their Elasticsearch instance.

    - + \ No newline at end of file diff --git a/docs/v9.0.0/deployment/terraform-best-practices/index.html b/docs/v9.0.0/deployment/terraform-best-practices/index.html index 64a2feb279e..b55f76aece7 100644 --- a/docs/v9.0.0/deployment/terraform-best-practices/index.html +++ b/docs/v9.0.0/deployment/terraform-best-practices/index.html @@ -5,7 +5,7 @@ Terraform Best Practices | Cumulus Documentation - + @@ -88,7 +88,7 @@ AWS CLI command, replacing PREFIX with your deployment prefix name:

    aws resourcegroupstaggingapi get-resources \
    --query "ResourceTagMappingList[].ResourceARN" \
    --tag-filters Key=Deployment,Values=PREFIX

    Ideally, the output should be an empty list, but if it is not, then you may need to manually delete the listed resources.
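If resources are still listed, it can help to narrow the output by resource type while cleaning up. A hedged example using the same tag filter (S3 shown; other resource type filters follow the same pattern):

# List only S3 resources still tagged with the deployment prefix
aws resourcegroupstaggingapi get-resources \
  --resource-type-filters s3 \
  --tag-filters Key=Deployment,Values=PREFIX \
  --query "ResourceTagMappingList[].ResourceARN"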

• Configuring the Cumulus deployment: link
• Restoring a previous version: link

    - + \ No newline at end of file diff --git a/docs/v9.0.0/deployment/thin_egress_app/index.html b/docs/v9.0.0/deployment/thin_egress_app/index.html index 27dc1416fef..0c4fb2ad291 100644 --- a/docs/v9.0.0/deployment/thin_egress_app/index.html +++ b/docs/v9.0.0/deployment/thin_egress_app/index.html @@ -5,7 +5,7 @@ Using the Thin Egress App for Cumulus distribution | Cumulus Documentation - + @@ -13,7 +13,7 @@
    Version: v9.0.0

    Using the Thin Egress App for Cumulus distribution

    The Thin Egress App (TEA) is an app running in Lambda that allows retrieving data from S3 using temporary links and provides URS integration.

    Configuring a TEA deployment

    TEA is deployed using Terraform modules. Refer to these instructions for guidance on how to integrate new components with your deployment.

The cumulus-template-deploy repository cumulus-tf/main.tf contains a thin_egress_app module for distribution.

The TEA module provides these instructions showing how to add it to your deployment; the instructions below cover configuring the thin_egress_app module in your Cumulus deployment.

    Create a secret for signing Thin Egress App JWTs

    The Thin Egress App uses JWTs internally to authenticate requests and requires a secret stored in AWS Secrets Manager containing SSH keys that are used to sign the JWTs.

    See the Thin Egress App documentation on how to create this secret with the correct values. It will be used later to set the thin_egress_jwt_secret_name variable when deploying the Cumulus module.
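A minimal sketch of creating such a secret with the AWS CLI follows. The key-generation step, the secret name, and the JSON field names (rsa_priv_key/rsa_pub_key) are assumptions based on the Thin Egress App documentation, so confirm the exact format there before use:

# Generate an RSA keypair for JWT signing (file names are arbitrary)
ssh-keygen -t rsa -b 4096 -m PEM -f ./jwt_signing_key -N ''
# Store the base64-encoded keys in Secrets Manager; field names assumed from the TEA docs
aws secretsmanager create-secret \
  --name <prefix>-thin-egress-jwt-secret \
  --secret-string "{\"rsa_priv_key\": \"$(openssl base64 -A -in jwt_signing_key)\", \"rsa_pub_key\": \"$(openssl base64 -A -in jwt_signing_key.pub)\"}"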

    bucket_map.yaml

    The Thin Egress App uses a bucket_map.yaml file to determine which buckets to serve. Documentation of the file format is available here.

    The default Cumulus module generates a file at s3://${system_bucket}/distribution_bucket_map.json.

The configuration file is a simple JSON mapping of the form:

    {
    "daac-public-data-bucket": "/path/to/this/kind/of/data"
    }

    Please note: Cumulus only supports a one-to-one mapping of bucket->TEA path for 'distribution' buckets.

    Optionally configure a custom bucket map

    A simple config would look something like this:

    bucket_map.yaml
    MAP:
    my-protected: my-protected
    my-public: my-public

    PUBLIC_BUCKETS:
    - my-public

    Please note: your custom bucket map must include mappings for all of the protected and public buckets specified in the buckets variable in cumulus-tf/terraform.tfvars, otherwise Cumulus may not be able to determine the correct distribution URL for ingested files and you may encounter errors.
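Once written, the custom bucket map must be uploaded to S3 and referenced in your TEA/Cumulus configuration (see the module documentation for the relevant variable). A hedged sketch of the upload step, with placeholder bucket and key names:

# Upload the custom bucket map to your system bucket (bucket and key are placeholders)
aws s3 cp bucket_map.yaml s3://<system-bucket>/<prefix>/bucket_map.yaml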

    Optionally configure shared variables

    The cumulus module deploys certain components that interact with TEA. As a result, the cumulus module requires that if you are specifying a value for the stage_name variable to the TEA module, you must use the same value for the tea_api_gateway_stage variable to the cumulus module.

    One way to keep these variable values in sync across the modules is to use Terraform local values to define values to use for the variables for both modules. This approach is shown in the Cumulus core example deployment code.

    - + \ No newline at end of file diff --git a/docs/v9.0.0/deployment/upgrade-readme/index.html b/docs/v9.0.0/deployment/upgrade-readme/index.html index 17a80237c32..e9ae388e004 100644 --- a/docs/v9.0.0/deployment/upgrade-readme/index.html +++ b/docs/v9.0.0/deployment/upgrade-readme/index.html @@ -5,7 +5,7 @@ Upgrading Cumulus | Cumulus Documentation - + @@ -15,7 +15,7 @@ deployment functions correctly. Please refer to some recommended smoke tests given above, and consider additional tests appropriate for your particular deployment and environment.

    Update Cumulus Dashboard

    If there are breaking (or otherwise significant) changes to the Cumulus API, you should also upgrade your Cumulus Dashboard deployment to use the version of the Cumulus API matching the version of Cumulus to which you are migrating.

    - + \ No newline at end of file diff --git a/docs/v9.0.0/development/forked-pr/index.html b/docs/v9.0.0/development/forked-pr/index.html index 9b35300ca39..c832c918c32 100644 --- a/docs/v9.0.0/development/forked-pr/index.html +++ b/docs/v9.0.0/development/forked-pr/index.html @@ -5,13 +5,13 @@ Issuing PR From Forked Repos | Cumulus Documentation - +
    Version: v9.0.0

    Issuing PR From Forked Repos

    Fork the Repo

    • Fork the Cumulus repo
    • Create a new branch from the branch you'd like to contribute to
• If an issue doesn't already exist, submit one (see above)

    Create a Pull Request

    Reviewing PRs from Forked Repos

    Upon submission of a pull request, the Cumulus development team will review the code.

    Once the code passes an initial review, the team will run the CI tests against the proposed update.

    The request will then either be merged, declined, or an adjustment to the code will be requested via the issue opened with the original PR request.

PRs from forked repos cannot be merged directly to master. Cumulus reviewers must follow these steps before completing the review process:

    1. Create a new branch:

        git checkout -b from-<name-of-the-branch> master
2. Push the new branch to GitHub (see the sketch after this list)

    3. Change the destination of the forked PR to the new branch that was just pushed

      Screenshot of Github interface showing how to change the base branch of a pull request

    4. After code review and approval, merge the forked PR to the new branch.

    5. Create a PR for the new branch to master.

6. If the CI tests pass, merge the new branch to master and close the issue. If the CI tests do not pass, request an amended PR from the original author or resolve the failures as appropriate.
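For steps 1 and 2 above, a minimal command-line sketch (the branch name is a placeholder):

# Create the intermediate branch from master and push it to GitHub
git checkout -b from-<name-of-the-branch> master
git push origin from-<name-of-the-branch>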

    - + \ No newline at end of file diff --git a/docs/v9.0.0/development/integration-tests/index.html b/docs/v9.0.0/development/integration-tests/index.html index eb00318bb5b..9a288f85f28 100644 --- a/docs/v9.0.0/development/integration-tests/index.html +++ b/docs/v9.0.0/development/integration-tests/index.html @@ -5,7 +5,7 @@ Integration Tests | Cumulus Documentation - + @@ -19,7 +19,7 @@ in the commit message.

    If you create a new stack and want to be able to run integration tests against it in CI, you will need to add it to bamboo/select-stack.js.

    - + \ No newline at end of file diff --git a/docs/v9.0.0/development/quality-and-coverage/index.html b/docs/v9.0.0/development/quality-and-coverage/index.html index 8f81f5c4dee..69a6b607496 100644 --- a/docs/v9.0.0/development/quality-and-coverage/index.html +++ b/docs/v9.0.0/development/quality-and-coverage/index.html @@ -5,7 +5,7 @@ Code Coverage and Quality | Cumulus Documentation - + @@ -23,7 +23,7 @@ here.

    To run linting on the markdown files, run npm run lint-md.

    Audit

    This project uses audit-ci to run a security audit on the package dependency tree. This must pass prior to merge. The configured rules for audit-ci can be found here.

    To execute an audit, run npm run audit.

    - + \ No newline at end of file diff --git a/docs/v9.0.0/development/release/index.html b/docs/v9.0.0/development/release/index.html index 9c5b8a00af0..90be6298b95 100644 --- a/docs/v9.0.0/development/release/index.html +++ b/docs/v9.0.0/development/release/index.html @@ -5,7 +5,7 @@ Versioning and Releases | Cumulus Documentation - + @@ -13,7 +13,7 @@
    Version: v9.0.0

    Versioning and Releases

    Versioning

We use a global versioning approach, meaning version numbers in Cumulus are consistent across all packages and tasks, and semantic versioning to track major, minor, and patch versions (e.g. 1.0.0). We use Lerna to manage our versioning. Any change will force Lerna to increment the version of all packages.

Read more about semantic versioning here.

    Pre-release testing

    Note: This is only necessary when preparing a release for a new major version of Cumulus (e.g. preparing to go from 6.x.x to 7.0.0)

    Before releasing a new major version of Cumulus, we should test the deployment upgrade path from the latest release of Cumulus to the upcoming release.

    It is preferable to use the cumulus-template-deploy repo for testing the deployment, since that repo is the officially recommended deployment configuration for end users.

    You should create an entirely new deployment for this testing to replicate the end user upgrade path. Using an existing test or CI deployment would not be useful because that deployment may already have been deployed with the latest changes and not match the upgrade path for end users.

    Pre-release testing steps:

    1. Checkout the cumulus-template-deploy repo

    2. Update the deployment code to use the latest release artifacts if it wasn't done already. For example, assuming that the latest release was 5.0.1, update the deployment files as follows:

      # in data-persistence-tf/main.tf
      source = "https://github.com/nasa/cumulus/releases/download/v5.0.1/terraform-aws-cumulus.zip//tf-modules/data-persistence"

      # in cumulus-tf/main.tf
      source = "https://github.com/nasa/cumulus/releases/download/v5.0.1/terraform-aws-cumulus.zip//tf-modules/cumulus"
    3. For both the data-persistence-tf and cumulus-tf modules:

      1. Add the necessary backend configuration (terraform.tf) and variables (terraform.tfvars)
        • You should use an entirely new deployment for this testing, so make sure to use values for key in terraform.tf and prefix in terraform.tfvars that don't collide with existing deployments
      2. Run terraform init
      3. Run terraform apply
    4. Checkout the master branch of the cumulus repo

    5. Run a full bootstrap of the code: npm run bootstrap

    6. Build the pre-release artifacts: ./bamboo/create-release-artifacts.sh

    7. For both the data-persistence-tf and cumulus-tf modules:

      1. Update the deployment to use the built release artifacts:

        # in data-persistence-tf/main.tf
        source = "[path]/cumulus/terraform-aws-cumulus.zip//tf-modules/data-persistence"

        # in cumulus-tf/main.tf
        source = "/Users/mboyd/development/cumulus/terraform-aws-cumulus.zip//tf-modules/cumulus"
      2. Review the CHANGELOG.md for any pre-deployment migration steps. If there are, go through the steps and confirm that they are successful

      3. Run terraform init

      4. Run terraform apply

    8. Review the CHANGELOG.md for any post-deployment migration steps and confirm that they are successful

    9. Delete your test deployment by running terraform destroy in cumulus-tf and data-persistence-tf

    Updating Cumulus version and publishing to NPM

    1. Create a branch for the new release

    From Master

    Create a branch titled release-MAJOR.MINOR.x for the release.

    git checkout -b release-MAJOR.MINOR.x

If creating a new major version release from master, say 5.0.0, then the branch would be named release-5.0.x. If creating a new minor version release from master, say 1.14.0, then the branch would be named release-1.14.x.

    Having a release branch for each major/minor version allows us to easily backport patches to that version.

    Push the release-MAJOR.MINOR.x branch to GitHub if it was created locally. (Commits should be even with master at this point.)

    If creating a patch release, you can check out the existing base branch.

    Then create the release branch (e.g. release-1.14.0) from the minor version base branch. For example, from the release-1.14.x branch:

    git checkout -b release-1.14.0

    Backporting

    When creating a backport, a minor version base branch should already exist on GitHub. Check out the existing minor version base branch then create a release branch from it. For example:

    # check out existing minor version base branch
    git checkout release-1.14.x
    # create new release branch for backport
    git checkout -b release-1.14.1

    2. Update the Cumulus version number

    When changes are ready to be released, the Cumulus version number must be updated.

    Lerna handles the process of deciding which version number should be used as long as the developer specifies whether the change is a major, minor, or patch change.

    To update Cumulus's version number run:

    npm run update

    Screenshot of terminal showing interactive prompt from Lerna for selecting the new release version

    Lerna will handle updating the packages and all of the dependent package version numbers. If a dependency has not been changed with the update, however, lerna will not update the version of the dependency.

Note: Lerna will struggle to correctly update the versions of any non-standard/alpha versions (e.g. 1.17.0-alpha0). Please be sure to check any packages that are new or have been manually published since the previous release, and any packages that list them as a dependency, to ensure the listed versions are correct. It's useful to use the search feature of your code editor or grep to see if there are any references to outdated package versions.

    3. Check Cumulus Dashboard PRs for Version Bump

    There may be unreleased changes in the Cumulus Dashboard project that rely on this unreleased Cumulus Core version.

If there is a PR in the cumulus-dashboard repo with a name containing "Version Bump for Next Cumulus API Release":

• There will be a placeholder change-me value that should be replaced with the to-be-released Cumulus Core version.
    • Mark that PR as ready to be reviewed.

    4. Update CHANGELOG.md

    Update the CHANGELOG.md. Put a header under the Unreleased section with the new version number and the date.

    Add a link reference for the github "compare" view at the bottom of the CHANGELOG.md, following the existing pattern. This link reference should create a link in the CHANGELOG's release header to changes in the corresponding release.

    5. Update DATA_MODEL_CHANGELOG.md

    Similar to #4, make sure the DATA_MODEL_CHANGELOG is updated if there are data model changes in the release, and the link reference at the end of the document is updated as appropriate.

6. Update CONTRIBUTORS.md

    ./bin/update-contributors.sh
    git add CONTRIBUTORS.md

    Commit and push these changes, if any.

7. Update Cumulus package API documentation

    Update auto-generated API documentation for any Cumulus packages that have it:

    npm run docs-build-packages

    Commit and push these changes, if any.

8. Cut new version of Cumulus Documentation

    If this is a backport, do not create a new version of the documentation. For various reasons, we do not merge backports back to master, other than changelog notes. Documentation changes for backports will not be published to our documentation website.

    cd website
    npm run version ${release_version}
    git add .

    Where ${release_version} corresponds to the version tag v1.2.3, for example.

    Commit and push these changes.

9. Create a pull request against the minor version branch

    1. Push the release branch (e.g. release-1.2.3) to GitHub.

    2. Create a PR against the minor version base branch (e.g. release-1.2.x).

    3. Configure Bamboo to run automated tests against this PR by finding the branch plan for the release branch (release-1.2.3) and setting only these variables:

      • GIT_PR: true
      • SKIP_AUDIT: true

      IMPORTANT: Do NOT set the PUBLISH_FLAG variable to true for this branch plan. The actual publishing of the release will be handled by a separate, manually triggered branch plan.

      Screenshot of Bamboo CI interface showing the configuration of the GIT_PR branch variable to have a value of &quot;true&quot;

    4. Verify that the Bamboo build for the PR succeeds and then merge to the minor version base branch (release-1.2.x).

      • It is safe to do a squash merge in this instance, but not required
    5. You may delete your release branch (release-1.2.3) after merging to the base branch.

10. Create a git tag for the release

    Check out the minor version base branch now that your changes are merged in and do a git pull.

    Ensure you are on the latest commit.

    Create and push a new git tag:

    git tag -a v1.x.x -m "Release 1.x.x"
    git push origin v1.x.x

11. Publishing the release

    Publishing of new releases is handled by a custom Bamboo branch plan and is manually triggered.

    The reasons for using a separate branch plan to handle releases instead of the branch plan for the minor version (e.g. release-1.2.x) are:

    • The Bamboo build for the minor version release branch is triggered automatically on any commits to that branch, whereas we want to manually control when the release is published.
    • We want to verify that integration tests have passed on the Bamboo build for the minor version release branch before we manually trigger the release, so that we can be sure that our code is safe to release.

    If this is a new minor version branch, then you will need to create a new Bamboo branch plan for publishing the release following the instructions below:

    Creating a Bamboo branch plan for the release

    • In the Cumulus Core project (https://ci.earthdata.nasa.gov/browse/CUM-CBA), click Actions -> Configure Plan in the top right.

    • Next to Plan branch click the rightmost button that displays Create Plan Branch upon hover.

    • Click Create plan branch manually.

• Add the values in that list. Choose a display name that makes it very clear this is a deployment branch plan. Release (minor version branch name) seems to work well (e.g. Release (1.2.x)).

      • Make sure you enter the correct branch name (e.g. release-1.2.x).
    • Important Deselect Enable Branch - if you do not do this, it will immediately fire off a build.

• Do immediately: On the Branch Details page, enable Change trigger and set the Trigger type to manual; this will prevent commits to the branch from triggering the build plan. You should have been redirected to the Branch Details tab after creating the plan. If not, navigate to the branch from the list where you clicked Create Plan Branch in the previous step.

• Go to the Variables tab. Ensure that you are on your branch plan and not the master plan: You should not see a large list of configured variables, but instead a dropdown allowing you to select variables to override, and the tab title will be Branch Variables. Then set the branch variables as follows:

      • DEPLOYMENT: cumulus-from-npm-tf (except in special cases such as incompatible backport branches)
        • If this variable is not set, it will default to the deployment name for the last committer on the branch
      • USE_CACHED_BOOTSTRAP: false
      • USE_TERRAFORM_ZIPS: true (IMPORTANT: MUST be set in order to run integration tests against the .zip files published during the build so that we are actually testing our released files)
      • GIT_PR: true
      • SKIP_AUDIT: true
      • PUBLISH_FLAG: true
    • Enable the branch from the Branch Details page.

    • Run the branch using the Run button in the top right.

    Bamboo will build and run lint, audit and unit tests against that tagged release, publish the new packages to NPM, and then run the integration tests using those newly released packages.

12. Create a new Cumulus release on GitHub

The CI release scripts will automatically create a GitHub release based on the release version tag, as well as upload artifacts to the GitHub release for the Terraform modules provided by Cumulus. The Terraform release artifacts include:

    • A multi-module Terraform .zip artifact containing filtered copies of the tf-modules, packages, and tasks directories for use as Terraform module sources.
• An S3 replicator module
    • A workflow module
    • A distribution API module
    • An ECS service module

Just make sure to verify the appropriate .zip files are present on GitHub after the release process is complete.

13. Merge base branch back to master

    Finally, you need to reproduce the version update changes back to master.

    If this is the latest version, you can simply create a PR to merge the minor version base branch back to master.

    IMPORTANT: Do not squash this merge. Doing so will make the "compare" view from step 4 show an incorrect diff, because the tag is linked to a specific commit on the base branch.

    If this is a backport, you will need to create a PR that ports the changelog updates back to master. It is important in this changelog note to call it out as a backport. For example, fixes in backport version 1.14.5 may not be available in 1.15.0 because the fix was introduced in 1.15.3.

    Troubleshooting

    Delete and regenerate the tag

    To delete a published tag to re-tag, follow these steps:

      git tag -d v1.x.x
    git push -d origin v1.x.x
    - + \ No newline at end of file diff --git a/docs/v9.0.0/docs-how-to/index.html b/docs/v9.0.0/docs-how-to/index.html index 2717002c9fa..e6816f616ee 100644 --- a/docs/v9.0.0/docs-how-to/index.html +++ b/docs/v9.0.0/docs-how-to/index.html @@ -5,13 +5,13 @@ Cumulus Documentation: How To's | Cumulus Documentation - +
    Version: v9.0.0

    Cumulus Documentation: How To's

    Cumulus Docs Installation

    Run a Local Server

    Environment variables DOCSEARCH_API_KEY and DOCSEARCH_INDEX_NAME must be set for search to work. At the moment, search is only truly functional on prod because that is the only website we have registered to be indexed with DocSearch (see below on search).

    git clone git@github.com:nasa/cumulus
    cd cumulus
    npm run docs-install
    npm run docs-serve

    Note: docs-build will build the documents into website/build.

    Cumulus Documentation

Our project documentation is hosted on GitHub Pages. The resources published to this website are housed in the docs/ directory at the top of the Cumulus repository. Those resources primarily consist of markdown files and images.

    We use the open-source static website generator Docusaurus to build html files from our markdown documentation, add some organization and navigation, and provide some other niceties in the final website (search, easy templating, etc.).

    Add a New Page and Sidebars

    Adding a new page should be as simple as writing some documentation in markdown, placing it under the correct directory in the docs/ folder and adding some configuration values wrapped by --- at the top of the file. There are many files that already have this header which can be used as reference.

    ---
    id: doc-unique-id # unique id for this document. This must be unique across ALL documentation under docs/
    title: Title Of Doc # Whatever title you feel like adding. This will show up as the index to this page on the sidebar.
    hide_title: false
    ---

    Note: To have the new page show up in a sidebar the designated id must be added to a sidebar in the website/sidebars.js file. Docusaurus has an in depth explanation of sidebars here.

    Versioning Docs

    We lean heavily on Docusaurus for versioning. Their suggestions and walkthrough can be found here. It is worth noting that we would like the Documentation versions to match up directly with release versions. Cumulus versioning is explained in the Versioning Docs.

Search

Search on our documentation site is taken care of by DocSearch. We have been provided with an apiKey and an indexName by DocSearch that we include in our website/siteConfig.js file. The rest, indexing and actual searching, we leave to DocSearch. Our builds expect environment variables for both of these values to exist: DOCSEARCH_API_KEY and DOCSEARCH_INDEX_NAME.
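For a local build with search enabled, these variables can be exported before running the build; a minimal sketch with placeholder values:

export DOCSEARCH_API_KEY=<api-key>
export DOCSEARCH_INDEX_NAME=<index-name>
npm run docs-build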

    Add a new task

The tasks list in docs/tasks.md is generated from the list of task packages in the tasks folder. Do not edit the docs/tasks.md file directly.

    Read more about adding a new task.

    Editing the tasks.md header or template

    Look at the bin/build-tasks-doc.js and bin/tasks-header.md files to edit the output of the tasks build script.

    Editing diagrams

    For some diagrams included in the documentation, the raw source is included in the docs/assets/raw directory to allow for easy updating in the future:

    • assets/interfaces.svg -> assets/raw/interfaces.drawio (generated using draw.io)

    Deployment

The master branch is automatically built and deployed to the gh-pages branch. The gh-pages branch is served by GitHub Pages. Do not make edits to the gh-pages branch.

    - + \ No newline at end of file diff --git a/docs/v9.0.0/external-contributions/index.html b/docs/v9.0.0/external-contributions/index.html index e435b32c15a..19c559e8e16 100644 --- a/docs/v9.0.0/external-contributions/index.html +++ b/docs/v9.0.0/external-contributions/index.html @@ -5,13 +5,13 @@ External Contributions | Cumulus Documentation - +
    Version: v9.0.0

    External Contributions

    Contributions to Cumulus may be made in the form of PRs to the repositories directly or through externally developed tasks and components. Cumulus is designed as an ecosystem that leverages Terraform deployments and AWS Step Functions to easily integrate external components.

    This list may not be exhaustive and represents components that are open source, owned externally, and that have been tested with the Cumulus system. For more information and contributing guidelines, visit the respective GitHub repositories.

    Distribution

    The ASF Thin Egress App is used by Cumulus for distribution. TEA can be deployed with Cumulus or as part of other applications to distribute data.

    Operational Cloud Recovery Archive (ORCA)

    ORCA can be deployed with Cumulus to provide a customizable baseline for creating and managing operational backups.

    Workflow Tasks

    CNM

    PO.DAAC provides two workflow tasks to be used with the Cloud Notification Mechanism (CNM) Schema: CNM to Granule and CNM Response.

    See the CNM workflow data cookbook for an example of how these can be used in a Cumulus ingest workflow.

    DMR++ Generation

GHRC has provided a DMR++ Generation workflow task. This task is meant to be used in conjunction with Cumulus' Hyrax Metadata Updates workflow task.

    - + \ No newline at end of file diff --git a/docs/v9.0.0/faqs/index.html b/docs/v9.0.0/faqs/index.html index c2c23e9a6ff..43ac7398a1d 100644 --- a/docs/v9.0.0/faqs/index.html +++ b/docs/v9.0.0/faqs/index.html @@ -5,13 +5,13 @@ Frequently Asked Questions | Cumulus Documentation - +
    Version: v9.0.0

    Frequently Asked Questions

    Below are some commonly asked questions that you may encounter that can assist you along the way when working with Cumulus.

    General

    How do I deploy a new instance in Cumulus?

    Answer: For steps on the Cumulus deployment process go to How to Deploy Cumulus.

    What prerequisites are needed to setup Cumulus?

    Answer: You will need access to the AWS console and an Earthdata login before you can deploy Cumulus.

    What is the preferred web browser for the Cumulus environment?

    Answer: Our preferred web browser is the latest version of Google Chrome.

    How do I quickly troubleshoot an issue in Cumulus?

    Answer: To troubleshoot and fix issues in Cumulus reference our recommended solutions in Troubleshooting Cumulus.

    Where can I get support help?

    Answer: The following options are available for assistance:

    • Cumulus: Outside NASA users should file a GitHub issue and inside NASA users should file a JIRA issue.
    • AWS: You can create a case in the AWS Support Center, accessible via your AWS Console.

    Integrators & Developers

    What is a Cumulus integrator?

Answer: Those who work within Cumulus and AWS to manage deployments and workflows. They may perform the following functions:

    • Configure and deploy Cumulus to the AWS environment
    • Configure Cumulus workflows
    • Write custom workflow tasks
    What are the steps if I run into an issue during deployment?

    Answer: If you encounter an issue with your deployment go to the Troubleshooting Deployment guide.

    What is a Cumulus workflow?

    Answer: A workflow is a provider-configured set of steps that describe the process to ingest data. Workflows are defined using AWS Step Functions. For more details, we suggest visiting here.

    How do I set up a Cumulus workflow?

    Answer: You will need to create a provider, have an associated collection (add a new one), and generate a new rule first. Then you can set up a Cumulus workflow by following these steps here.

    What are the common use cases that a Cumulus integrator encounters?

    Answer: The following are some examples of possible use cases you may see:


    Operators

    What is a Cumulus operator?

Answer: Those who ingest, archive, and troubleshoot datasets (called collections in Cumulus). Your daily activities might include, but are not limited to, the following:

    • Ingesting datasets
    • Maintaining historical data ingest
    • Starting and stopping data handlers
    • Managing collections
    • Managing provider definitions
    • Creating, enabling, and disabling rules
    • Investigating errors for granules and deleting or re-ingesting granules
    • Investigating errors in executions and isolating failed workflow step(s)
    What are the common use cases that a Cumulus operator encounters?

    Answer: The following are some examples of possible use cases you may see:

    Can you re-run a workflow execution in AWS?

    Answer: Yes. For steps on how to re-run a workflow execution go to Re-running workflow executions in the Cumulus Operator Docs.

    - + \ No newline at end of file diff --git a/docs/v9.0.0/features/ancillary_metadata/index.html b/docs/v9.0.0/features/ancillary_metadata/index.html index 124f7c4cfe3..87217a40182 100644 --- a/docs/v9.0.0/features/ancillary_metadata/index.html +++ b/docs/v9.0.0/features/ancillary_metadata/index.html @@ -5,7 +5,7 @@ Ancillary Metadata Export | Cumulus Documentation - + @@ -13,7 +13,7 @@
    Version: v9.0.0

    Ancillary Metadata Export

    This feature utilizes the type key on a files object in a Cumulus granule. It uses the key to provide a mechanism where granule discovery, processing and other tasks can set and use this value to facilitate metadata export to CMR.

    Tasks setting type

    Discover Granules

Uses the Collection type key to set the value for files on discovered granules in its output.

    Parse PDR

    Uses a task-specific mapping to map PDR 'FILE_TYPE' to a CNM type to set type on granules from the PDR.

    CNMToCMALambdaFunction

    Natively supports types that are included in incoming messages to a CNM Workflow.

    Tasks using type

    Move Granules

    Uses the granule file type key to update UMM/ECHO 10 CMR files passed in as candidates to the task. This task adds the external facing URLs to the CMR metadata file based on the type. See the file tracking data cookbook for a detailed mapping. If a non-CNM type is specified, the task assumes it is a 'data' file.

    - + \ No newline at end of file diff --git a/docs/v9.0.0/features/backup_and_restore/index.html b/docs/v9.0.0/features/backup_and_restore/index.html index f444efa1e44..442f9b1860c 100644 --- a/docs/v9.0.0/features/backup_and_restore/index.html +++ b/docs/v9.0.0/features/backup_and_restore/index.html @@ -5,7 +5,7 @@ Cumulus Backup and Restore | Cumulus Documentation - + @@ -71,7 +71,7 @@ utilize the new cluster/security groups and redeploy.

    DynamoDB

    Backup and Restore with AWS

    You can enable point-in-time recovery (PITR) as well as create an on-demand backup for your Amazon DynamoDB tables.

    PITR provides continuous backups of your DynamoDB table data. PITR can be enabled through your Terraform deployment, the AWS console, or the AWS API. When enabled, DynamoDB maintains continuous backups of your table up to the last 35 days. You can recover a copy of that table to a previous state at any point in time from the moment you enable PITR, up to a maximum of the 35 preceding days. PITR provides continuous backups until you explicitly disable it.

    On-demand backups allow you to create backups of DynamoDB table data and its settings. You can initiate an on-demand backup at any time with a single click from the AWS Management Console or a single API call. You can restore the backups to a new DynamoDB table in the same AWS Region at any time.

    PITR gives your DynamoDB tables continuous protection from accidental writes and deletes. With PITR, you do not have to worry about creating, maintaining, or scheduling backups. You enable PITR on your table and your backup is available for restore at any point in time from the moment you enable it, up to a maximum of the 35 preceding days. For example, imagine a test script writing accidentally to a production DynamoDB table. You could recover your table to any point in time within the last 35 days.

    On-demand backups help with long-term archival requirements for regulatory compliance. On-demand backups give you full-control of managing the lifecycle of your backups, from creating as many backups as you need to retaining these for as long as you need.

    Enabling PITR during deployment

    By default, the Cumulus data-persistence module enables PITR on the default tables listed in the module's variable defaults for enable_point_in_time_tables. At the time of writing, that list includes:

    • AsyncOperationsTable
    • CollectionsTable
    • ExecutionsTable
    • FilesTable
    • GranulesTable
    • PdrsTable
    • ProvidersTable
    • RulesTable

    If you wish to change this list, simply update your deployment's data_persistence module (here in the template-deploy repository) to pass the correct list of tables.

    Restoring with PITR

    Restoring a full deployment

If your deployment has been deleted, all of your tables with PITR enabled will have had backups created automatically. You can locate these backups in the AWS console on the DynamoDB Backups page or through the CLI by running:

    aws dynamodb list-backups --backup-type SYSTEM

    You can restore your tables to your AWS account using the following command:

    aws dynamodb restore-table-from-backup --target-table-name <prefix>-CollectionsTable --backup-arn <backup-arn>

    Where prefix matches the prefix from your data-persistence deployment. backup-arn can be found in the AWS console or by listing the backups using the command above.
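For example, to find the automatic backup for a single table (the prefix is a placeholder):

# List automatic (SYSTEM) backups for one table only
aws dynamodb list-backups --backup-type SYSTEM --table-name <prefix>-CollectionsTable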

    This will restore your tables to AWS. They will need to be linked to your Terraform deployment. After terraform init and before terraform apply, run the following command for each table:

    terraform import module.data_persistence.aws_dynamodb_table.collections_table <prefix>-CollectionsTable

    replacing collections_table with the table identifier in the DynamoDB Terraform table definitions.

Terraform will now manage these tables as part of the Terraform state. Run terraform apply to generate the rest of the data-persistence deployment and then follow the instructions to deploy the cumulus deployment as normal.

    At this point the data will be in DynamoDB, but not in Elasticsearch, so nothing will be returned on the Operator dashboard or through Operator API calls. To get the data into Elasticsearch, run an index-from-database operation via the Operator API. The status of this operation can be viewed on the dashboard. When Elasticsearch is switched to the recovery index the data will be visible on the dashboard and available via the Operator API.

    Restoring an individual table

    A table can be restored to a previous state using PITR. This is easily achievable via the AWS Console by visiting the Backups tab for the table.

    A table can only be recovered to a new table name. Following the restoration of the table, the new table must be imported into Terraform.
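Outside of the console, the same restore can be performed with the AWS CLI. A sketch, where the table names and timestamp are placeholders:

# Restore a table to a new table name at a specific point in time
aws dynamodb restore-table-to-point-in-time \
  --source-table-name <prefix>-CollectionsTable \
  --target-table-name <prefix>-CollectionsTable-restored \
  --restore-date-time "2021-04-01T00:00:00Z"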

    First, remove the old table from the Terraform state:

    terraform state rm module.data_persistence.aws_dynamodb_table.collections_table

    replacing collections_table with the table identifier in the DynamoDB Terraform table definitions.

    Then import the new table into the Terraform state:

    terraform import module.data_persistence.aws_dynamodb_table.collections_table <new-table-name>

    replacing collections_table with the table identifier in the DynamoDB Terraform table definitions.

    Your data-persistence and cumulus deployments should be redeployed so that your instance of Cumulus uses this new table. After the deployment, your Elasticsearch instance will be out of sync with your new table if there is any change in data. To resync your Elasticsearch with your database run an index-from-database operation via the Operator API. The status of this operation can be viewed on the dashboard. When Elasticsearch is switched to the new index the DynamoDB tables and Elasticsearch instance will be in sync and the correct data will be reflected on the dashboard.

    Backup and Restore with cumulus-api CLI

The cumulus-api CLI also includes backup and restore commands. The CLI backup command downloads the content of any of your DynamoDB tables to .json files. You can also use these .json files to restore the records to another DynamoDB table.

    Backup with the CLI

    To backup a table with the CLI, install the @cumulus/api package using npm, making sure to install the same version as your Cumulus deployment:

    npm install -g @cumulus/api@version

    Then run:

    cumulus-api backup --table <table-name>

The backup will be stored at backups/<table-name>.json.

    Restore with the CLI

To restore data from a JSON file, run the following command:

    cumulus-api restore backups/<table-name>.json --table <table-name>

    The restore can go to the in-use table and will update Elasticsearch. If an existing record exists in the table it will not be duplicated but will be updated with the record from the restore file.

    Data Backup and Restore

    Cumulus provides no core functionality to backup data stored in S3. Data disaster recovery is being developed in a separate effort here.

    - + \ No newline at end of file diff --git a/docs/v9.0.0/features/data_in_dynamodb/index.html b/docs/v9.0.0/features/data_in_dynamodb/index.html index 7c938b794ca..c6f0fc29463 100644 --- a/docs/v9.0.0/features/data_in_dynamodb/index.html +++ b/docs/v9.0.0/features/data_in_dynamodb/index.html @@ -5,13 +5,13 @@ Cumulus Metadata in DynamoDB | Cumulus Documentation - +
    Version: v9.0.0

    Cumulus Metadata in DynamoDB

    @cumulus/api uses a number of methods to preserve the metadata generated in a Cumulus instance.

All configuration and system-generated metadata is stored in DynamoDB tables, except for logs. System logs are stored in the AWS CloudWatch service.

    Amazon DynamoDB stores three geographically distributed replicas of each table to enable high availability and data durability. Amazon DynamoDB runs exclusively on solid-state drives (SSDs). SSDs help AWS achieve the design goals of predictable low-latency response times for storing and accessing data at any scale.

    DynamoDB Auto Scaling

    Cumulus deployed tables from the data-persistence module are set to on-demand mode.

    - + \ No newline at end of file diff --git a/docs/v9.0.0/features/dead_letter_queues/index.html b/docs/v9.0.0/features/dead_letter_queues/index.html index caaf3dc7b23..cceb9635d47 100644 --- a/docs/v9.0.0/features/dead_letter_queues/index.html +++ b/docs/v9.0.0/features/dead_letter_queues/index.html @@ -5,13 +5,13 @@ Dead Letter Queues | Cumulus Documentation - +
    Version: v9.0.0

    Dead Letter Queues

    startSF SQS queue

The workflow-trigger for the startSF queue has a Redrive Policy set up that directs any failed attempts to pull from the workflow start queue to an SQS Dead Letter Queue.

    This queue can then be monitored for failures to initiate a workflow. Please note that workflow failures will not show up in this queue, only repeated failure to trigger a workflow.

    Named Lambda Dead Letter Queues

    Cumulus provides configured Dead Letter Queues (DLQ) for non-workflow Lambdas (such as ScheduleSF) to capture Lambda failures for further processing.

These DLQs are set up with the following configuration:

      receive_wait_time_seconds  = 20
    message_retention_seconds = 1209600
    visibility_timeout_seconds = 60

    Default Lambda Configuration

The following built-in Cumulus Lambdas are set up with DLQs to allow handling of process failures:

    • dbIndexer (Updates Elasticsearch based on DynamoDB events)
    • EmsIngestReport (Daily EMS ingest report generation Lambda)
    • JobsLambda (writes logs outputs to Elasticsearch)
    • ScheduleSF (the SF Scheduler Lambda that places messages on the queue that is used to start workflows, see Workflow Triggers)
    • publishReports (Lambda that publishes messages to the SNS topics for execution, granule and PDR reporting)
    • reportGranules, reportExecutions, reportPdrs (Lambdas responsible for updating records based on messages in the queues published by publishReports)

    Troubleshooting/Utilizing messages in a Dead Letter Queue

    Ideally an automated process should be configured to poll the queue and process messages off a dead letter queue.

To aid in manual troubleshooting, you can use the SQS Management console to view the messages available in the queues set up for a particular stack. The dead letter queues will have a Message Body containing the Lambda payload, as well as Message Attributes that reference both the error returned and a RequestID which can be cross-referenced with the associated Lambda's CloudWatch logs for more information:

    Screenshot of the AWS SQS console showing how to view SQS message attributes
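As an alternative to the console, a hedged sketch of pulling messages and their attributes from a dead letter queue with the AWS CLI (the queue name here is hypothetical; use the DLQ name from your stack):

# Look up the queue URL, then read up to 10 messages with their attributes
QUEUE_URL=$(aws sqs get-queue-url --queue-name <prefix>-ScheduleSFDeadLetterQueue --query QueueUrl --output text)
aws sqs receive-message \
  --queue-url "$QUEUE_URL" \
  --max-number-of-messages 10 \
  --message-attribute-names All \
  --attribute-names All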

    - + \ No newline at end of file diff --git a/docs/v9.0.0/features/distribution-metrics/index.html b/docs/v9.0.0/features/distribution-metrics/index.html index 6d6041cb732..15f623becbe 100644 --- a/docs/v9.0.0/features/distribution-metrics/index.html +++ b/docs/v9.0.0/features/distribution-metrics/index.html @@ -5,13 +5,13 @@ Cumulus Distribution Metrics | Cumulus Documentation - +
    Version: v9.0.0

    Cumulus Distribution Metrics

    It is possible to configure Cumulus and the Cumulus Dashboard to display information about the successes and failures of requests for data. This requires the Cumulus instance to deliver Cloudwatch Logs and S3 Server Access logs to an ELK stack.

    ESDIS Metrics in NGAP

Work with the ESDIS metrics team to set up permissions and access to forward Cloudwatch Logs to a shared AWS:Logs:Destination, as well as to transfer your S3 Server Access logs to a metrics team bucket.

    The metrics team has taken care of setting up logstash to ingest the files that get delivered to their bucket into their Elasticsearch instance.

Once Cumulus has been configured to deliver Cloudwatch logs to the ESDIS Metrics team, you can create a Kibana index pattern associated with your Cumulus stack. The metrics team has worked out a convention with the Cumulus developers to ensure access to your stack's logs. The important piece is that the Kibana index pattern is created with the exact name of the prefix (stackName) with which Cumulus was deployed.

    Cumulus / ESDIS Metrics distribution system

    Architecture diagram showing how logs are replicated from a Cumulus instance to the ESDIS Metrics account and accessed by the Cumulus dashboard

    Kibana Index

Before creating the Kibana index, verify that the Elasticsearch instance has been populated with at least one record (see note 1 below). Do this by visiting the Kibana endpoint, selecting Management, Elasticsearch Index Management, and typing the stack's prefix into the search bar. When you see an index with <prefix>-cloudwatch-YYY.MM.dd you are ready to continue. If you don't see at least one index for your stack, check to make sure you are delivering your logs to this Elasticsearch instance.

    Step 1: create the index by selecting Management, Kibana Index Patterns. Use an index pattern of <prefix>-* and continue to the Next step.

    Screenshot of Kibana console showing how to configure an index pattern to target logs from a Cumulus deployment

Step 2: Set the Time Filter field name to @timestamp using the pulldown option. Very importantly, click Show advanced options to create a Custom index pattern ID that is your <prefix>. Then click Create index pattern. This convention allows the dashboard to know which index to use to find the distribution metrics for a particular stack.

    Screenshot of Kibana console showing how to configure settings for an index pattern to target logs from a Cumulus deployment


    1. The Kibana console will not let you create an index if it doesn't match at least one Elasticsearch index.
    - + \ No newline at end of file diff --git a/docs/v9.0.0/features/ems_reporting/index.html b/docs/v9.0.0/features/ems_reporting/index.html index 9beeb34a19b..42b2b205e3a 100644 --- a/docs/v9.0.0/features/ems_reporting/index.html +++ b/docs/v9.0.0/features/ems_reporting/index.html @@ -5,14 +5,14 @@ EMS Reporting | Cumulus Documentation - +
    Version: v9.0.0

    EMS Reporting

    Cumulus reports usage statistics to the ESDIS Metrics System (EMS).

    Collection Configuration

By default, a collection and its related records (Ingest, Distribution, etc.) will be reported to EMS if the collection exists in both Cumulus and CMR. A collection can also be excluded from EMS reporting by setting the collection configuration parameter reportToEms to false. If the collection has already been reported to EMS, it can only be removed manually by the EMS team.

    Types of Reports

    Product Metadata

    Cumulus creates a nightly Product Metadata report. The Product Metadata report provides ancillary information about the products (collections) in Cumulus, and this information is required before EMS can process ingest and distribution reports.

    Ingest

    Cumulus creates three ingest related reports for EMS: Ingest, Archive and Archive Delete.

    The Ingest report contains records of all granules that have been ingested into Cumulus.

The Archive report contains records of all granules that have been archived into Cumulus. It is similar to the Ingest report.

    The Archive Delete report lists granules that were reported to the EMS and now have been deleted from Cumulus.

    A scheduled Lambda task will run nightly that generates Ingest, Archive and Archive Delete reports.

    Distribution

    Cumulus reports all data distribution requests that pass through the distribution API to EMS. In order to track these requests, S3 server access logging must be enabled on all protected buckets.

    You must manually enable logging for each bucket before distribution logging will work, see S3 Server Access Logging.

    A scheduled Lambda task will run nightly that collects distribution events and builds an EMS distribution report.

    Report Submission

Information about requesting an EMS account can be found on the EMS website. The basic steps for submitting reports to EMS are:

    1) Get a provider account on the EMS file server and obtain access to their UAT or OPS environment

Provide IP addresses, the data provider name, and contact information (primary and secondary) to EMS, and EMS will set up the account and firewall rules to allow applications to send files to EMS. For Cumulus instances running on NGAP, the IP address should be the Elastic IP (IPv4 Public IP field) of the NGAP NAT Instance in EC2, and that should be the IP that the EMS firewall sees for any instance in that account.

    2) Request updates on NGAP NACL

For Cumulus instances running on NGAP, submit an NGAP service desk ticket and specify an "Exception / Enhancement" request for "Network / Whitelist" changes to the account; this will add the EMS host IP to the NACL (Network Access Control List) to allow outbound traffic from NGAP Application VPCs to the EMS host.

3) Send the public key to EMS; the Lambda will use the corresponding private key when transferring files to EMS via SFTP

Upload the corresponding private key to S3, using system_bucket as the bucket name and {prefix}/crypto/ems-private.pem as the key; system_bucket and prefix are configured in your deployment's terraform.tfvars file. If a private key file name other than ems-private.pem is used, specify it in the ems_private_key configuration in terraform.tfvars.

4) Create a data manifest file manually and send it to the EMS team; the EMS team will configure the data provider on their side. Example configuration of the data manifest file can be found in Cumulus core's example.

5) Create a data collection to send to EMS. The report will be automatically generated and submitted to EMS. (This step will be removed once CUMULUS-1273 is completed.)

    6) Configure the ems* configuration variables passed to the cumulus terraform module. Example configuration of the ems* variables can be found in Cumulus core's example

    If ems_submit_report is not set to true in the configuration, the reports are still generated in s3://{buckets.internal.name}/{prefix}/ems/{filename} for Product Metadata and Ingest reports, and s3://{buckets.internal.name}/{prefix}/ems-distribution/reports/{filename} for Distribution reports, but they won't be submitted to EMS.
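To verify that reports are being generated, you can list the report locations given above; a sketch using the same placeholder bucket and prefix values:

# List generated Product Metadata/Ingest reports and Distribution reports
aws s3 ls s3://{buckets.internal.name}/{prefix}/ems/ --recursive
aws s3 ls s3://{buckets.internal.name}/{prefix}/ems-distribution/reports/ --recursive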

    Submitted reports will be saved to sent folder.

    Report Status

    1. EMS processes the reports and generates error reports which it sends to the provider's point of contacts.
    2. APEX EMS Reporting system allows users access to ingest, archive, distribution, and error metrics. The user with 'power user' privilege can also view the Data Provider Status and the status of flat files.

The operator can submit an IdMAX request in the NASA Access Management System (NAMS) to get access to the GSFC ESDIS Metrics System (EMS).

    - + \ No newline at end of file diff --git a/docs/v9.0.0/features/execution_payload_retention/index.html b/docs/v9.0.0/features/execution_payload_retention/index.html index f923c43df9c..9d85be19155 100644 --- a/docs/v9.0.0/features/execution_payload_retention/index.html +++ b/docs/v9.0.0/features/execution_payload_retention/index.html @@ -5,13 +5,13 @@ Execution Payload Retention | Cumulus Documentation - +
    Version: v9.0.0

    Execution Payload Retention

In addition to CloudWatch logs and AWS Step Function API records, Cumulus automatically stores the initial and 'final' (the last update to the execution record) payload values as part of the Execution record in DynamoDB and Elasticsearch.

    This allows access via the API (or optionally direct DB/Elasticsearch querying) for debugging/reporting purposes. The data is stored in the "originalPayload" and "finalPayload" fields.

    Payload record cleanup

    To reduce storage requirements, a CloudWatch rule ({stack-name}-dailyExecutionPayloadCleanupRule) triggering a daily run of the provided cleanExecutions lambda has been added. This lambda will remove all 'completed' and 'non-completed' payload records in the database that are older than the specified configuration.

    Configuration

    The following configuration flags have been made available in the cumulus module. They may be overridden in your deployment's instance of the cumulus module by adding the following configuration options:

daily_execution_payload_cleanup_schedule_expression (string)

    This configuration option sets the execution times for this Lambda to run, using a Cloudwatch cron expression.

    Default value is "cron(0 4 * * ? *)".

complete_execution_payload_timeout_disable (bool)

    This configuration option, when set to true, will disable all cleanup of completed execution payloads.

    Default value is false.

complete_execution_payload_timeout (number)

    This flag defines the cleanup threshold for executions with a 'completed' status in days. Records with updatedAt values older than this with payload information will have that information removed.

    Default value is 10.

non_complete_execution_payload_timeout_disable (bool)

    This configuration option, when set to true, will disable all cleanup of "non-complete" (any status other than completed) execution payloads.

    Default value is false.

non_complete_execution_payload_timeout (number)

    This flag defines the cleanup threshold for executions with a status other than 'complete' in days. Records with updateTime values older than this with payload information will have that information removed.

    Default value is 30 days.

    • complete_execution_payload_disable/non_complete_execution_payload_disable

    These flags (true/false) determine if the cleanup script's logic for 'complete' and 'non-complete' executions will run. Default value is false for both.

    - + \ No newline at end of file diff --git a/docs/v9.0.0/features/lambda_versioning/index.html b/docs/v9.0.0/features/lambda_versioning/index.html index 2b556587f5f..d43677465df 100644 --- a/docs/v9.0.0/features/lambda_versioning/index.html +++ b/docs/v9.0.0/features/lambda_versioning/index.html @@ -5,13 +5,13 @@ Lambda Versioning | Cumulus Documentation - +
    Version: v9.0.0

    Lambda Versioning

    Cumulus makes use of AWS's Lambda/Alias version objects to tag and retain references to recent copies of deployed workflow lambdas.

    All Cumulus deployed lambdas in lambdas.yml will have an alias/version resource created. Lambdas with source coming from S3 must be expressly configured to take advantage of versioning.

    A reference to the most current lambda version alias will replace the unversioned lambda resource ARN in all workflows for each task that is either built via Cumulus, or defined via the uniqueIdentifier configuration key for s3 sourced lambdas.

    A configurable number of previously deployed alias/version pairs will be retained to ensure that in-progress workflows are able to complete.

    This allows for workflows to automatically reference the specific version of a lambda function they were deployed with, prevents an updated deployment of an existing lambda from being utilized in an already in-progress workflow, and retains the executed version information in the AWS step function execution record and CloudWatch logs.

    Please note that care must be exercised to not update lambda versions and redeploy frequently enough that an in-progress workflow refers to an aged-off version of a lambda, or workflows that reference such a lambda may fail.

    Please note This feature is not currently compatible with utilizing the layers key in workflow lambdas, as updates/reconfiguration of lambda layers will not result in a new version being created by kes. See CUMULUS-1197 for more information.

    ( See AWS Lambda Function Versioning and Aliases for more on lambda versions/aliases)

    Configuration

    This feature is enabled by default for all Cumulus built/deployed lambdas, as well as s3Source lambdas that are configured as described below. s3Source Lambdas that are not configured will continue to utilize an unqualified reference and will not utilize lambda versioning.

    s3Source Lambda Version Configuration

    Lambdas with s3Source defined currently require additional configuration to make use of this feature in the form of a 'uniqueIdentifier' key:

    SomeLambda:
    Handler: lambda_handler.handler
    timeout: 300
    s3Source:
    bucket: '{{some_bucket}}'
    key: path/some-lambda.zip
    uniqueIdentifier: '5dot2'
    runtime: python2.7

    That key, due to AWS constraints, must be letters ([a-zA-Z]) only.

    Changing Number of Retained Lambdas

    The default number of retained lambda versions is 1.

    This can be overridden by adding the following key to your configuration file:

    maxNumberOfRetainedLambdas: X

    where X is the number of previous versions you wish to retain.

This feature allows a variable number of retained lambdas; however, due to CloudFormation limits and current implementation constraints, that number is fairly limited.

The WorkflowLambdaVersions sub-template is constrained to 200 total resources and can only output 60 aliases back to the master template. As such, the limit on the number of lambdas per template is:

(200 / (2 + 2 * RV)) - 2, where RV = total number of retained versions.
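
For example, retaining 1 version gives (200 / (2 + 2 * 1)) - 2 = 50 - 2 = 48 lambdas, which matches the first entry in the list below.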

Given the available limits, the following are the practical limits on the number of lambdas that can be configured for a given number of retained lambdas:

    • 1: 48

    • 2: 31

    • 3: 23

    Disabling Lambda Versioning

    This feature is enabled by default in the deployment package template, but can be disabled by adding the following key to your app/config.yml:

    useWorkflowLambdaVersions: false

If this feature is disabled, Cumulus will not create alias/version lambda resource objects, the WorkflowLambdaVersions stack will not be created, and the deployed workflow lambda references will be unqualified (always referring to the latest version).

Disabling this feature after deploying a stack with it enabled will remove the WorkflowLambdaVersions stack, remove all Cumulus-defined lambda Version/Alias pairs, and reset all workflows to using an unqualified lambda reference. Workflows in progress with incomplete steps that reference versioned lambdas will fail.

    - + \ No newline at end of file diff --git a/docs/v9.0.0/features/logging-esdis-metrics/index.html b/docs/v9.0.0/features/logging-esdis-metrics/index.html index cce13fa651c..7df01ad973f 100644 --- a/docs/v9.0.0/features/logging-esdis-metrics/index.html +++ b/docs/v9.0.0/features/logging-esdis-metrics/index.html @@ -5,13 +5,13 @@ Writing logs for ESDIS Metrics | Cumulus Documentation - +
    Version: v9.0.0

    Writing logs for ESDIS Metrics

    Note: This feature is only available for Cumulus deployments in NGAP environments.

    Prerequisite: You must configure your Cumulus deployment to deliver your logs to the correct shared logs destination for ESDIS metrics.

    Log messages delivered to the ESDIS metrics logs destination conforming to an expected format will be automatically ingested and parsed to enable helpful searching/filtering of your logs via the ESDIS metrics Kibana dashboard.

    Expected log format

    The ESDIS metrics pipeline expects a log message to be a JSON string representation of an object (dict in Python or map in Java). An example log message might look like:

{
  "level": "info",
  "executions": "arn:aws:states:us-east-1:000000000000:execution:MySfn:abcd1234",
  "granules": "[\"granule-1\",\"granule-2\"]",
  "message": "hello world",
  "sender": "greetingFunction",
  "stackName": "myCumulus",
  "timestamp": "2018-10-19T19:12:47.501Z"
}

    A log message can contain the following properties:

    • executions: The AWS Step Function execution name in which this task is executing, if any
    • granules: A JSON string of the array of granule IDs being processed by this code, if any
    • level: A string identifier for the type of message being logged. Possible values:
      • debug
      • error
      • fatal
      • info
      • warn
      • trace
    • message: String containing your actual log message
    • parentArn: The parent AWS Step Function execution ARN that triggered the current execution, if any
    • sender: The name of the resource generating the log message (e.g. a library name, a Lambda function name, an ECS activity name)
    • stackName: The unique prefix for your Cumulus deployment
    • timestamp: An ISO-8601 formatted timestamp
    • version: The version of the resource generating the log message, if any

None of these properties are explicitly required for ESDIS metrics to parse your log correctly. However, a log without a message has little informational content, and the level, sender, and timestamp properties are very useful for filtering your logs. Including a stackName is also helpful, as it allows you to distinguish between logs generated by different deployments.

    Using Cumulus Message Adapter libraries

If you are writing a custom task that is integrated with the Cumulus Message Adapter, then some of the language-specific client libraries can be used to write logs compatible with ESDIS metrics.

    The usage of each library differs slightly, but in general a logger is initialized with a Cumulus workflow message to determine the contextual information for the task (e.g. granules, executions). Then, after the logger is initialized, writing logs only requires specifying a message, but the logged output will include the contextual information as well.

    Writing logs using custom code

    Any code that produces logs matching the expected log format can be processed by ESDIS metrics.

    Node.js

    Cumulus core provides a @cumulus/logger library that writes logs in the expected format for ESDIS metrics.
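
For illustration, a minimal sketch of what writing such a log might look like; the constructor option and method shown are assumptions modeled on the expected log format above, so consult the @cumulus/logger README for the actual API:

// Sketch only: the constructor option is an assumption based on the log format above.
const Logger = require('@cumulus/logger');

// 'sender' identifies the resource generating the log message.
const log = new Logger({ sender: 'greetingFunction' });

// Writes a JSON log line including level, sender, timestamp, and message fields.
log.info('hello world');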

    - + \ No newline at end of file diff --git a/docs/v9.0.0/features/replay-kinesis-messages/index.html b/docs/v9.0.0/features/replay-kinesis-messages/index.html index 7d2cf54f1ce..b3b7db8082a 100644 --- a/docs/v9.0.0/features/replay-kinesis-messages/index.html +++ b/docs/v9.0.0/features/replay-kinesis-messages/index.html @@ -5,7 +5,7 @@ How to replay Kinesis messages after an outage | Cumulus Documentation - + @@ -13,7 +13,7 @@
    Version: v9.0.0

    How to replay Kinesis messages after an outage

    After a period of outage, it may be necessary for a Cumulus operator to reprocess or 'replay' messages that arrived on an AWS Kinesis Data Stream but did not trigger an ingest. This document serves as an outline on how to start a replay operation, and how to perform status tracking. Cumulus supports replay of all Kinesis messages on a stream (subject to the normal RetentionPeriod constraints), or all messages within a given time slice delimited by start and end timestamps.

    As Kinesis has no comparable field to e.g. the SQS ReceiveCount on its records, Cumulus cannot tell which messages within a given time slice have never been processed, and cannot guarantee only missed messages will be processed. Users will have to rely on duplicate handling or some other method of identifying messages that should not be processed within the time slice.

    NOTE: This operation flow effectively changes only the trigger mechanism for Kinesis ingest notifications. The existence of valid Kinesis-type rules and all other normal requirements for the triggering of ingest via Kinesis still apply.

    Replays endpoint

    Cumulus has added a new endpoint to its API, /replays. This endpoint will allow you to start replay operations and returns an AsyncOperationId for operation status tracking.

    Start a replay

    In order to start a replay, you must perform a POST request to the replays endpoint.

    The required and optional fields that should be part of the body of this request are documented below.

NOTE: The endTimestamp relies on a comparison with the Kinesis server-side ApproximateArrivalTimestamp, and there is no documented level of accuracy for that approximation, so it is recommended that the endTimestamp include some amount of buffer to allow for slight discrepancies. If tolerable, the same is recommended for the startTimestamp, although it is used differently and is less vulnerable to discrepancies, since a server-side arrival timestamp should never be earlier than the client-side request timestamp.

• type (string, required): Currently only accepts kinesis.
• kinesisStream (string, required for type kinesis): Any valid Kinesis stream name (not ARN).
• kinesisStreamCreationTimestamp (optional): Any input valid for a JS Date constructor. For reasons to use this field, see the AWS documentation on StreamCreationTimestamp.
• endTimestamp (optional): Any input valid for a JS Date constructor. Messages newer than this timestamp will be skipped.
• startTimestamp (optional): Any input valid for a JS Date constructor. Messages will be fetched from the Kinesis stream starting at this timestamp. Ignored if it is further in the past than the stream's retention period.
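
Putting these fields together, a request to start a replay might look like the following sketch; the URL, token, stream name, and timestamps are placeholders, and authentication follows the same bearer-token pattern used by the other Cumulus API examples in these docs:

curl --request POST https://example.com/replays \
  --header 'Authorization: Bearer ReplaceWithTheToken' \
  --header 'Content-Type: application/json' \
  --data '{
    "type": "kinesis",
    "kinesisStream": "my-ingest-stream",
    "startTimestamp": "2021-01-01T00:00:00.000Z",
    "endTimestamp": "2021-01-02T00:00:00.000Z"
  }'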

    Status tracking

    A successful response from the /replays endpoint will contain an asyncOperationId field. Use this ID with the /asyncOperations endpoint to track the status.

    - + \ No newline at end of file diff --git a/docs/v9.0.0/features/reports/index.html b/docs/v9.0.0/features/reports/index.html index 6a0d73db233..85056d15939 100644 --- a/docs/v9.0.0/features/reports/index.html +++ b/docs/v9.0.0/features/reports/index.html @@ -5,7 +5,7 @@ Reconciliation Reports | Cumulus Documentation - + @@ -16,7 +16,7 @@ Screenshot of the Dashboard Rconciliation Reports Overview page

    Viewing an inventory report will show a detailed list of collections, granules and files. Screenshot of an Inventory Report page

Viewing a granule not found report will show a list of granules missing data. Screenshot of a Granule Not Found Report page

    API

    The API also allows users to create and view reports. For more extensive API documentation, see the Cumulus API docs.

    Creating a Report via API

    Create a new inventory report with the following:

    curl --request POST https://example.com/reconciliationReports --header 'Authorization: Bearer ReplaceWithToken'

    Example response:

{
  "message": "Report is being generated",
  "status": 202
}

    Retrieving a Report via API

    Once a report has been generated, you can retrieve the full report.

    curl https://example.com/reconciliationReports/inventoryReport-20190305T153430508 --header 'Authorization: Bearer ReplaceWithTheToken'

    Example response:

{
  "reportStartTime": "2019-03-05T15:34:30.508Z",
  "reportEndTime": "2019-03-05T15:34:37.243Z",
  "status": "SUCCESS",
  "error": null,
  "filesInCumulus": {
    "okCount": 40,
    "onlyInS3": [
      "s3://cumulus-test-sandbox-protected/MOD09GQ.A2016358.h13v04.006.2016360104606.cmr.xml",
      "s3://cumulus-test-sandbox-private/BROWSE.MYD13Q1.A2017297.h19v10.006.2017313221201.hdf"
    ],
    "onlyInDynamoDb": [
      {
        "uri": "s3://cumulus-test-sandbox-protected/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
        "granuleId": "MOD09GQ.A2016358.h13v04.006.2016360104606"
      }
    ]
  },
  "collectionsInCumulusCmr": {
    "okCount": 1,
    "onlyInCumulus": [
      "L2_HR_PIXC___000"
    ],
    "onlyInCmr": [
      "MCD43A1___006",
      "MOD14A1___006"
    ]
  },
  "granulesInCumulusCmr": {
    "okCount": 3,
    "onlyInCumulus": [
      {
        "granuleId": "MOD09GQ.A3518809.ln_rVr.006.7962927138074",
        "collectionId": "MOD09GQ___006"
      },
      {
        "granuleId": "MOD09GQ.A8768252.HC4ddD.006.2077696236118",
        "collectionId": "MOD09GQ___006"
      }
    ],
    "onlyInCmr": [
      {
        "GranuleUR": "MOD09GQ.A0002421.oD4zvB.006.4281362831355",
        "ShortName": "MOD09GQ",
        "Version": "006"
      }
    ]
  },
  "filesInCumulusCmr": {
    "okCount": 11,
    "onlyInCumulus": [
      {
        "fileName": "MOD09GQ.A8722843.GTk5A3.006.4026909316904.jpeg",
        "uri": "s3://cumulus-test-sandbox-public/MOD09GQ___006/MOD/MOD09GQ.A8722843.GTk5A3.006.4026909316904.jpeg",
        "granuleId": "MOD09GQ.A8722843.GTk5A3.006.4026909316904"
      }
    ],
    "onlyInCmr": [
      {
        "URL": "https://cumulus-test-sandbox-public.s3.amazonaws.com/MOD09GQ___006/MOD/MOD09GQ.A8722843.GTk5A3.006.4026909316904_ndvi.jpg",
        "Type": "GET DATA",
        "GranuleUR": "MOD09GQ.A8722843.GTk5A3.006.4026909316904"
      }
    ]
  }
}
    - + \ No newline at end of file diff --git a/docs/v9.0.0/getting-started/index.html b/docs/v9.0.0/getting-started/index.html index c5cdef6d5c1..b2d522b030c 100644 --- a/docs/v9.0.0/getting-started/index.html +++ b/docs/v9.0.0/getting-started/index.html @@ -5,13 +5,13 @@ Getting Started | Cumulus Documentation - +
    Version: v9.0.0

    Getting Started

    Overview | Quick Tutorials | Helpful Tips

    Overview

    This serves as a guide for new Cumulus users to deploy and learn how to use Cumulus. Here you will learn what you need in order to complete any prerequisites, what Cumulus is and how it works, and how to successfully navigate and deploy a Cumulus environment.

    What is Cumulus

Cumulus is an open-source set of components for creating cloud-based data ingest, archive, distribution, and management systems, designed for NASA's future Earth Science data streams.

    Who uses Cumulus

Data integrators/developers and operators across many projects, not limited to NASA, use Cumulus in their daily work.

    Cumulus Roles

    Integrator/Developer

    Cumulus integrators/developers are those who work within Cumulus and AWS for deployments and to manage workflows.

    Operator

    Cumulus operators are those who work within Cumulus to ingest/archive data and manage collections.

    Role Guides

As a developer, integrator, or operator, you will need to set up your environments to work in Cumulus. The following docs can get you started in your role-specific activities.

    What is a Cumulus Data Type

    In Cumulus, we have the following types of data that you can create and manage:

    • Collections
    • Granules
    • Providers
    • Rules
    • Workflows
    • Executions
    • Reports

    For details on how to create or manage data types go to Data Management Types.


    Quick Tutorials

    Deployment & Configuration

    Cumulus is deployed to an AWS account, so you must have access to deploy resources to an AWS account to get started.

    1. Deploy Cumulus and Cumulus Dashboard to AWS

    Follow the deployment instructions to deploy Cumulus to your AWS account.

    2. Configure and Run the HelloWorld Workflow

    If you have deployed using the cumulus-template-deploy repository, you have a HelloWorld workflow deployed to your Cumulus backend.

    You can see your deployed workflows on the Workflows page of your Cumulus dashboard.

    Configure a collection and provider using the setup guidance on the Cumulus dashboard.

    Then create a rule to trigger your HelloWorld workflow. You can select a rule type of one time.

    Navigate to the Executions page of the dashboard to check the status of your workflow execution.

    3. Configure a Custom Workflow

    See Developing a custom workflow documentation for adding a new workflow to your deployment.

    There are plenty of workflow examples using Cumulus tasks here. The Data Cookbooks provide a more in-depth look at some of these more advanced workflows and their configurations.

    There is a list of Cumulus tasks already included in your deployment here.

    After configuring your workflow and redeploying, you can configure and run your workflow using the same steps as in step 2.


    Helpful Tips

    Here are some useful tips to keep in mind when deploying or working in Cumulus.

    Integrator/Developer

    • Versioning and Releases: This documentation gives information on our global versioning approach. We suggest upgrading to the supported version for Cumulus, Cumulus dashboard, and Thin Egress App (TEA).
    • Cumulus Developer Documentation: We suggest that you read through and reference this resource for development best practices in Cumulus.
    • Cumulus Deployment: It's good to know how to manually deploy to a Cumulus sandbox environment.
    • Integrator Common Use Cases: Scenarios to help integrators along in the Cumulus environment.

    Operator

    Troubleshooting

    Troubleshooting: Some suggestions to help you troubleshoot and solve issues you may encounter.

    Resources

    - + \ No newline at end of file diff --git a/docs/v9.0.0/glossary/index.html b/docs/v9.0.0/glossary/index.html index d494d15f7d0..bdee1dc38e2 100644 --- a/docs/v9.0.0/glossary/index.html +++ b/docs/v9.0.0/glossary/index.html @@ -5,14 +5,14 @@ Glossary | Cumulus Documentation - +
    Version: v9.0.0

    Glossary

    AWS Glossary

    For terms/items from Amazon/AWS not mentioned in this glossary, please refer to the AWS Glossary.

    Cumulus Glossary of Terms

    API Gateway

    Refers to AWS's API Gateway. Used by the Cumulus API.

    ARN

    Refers to an AWS "Amazon Resource Name".

    For more info, see the AWS documentation.

    AWS

    See: aws.amazon.com

    AWS Lambda/Lambda Function

AWS's 'serverless' option. Allows code to run without provisioning or managing servers/ECS instances/etc.

    For more information, see the AWS Lambda documentation.

    AWS Access Keys

Access credentials that give you access to AWS to act as an IAM user programmatically or from the command line. For more information, see the AWS IAM Documentation.

    Bucket

    An Amazon S3 cloud storage resource.

    For more information, see the AWS Bucket Documentation.

    CloudFormation

    An AWS service that allows you to define and manage cloud resources as a preconfigured block.

    For more information, see the AWS CloudFormation User Guide.

    Cloudformation Template

A template that defines an AWS CloudFormation stack.

    For more information, see the AWS intro page.

    Cloudwatch

An AWS service that provides logging and metrics collection for the cloud resources you have in AWS.

    For more information, see the AWS User Guide.

    Cloud Notification Mechanism (CNM)

    An interface mechanism to support cloud-based ingest messaging. For more information, see PO.DAAC's CNM Schema.

    Common Metadata Repository (CMR)

    "A high-performance, high-quality, continuously evolving metadata system that catalogs Earth Science data and associated service metadata records". For more information, see NASA's CMR page.

    Collection (Cumulus)

    Cumulus Collections are logical sets of data objects of the same data type and version.

    For more information, see cookbook reference page.

    Cumulus Message Adapter (CMA)

    A library designed to help task developers integrate step function tasks into a Cumulus workflow by adapting task input/output into the Cumulus Message format.

    For more information, see CMA workflow reference page.

    Distributed Active Archive Center (DAAC)

    Refers to a specific organization that's part of NASA's distributed system of archive centers. For more information see EOSDIS's DAAC page

    Dead Letter Queue (DLQ)

This refers to Amazon SQS Dead-Letter Queues. These SQS queues are specifically configured to capture failed messages from other services/SQS queues/etc. so that those failed messages can be processed.

    For more on DLQs, see the Amazon Documentation and the Cumulus DLQ feature page.

    Developer

Those who set up deployment and workflow management for Cumulus. Sometimes referred to as an integrator. See integrator.

    ECS

    Amazon's Elastic Container Service. Used in Cumulus by workflow steps that require more flexibility than Lambda can provide.

    For more information, see AWS's developer guide.

    ECS Activity

    An ECS instance run via a Step Function.

    EMS

    ESDIS Metrics System

    Execution (Cumulus)

    A Cumulus execution refers to a single execution of a (Cumulus) Workflow.

    GIBS

    Global Imagery Browse Services

    Granule

    A granule is the smallest aggregation of data that can be independently managed (described, inventoried, and retrieved). Granules are always associated with a collection, which is a grouping of granules. A granule is a grouping of data files.

    IAM

    AWS Identity and Access Management.

    For more information, see AWS IAMs.

    Integrator/Developer

    Those who work within Cumulus and AWS for deployments and to manage workflows.

    Kinesis

    Amazon's platform for streaming data on AWS.

    See AWS Kinesis for more information.

    Lambda

    AWS's cloud service that lets you run code without provisioning or managing servers.

    For more information, see AWS's lambda page.

    Module (Terraform)

    Refers to a terraform module.

    Node

    See node.js.

    Npm

    Node package manager.

    For more information, see npmjs.com.

    Operator

    Those who work within Cumulus to ingest/archive data and manage collections.

PDR

"Product Delivery Record" used in "DAAC Ingest" workflows.

    For more information, see nasa.gov.

    Packages (NPM)

    NPM hosted node.js packages. Cumulus packages can be found on NPM's site here

    Provider

    Data source that generates and/or distributes data for Cumulus workflows to act upon.

    For more information, see the Cumulus documentation.

    Rule

    Rules are configurable scheduled events that trigger workflows based on various criteria.

    For more information, see the Cumulus Rules documentation.

    S3

    Amazon's Simple Storage Service provides data object storage in the cloud. Used in Cumulus to store configuration, data and more.

    For more information, see AWS's s3 page.

    SIPS

    Science Investigator-led Processing Systems. In the context of DAAC ingest, this refers to data producers/providers.

    For more information, see nasa.gov.

    SNS

    Amazon's Simple Notification Service provides a messaging service that allows publication of and subscription to events. Used in Cumulus to trigger workflow events, track event failures, and others.

    For more information, see AWS's SNS page.

    SQS

    Amazon's Simple Queue Service.

    For more information, see AWS's SQS page.

    Stack

    A collection of AWS resources you can manage as a single unit.

In the context of Cumulus, this refers to a deployment of the cumulus and data-persistence modules that is managed by Terraform.

    Step Function

    AWS's web service that allows you to compose complex workflows as a state machine comprised of tasks (Lambdas, activities hosted on EC2/ECS, some AWS service APIs, etc). See AWS's Step Function Documentation for more information. In the context of Cumulus these are the underlying AWS service used to create Workflows.

    Workflows

    Workflows are comprised of one or more AWS Lambda Functions and ECS Activities to discover, ingest, process, manage and archive data.

    - + \ No newline at end of file diff --git a/docs/v9.0.0/index.html b/docs/v9.0.0/index.html index 8b9fdf6f5a7..0f4b12e3512 100644 --- a/docs/v9.0.0/index.html +++ b/docs/v9.0.0/index.html @@ -5,13 +5,13 @@ Introduction | Cumulus Documentation - +
    Version: v9.0.0

    Introduction

The Cumulus project addresses the need for a “native” cloud-based data ingest, archive, distribution, and management system that can be used for all future Earth Observing System Data and Information System (EOSDIS) data streams. The term “native” implies that the system will leverage all components of a cloud infrastructure provided by the vendor for efficiency (in terms of both processing time and cost). Additionally, Cumulus will operate on future data streams involving satellite missions, aircraft missions, and field campaigns.

This documentation includes guidelines, examples, and source code docs. It is accessible at https://nasa.github.io/cumulus.


    Get To Know Cumulus

    • Getting Started - here - If you are new to Cumulus we suggest that you begin with this section to help you understand and work in the environment.
    • General Cumulus Documentation - here <- you're here

    Cumulus Reference Docs

    • Cumulus API Documentation - here
    • Cumulus Developer Documentation - here - READMEs throughout the main repository.
    • Data Cookbooks - here

    Auxiliary Guides

    • Integrator Guide - here
    • Operator Docs - here

    Contributing

    Please refer to: https://github.com/nasa/cumulus/blob/master/CONTRIBUTING.md for information. We thank you in advance.

    - + \ No newline at end of file diff --git a/docs/v9.0.0/integrator-guide/about-int-guide/index.html b/docs/v9.0.0/integrator-guide/about-int-guide/index.html index 593bbdbcbb1..919fba89ed7 100644 --- a/docs/v9.0.0/integrator-guide/about-int-guide/index.html +++ b/docs/v9.0.0/integrator-guide/about-int-guide/index.html @@ -5,13 +5,13 @@ About Integrator Guide | Cumulus Documentation - +
    Version: v9.0.0

    About Integrator Guide

    Purpose

The Integrator Guide supplements the Cumulus documentation and Data Cookbooks. This content is for Cumulus integrators who are either new to the project or need a step-by-step resource to help them along.

    What Is A Cumulus Integrator

    Cumulus integrators are those who work within Cumulus and AWS for deployments and to manage workflows. They may perform the following functions:

    • Configure and deploy Cumulus to the AWS environment
    • Configure Cumulus workflows
    • Write custom workflow tasks
    - + \ No newline at end of file diff --git a/docs/v9.0.0/integrator-guide/int-common-use-cases/index.html b/docs/v9.0.0/integrator-guide/int-common-use-cases/index.html index 51775d0c322..04d2c8d2258 100644 --- a/docs/v9.0.0/integrator-guide/int-common-use-cases/index.html +++ b/docs/v9.0.0/integrator-guide/int-common-use-cases/index.html @@ -5,13 +5,13 @@ Integrator Common Use Cases | Cumulus Documentation - + - + \ No newline at end of file diff --git a/docs/v9.0.0/integrator-guide/workflow-add-new-lambda/index.html b/docs/v9.0.0/integrator-guide/workflow-add-new-lambda/index.html index 9ecb8e46c69..8c7d52b9344 100644 --- a/docs/v9.0.0/integrator-guide/workflow-add-new-lambda/index.html +++ b/docs/v9.0.0/integrator-guide/workflow-add-new-lambda/index.html @@ -5,13 +5,13 @@ Workflow - Add New Lambda | Cumulus Documentation - +
    Version: v9.0.0

    Workflow - Add New Lambda

    You can develop a workflow task in AWS Lambda or Elastic Container Service (ECS). AWS ECS requires Docker. For a list of tasks to use go to our Cumulus Tasks page.

The following steps will help you along as you write a new Lambda that integrates with a Cumulus workflow, and will aid your understanding of the Cumulus Message Adapter (CMA) process.

    Steps

1. Define New Lambda in Terraform (a minimal Terraform sketch is shown after these steps)

    2. Add Task in JSON Object

      For details on how to set up a workflow via CMA go to the CMA Tasks: Message Flow.

      You will need to assign input and output for the new task and follow the CMA contract here. This contract defines how libraries should call the cumulus-message-adapter to integrate a task into an existing Cumulus Workflow.

    3. Verify New Task

      Check the updated workflow in AWS and in Cumulus.
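
As a companion to Step 1, here is a minimal sketch of defining a new workflow Lambda in Terraform, modeled on the aws_lambda_function example in the Troubleshooting docs; the function name, file path, runtime, and role reference are placeholders, not values from your deployment:

resource "aws_lambda_function" "my_new_task" {
  function_name = "${var.prefix}-MyNewTask"                    # placeholder task name
  filename      = "${path.module}/my-new-task/dist/lambda.zip" # path to your packaged source
  handler       = "index.handler"
  runtime       = "nodejs12.x"                 # use the runtime your task is written for
  role          = var.lambda_processing_role   # placeholder: an IAM role ARN from your deployment
}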

    - + \ No newline at end of file diff --git a/docs/v9.0.0/integrator-guide/workflow-ts-failed-step/index.html b/docs/v9.0.0/integrator-guide/workflow-ts-failed-step/index.html index 16ffc7a27a2..ec2ea238366 100644 --- a/docs/v9.0.0/integrator-guide/workflow-ts-failed-step/index.html +++ b/docs/v9.0.0/integrator-guide/workflow-ts-failed-step/index.html @@ -5,13 +5,13 @@ Workflow - Troubleshoot Failed Step(s) | Cumulus Documentation - +
    Version: v9.0.0

    Workflow - Troubleshoot Failed Step(s)

    Steps

1. Locate Step
• Go to the Cumulus dashboard
• Find the granule
• Go to Executions to determine the failed step
2. Investigate in CloudWatch
• Go to CloudWatch
• Locate the lambda
• Search the CloudWatch logs
3. Recreate Error

  In your sandbox environment, try to recreate the error.

4. Resolution

    - + \ No newline at end of file diff --git a/docs/v9.0.0/interfaces/index.html b/docs/v9.0.0/interfaces/index.html index d34e47658bb..545d9c59f98 100644 --- a/docs/v9.0.0/interfaces/index.html +++ b/docs/v9.0.0/interfaces/index.html @@ -5,13 +5,13 @@ Interfaces | Cumulus Documentation - +
    Version: v9.0.0

    Interfaces

    Cumulus has multiple interfaces that allow interaction with discrete components of the system, such as starting workflows via SNS/Kinesis/SQS, manually queueing workflow start messages, submitting SNS notifications for completed workflows, and the many operations allowed by the Cumulus API.

    The diagram below illustrates the workflow process in detail and the various interfaces that allow starting of workflows, reporting of workflow information, and database create operations that occur when a workflow reporting message is processed. For interfaces with expected input or output schemas, details are provided below.

Note: This diagram is current as of v1.18.0.

    Architecture diagram showing the interfaces for triggering and reporting of Cumulus workflow executions

    Workflow triggers and queuing

    Kinesis stream

    As a Kinesis stream is consumed by the messageConsumer Lambda to queue workflow executions, the incoming event is validated against this consumer schema by the ajv package.

    SQS queue for executions

    The messages put into the SQS queue for executions should conform to the Cumulus message format.

    Workflow executions

    See the documentation on Cumulus workflows.

    Workflow reporting

    SNS reporting topics

    For granule and PDR reporting, the topics will only receive data if the Cumulus workflow execution message meets the following criteria:

    • Granules - workflow message contains granule data in payload.granules
    • PDRs - workflow message contains PDR data in payload.pdr

    The messages published to the SNS reporting topics for executions and PDRs and the record property in the messages published to the granules SNS topic should conform to the model schema for each data type.

    Further detail on workflow reporting and how to interact with these interfaces can be found in the workflow notifications data cookbook.

    Cumulus API

    See the Cumulus API documentation.

    - + \ No newline at end of file diff --git a/docs/v9.0.0/operator-docs/about-operator-docs/index.html b/docs/v9.0.0/operator-docs/about-operator-docs/index.html index 0bd81fae929..d8576eac652 100644 --- a/docs/v9.0.0/operator-docs/about-operator-docs/index.html +++ b/docs/v9.0.0/operator-docs/about-operator-docs/index.html @@ -5,13 +5,13 @@ About Operator Docs | Cumulus Documentation - +
    Version: v9.0.0

    About Operator Docs

    Purpose

    Operator Docs are an augmentation to Cumulus documentation and Data Cookbooks. These documents will walk step-by-step through common Cumulus activities (that aren't necessarily as use-case directed as what you'd see in Data Cookbooks).

    What Is A Cumulus Operator

    Cumulus operators are those who work within Cumulus to ingest/archive data and manage collections. They may perform the following functions via the operator dashboard or API:

    • Configure providers and collections
    • Configure rules and monitor workflow executions
    • Monitor granule ingestion
    • Monitor system metrics
    - + \ No newline at end of file diff --git a/docs/v9.0.0/operator-docs/bulk-operations/index.html b/docs/v9.0.0/operator-docs/bulk-operations/index.html index 50f596bf68b..ce98537a3a1 100644 --- a/docs/v9.0.0/operator-docs/bulk-operations/index.html +++ b/docs/v9.0.0/operator-docs/bulk-operations/index.html @@ -5,14 +5,14 @@ Bulk Operations | Cumulus Documentation - +
    Version: v9.0.0

    Bulk Operations

    Cumulus implements bulk operations through the use of AsyncOperations, which are long-running processes executed on an AWS ECS cluster.

    Submitting a bulk API request

    Bulk operations are generally submitted via the endpoint for the relevant data type, e.g. granules. For a list of supported API requests, refer to the Cumulus API documentation. Bulk operations are denoted with the keyword 'bulk'.

    Starting bulk operations from the Cumulus dashboard

    Using a Kibana query

    Note: You must have configured your dashboard build with a KIBANAROOT environment variable in order for the Kibana link to render in the bulk granules modal

    1. From the Granules dashboard page, click on the "Run Bulk Granules" button, then select what type of action you would like to perform

      • Note: the rest of the process is the same regardless of what type of bulk action you perform
    2. From the bulk granules modal, click the "Open Kibana" link:

      Screenshot of Cumulus dashboard showing modal window for triggering bulk granule operations

    3. Once you have accessed Kibana, navigate to the "Discover" page. If this is your first time using Kibana, you may see a message like this at the top of the page:

      In order to visualize and explore data in Kibana, you'll need to create an index pattern to retrieve data from Elasticsearch.

      In that case, see the docs for creating an index pattern for Kibana

      Screenshot of Kibana user interface showing the &quot;Discover&quot; page for running queries

    4. Enter a query that returns the granule records that you want to use for bulk operations:

      Screenshot of Kibana user interface showing an example Kibana query and results

    5. Once the Kibana query is returning the results you want, click the "Inspect" link near the top of the page. A slide out tab with request details will appear on the right side of the page:

      Screenshot of Kibana user interface showing details of an example request

    6. In the slide out tab that appears on the right side of the page, click the "Request" link near the top and scroll down until you see the query property:

      Screenshot of Kibana user interface showing the Elasticsearch data request made for a given Kibana query

    7. Highlight and copy the query contents from Kibana. Go back to the Cumulus dashboard and paste the query contents from Kibana inside of the query property in the bulk granules request payload. It is expected that you should have a property of query nested inside of the existing query property:

      Screenshot of Cumulus dashboard showing modal window for triggering bulk granule operations with query information populated

8. Add values for the index and workflowName to the bulk granules request payload. The value for index will vary based on your Elasticsearch setup, but it is good to target an index specifically for granule data if possible (a combined example payload is sketched after these steps):

      Screenshot of Cumulus dashboard showing modal window for triggering bulk granule operations with query, index, and workflow information populated

    9. Click the "Run Bulk Operations" button. You should see a confirmation message, including an ID for the async operation that was started to handle your bulk action. You can track the status of this async operation on the Operations dashboard page, which can be visited by clicking the "Go To Operations" button:

      Screenshot of Cumulus dashboard showing confirmation message with async operation ID for bulk granules request
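
Putting steps 7 and 8 together, the bulk granules request payload might look like the following sketch; the index name, workflow name, and inner Elasticsearch query are placeholders, and the nested query object is whatever you copied from Kibana:

{
  "index": "cumulus-granule-index",
  "workflowName": "MyBulkWorkflow",
  "query": {
    "query": {
      "match": {
        "collectionId": "MOD09GQ___006"
      }
    }
  }
}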

    Creating an index pattern for Kibana

    1. Define the index pattern for the indices that your Kibana queries should use. A wildcard character, *, will match across multiple indices. Once you are satisfied with your index pattern, click the "Next step" button:

      Screenshot of Kibana user interface for defining an index pattern

    2. Choose whether to use a Time Filter for your data, which is not required. Then click the "Create index pattern" button:

      Screenshot of Kibana user interface for configuring the settings of an index pattern

    Status Tracking

    All bulk operations return an AsyncOperationId which can be submitted to the /asyncOperations endpoint.

    The /asyncOperations endpoint allows listing of AsyncOperation records as well as record retrieval for individual records, which will contain the status. The Cumulus API documentation shows sample requests for these actions.

    The Cumulus Dashboard also includes an Operations monitoring page, where operations and their status are visible:

    Screenshot of Cumulus Dashboard Operations Page showing 5 operations and their status, ID, description, type and creation timestamp

    - + \ No newline at end of file diff --git a/docs/v9.0.0/operator-docs/cmr-operations/index.html b/docs/v9.0.0/operator-docs/cmr-operations/index.html index d547d3b4453..57364a553f2 100644 --- a/docs/v9.0.0/operator-docs/cmr-operations/index.html +++ b/docs/v9.0.0/operator-docs/cmr-operations/index.html @@ -5,7 +5,7 @@ CMR Operations | Cumulus Documentation - + @@ -16,7 +16,7 @@ UpdateCmrAccessConstraints will update CMR metadata file contents on S3, and PostToCmr will push the updates to CMR. The rest of this section will assume you have created this workflow under the name UpdateCmrAccessConstraints.

Once created and deployed, the workflow is available in the Cumulus dashboard's Execute workflow selector. However, note that this request requires additional configuration: you must supply an access constraint integer value and an optional description to the UpdateCmrAccessConstraints workflow by clicking the Add Custom Workflow Meta option in the Execute popup, as shown below:

    Screenshot showing granule execute popup with &#39;updateCmrAccessConstraints&#39; selected and configuration values shown in a collapsible JSON field

    An example invocation of the API to perform this action is:

$ curl --request PUT https://example.com/granules/MOD11A1.A2017137.h19v16.006.2017138085750 \
  --header 'Authorization: Bearer ReplaceWithTheToken' \
  --header 'Content-Type: application/json' \
  --data '{
    "action": "applyWorkflow",
    "workflow": "updateCmrAccessConstraints",
    "meta": {
      "accessConstraints": {
        "value": 5,
        "description": "sample access constraint"
      }
    }
  }'

    Supported CMR metadata formats for the above operation are Echo10XML and UMMG-JSON, which will populate the RestrictionFlag and RestrictionComment fields in Echo10XML, or the AccessConstraints values in UMMG-JSON.

    Additional Operations

    At this time Cumulus does not, out of the box, support additional operations on CMR metadata. However, given the examples shown above, we recommend working with your integrators to develop additional workflows that perform any required operations.

    Bulk CMR operations

    In order to perform the above operations in bulk, Cumulus supports the use of ApplyWorkflow in an AsyncOperation. These are accessed via the Bulk Operation button on the dashboard, or the /granules/bulk endpoint on the Cumulus API.

More information on bulk operations is in the bulk operations operator doc.

    - + \ No newline at end of file diff --git a/docs/v9.0.0/operator-docs/create-rule-in-cumulus/index.html b/docs/v9.0.0/operator-docs/create-rule-in-cumulus/index.html index d2aa8d921ea..b0ab6a8c578 100644 --- a/docs/v9.0.0/operator-docs/create-rule-in-cumulus/index.html +++ b/docs/v9.0.0/operator-docs/create-rule-in-cumulus/index.html @@ -5,13 +5,13 @@ Create Rule In Cumulus | Cumulus Documentation - +
    Version: v9.0.0

    Create Rule In Cumulus

Once the above files are in place and the entries created in CMR and Cumulus, we are ready to begin ingesting data. Depending on the type of ingestion (FTP/Kinesis, etc.), the values below will change, but for the most part they are all similar. Rules tell Cumulus how to associate providers and collections, and when/how to start processing a workflow.

    Steps

    1. Go To Rules Page
    • Go to the Cumulus dashboard, click on Rules in the navigation.
    • Click Add Rule.

    Screenshot of Rules page

2. Complete Form
    • Fill out the template form.

    Screenshot of a Rules template for adding a new rule

    For more details regarding the field definitions and required information go to Data Cookbooks.

    Note: If the state field is left blank, it defaults to false.

    Examples

    • A rule form with completed required fields:

    Screenshot of a completed rule form

    • A successfully added Rule:

    Screenshot of created rule

    - + \ No newline at end of file diff --git a/docs/v9.0.0/operator-docs/discovery-filtering/index.html b/docs/v9.0.0/operator-docs/discovery-filtering/index.html index de1d37cec27..fc809064c89 100644 --- a/docs/v9.0.0/operator-docs/discovery-filtering/index.html +++ b/docs/v9.0.0/operator-docs/discovery-filtering/index.html @@ -5,7 +5,7 @@ Discovery Filtering | Cumulus Documentation - + @@ -24,7 +24,7 @@ directly list the provider_path. If the path contains regular expression components, this may fail.

    It is recommended that operators diagnose any failures by checking error logs and ensuring that permissions on the remote file system allow reading of the default directory and any subdirectories that match the filter.

    Supported protocols

    Currently support for this feature is limited to the following protocols:

    • ftp
    • sftp
    - + \ No newline at end of file diff --git a/docs/v9.0.0/operator-docs/granule-workflows/index.html b/docs/v9.0.0/operator-docs/granule-workflows/index.html index 0c18f435e03..85b7cfd5268 100644 --- a/docs/v9.0.0/operator-docs/granule-workflows/index.html +++ b/docs/v9.0.0/operator-docs/granule-workflows/index.html @@ -5,13 +5,13 @@ Granule Workflows | Cumulus Documentation - +
    Version: v9.0.0

    Granule Workflows

    Failed Granule

    Delete and Ingest

    1. Delete Granule

    Note: Granules published to CMR will need to be removed from CMR via the dashboard prior to deletion

2. Ingest Granule via Ingest Rule
• Re-triggering a one-time, Kinesis, SQS, or SNS rule, or a scheduled rule, will re-discover and reingest the deleted granule.

    Reingest

    1. Select Failed Granule
    • In the Cumulus dashboard, go to the Collections page.
    • Use search field to find the granule.
2. Re-ingest Granule
    • Go to the Collections page.
    • Click on Reingest and a modal will pop up for your confirmation.

    Screenshot of the Reingest modal workflow

    Delete and Ingest

    1. Bulk Delete Granules
    • Go to the Granules page.
    • Use the Bulk Delete button to bulk delete selected granules or select via a Kibana query

    Note: You can optionally force deletion from CMR

2. Ingest Granules via Ingest Rule
• Re-triggering one-time, Kinesis, SQS, or SNS rules, or scheduled rules, will re-discover and reingest the deleted granules.

    Multiple Failed Granules

    1. Select Failed Granules
    • In the Cumulus dashboard, go to the Collections page.
    • Click on Failed Granules.
    • Select multiple granules.

    Screenshot of selected multiple granules

2. Bulk Re-ingest Granules
    • Click on Reingest and a modal will pop up for your confirmation.

    Screenshot of Bulk Reingest modal workflow

    - + \ No newline at end of file diff --git a/docs/v9.0.0/operator-docs/kinesis-stream-for-ingest/index.html b/docs/v9.0.0/operator-docs/kinesis-stream-for-ingest/index.html index 6eb9f8544aa..cc4c0d47ef6 100644 --- a/docs/v9.0.0/operator-docs/kinesis-stream-for-ingest/index.html +++ b/docs/v9.0.0/operator-docs/kinesis-stream-for-ingest/index.html @@ -5,13 +5,13 @@ Setup Kinesis Stream & CNM Message | Cumulus Documentation - +
    Version: v9.0.0

    Setup Kinesis Stream & CNM Message

Note: Keep in mind that you should only have to set this up once per ingest stream. Kinesis pricing is based on the shard value and not on the amount of Kinesis usage.

    1. Create a Kinesis Stream

      • In your AWS console, go to the Kinesis service and click Create Data Stream.
      • Assign a name to the stream.
      • Apply a shard value of 1.
      • Click on Create Kinesis Stream.
• A status page with stream details will display. Once the status is Active, the stream is ready to use. Be sure to record the streamName and StreamARN for later use.

      Screenshot of AWS console page for creating a Kinesis stream

    2. Create a Rule

    3. Send a message

• Send a message that conforms to your schema, using Python or the command line, as sketched below.
  • The streamName and Collection must match the kinesisArn+collection defined in the rule you created in Step 2.
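
For illustration only, sending a test message from the command line might look like the sketch below; the stream name and the payload fields are placeholders, so match them to your own stream and to the schema your rule expects:

# Sketch only: replace the stream name and payload with values matching your rule and schema.
# --cli-binary-format is needed with AWS CLI v2 so --data is read as raw JSON.
aws kinesis put-record \
  --stream-name my-ingest-stream \
  --partition-key 1 \
  --cli-binary-format raw-in-base64-out \
  --data '{"collection": "MY_COLLECTION", "provider": "MY_PROVIDER", "identifier": "test-granule-0001"}'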
    - + \ No newline at end of file diff --git a/docs/v9.0.0/operator-docs/locating-access-logs/index.html b/docs/v9.0.0/operator-docs/locating-access-logs/index.html index d8740268660..c2b5749d583 100644 --- a/docs/v9.0.0/operator-docs/locating-access-logs/index.html +++ b/docs/v9.0.0/operator-docs/locating-access-logs/index.html @@ -5,13 +5,13 @@ Locating S3 Access Logs | Cumulus Documentation - +
    Version: v9.0.0

    Locating S3 Access Logs

    When enabling S3 Access Logs for EMS Reporting you configured a TargetBucket and TargetPrefix. Inside the TargetBucket at the TargetPrefix is where you will find the raw S3 access logs.

    In a standard deployment, this will be your stack's <internal bucket name> and a key prefix of <stack>/ems-distribution/s3-server-access-logs/
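
For example, listing the raw logs with the AWS CLI might look like this sketch, substituting your own values for the placeholders from the paragraph above:

# <internal bucket name> and <stack> are the placeholders described above; replace them before running.
aws s3 ls s3://<internal bucket name>/<stack>/ems-distribution/s3-server-access-logs/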

    - + \ No newline at end of file diff --git a/docs/v9.0.0/operator-docs/naming-executions/index.html b/docs/v9.0.0/operator-docs/naming-executions/index.html index 248c8d4d837..9298ce3ea07 100644 --- a/docs/v9.0.0/operator-docs/naming-executions/index.html +++ b/docs/v9.0.0/operator-docs/naming-executions/index.html @@ -5,7 +5,7 @@ Naming Executions | Cumulus Documentation - + @@ -21,7 +21,7 @@ QueuePdrs step.

    In the following excerpt, the QueueGranules config.executionNamePrefix property is set using the value configured in the workflow's meta.executionNamePrefix.

    Setting executionNamePrefix config for QueueGranules using rule.meta

    If you wanted to use a prefix of "my-prefix", you would create a rule with a meta property similar to this:

{
  "executionNamePrefix": "my-prefix"
}

    The value of meta.executionNamePrefix from the rule will be set as meta.executionNamePrefix in the workflow message.

    Then, the workflow could contain a "QueueGranules" step with the following state, which uses meta.executionNamePrefix from the message as the value for the executionNamePrefix config to the "QueueGranules" step:

{
  "QueueGranules": {
    "Parameters": {
      "cma": {
        "event.$": "$",
        "ReplaceConfig": {
          "FullMessage": true
        },
        "task_config": {
          "queueUrl": "${start_sf_queue_url}",
          "provider": "{$.meta.provider}",
          "internalBucket": "{$.meta.buckets.internal.name}",
          "stackName": "{$.meta.stack}",
          "granuleIngestWorkflow": "${ingest_granule_workflow_name}",
          "executionNamePrefix": "{$.meta.executionNamePrefix}"
        }
      }
    },
    "Type": "Task",
    "Resource": "${queue_granules_task_arn}",
    "Retry": [
      {
        "ErrorEquals": [
          "Lambda.ServiceException",
          "Lambda.AWSLambdaException",
          "Lambda.SdkClientException"
        ],
        "IntervalSeconds": 2,
        "MaxAttempts": 6,
        "BackoffRate": 2
      }
    ],
    "Catch": [
      {
        "ErrorEquals": [
          "States.ALL"
        ],
        "ResultPath": "$.exception",
        "Next": "WorkflowFailed"
      }
    ],
    "End": true
  },
}
    - + \ No newline at end of file diff --git a/docs/v9.0.0/operator-docs/ops-common-use-cases/index.html b/docs/v9.0.0/operator-docs/ops-common-use-cases/index.html index c07f077fd70..3e99a262394 100644 --- a/docs/v9.0.0/operator-docs/ops-common-use-cases/index.html +++ b/docs/v9.0.0/operator-docs/ops-common-use-cases/index.html @@ -5,13 +5,13 @@ Operator Common Use Cases | Cumulus Documentation - + - + \ No newline at end of file diff --git a/docs/v9.0.0/operator-docs/trigger-workflow/index.html b/docs/v9.0.0/operator-docs/trigger-workflow/index.html index a2ba11bfc80..33099502cdd 100644 --- a/docs/v9.0.0/operator-docs/trigger-workflow/index.html +++ b/docs/v9.0.0/operator-docs/trigger-workflow/index.html @@ -5,13 +5,13 @@ Trigger a Workflow Execution | Cumulus Documentation - +
    Version: v9.0.0

    Trigger a Workflow Execution

    To trigger a workflow, you need to create a rule. To trigger an ingest workflow, one that requires discovering and ingesting data, you will also need to configure the collection and provider and associate those to a rule.

    Trigger a HelloWorld Workflow

    To trigger a HelloWorld workflow that does not need to discover or archive data, you just need to create a rule.

    You can leave the provider and collection blank and do not need any additional metadata. If you create a onetime rule, the workflow execution will start momentarily and you can view its status on the Executions page.

    Trigger an Ingest Workflow

    To ingest data, you will need a provider and collection configured to tell your workflow where to discover data and where to archive the data respectively.

    Follow the instructions to create a provider and create a collection and configure their fields for your data ingest.

In the rule's additional metadata, you can specify a provider_path from which to retrieve the data from the provider.

    Example: Ingest data from S3

    Setup

    Assume there are 2 files to be ingested in an S3 bucket called discovery-bucket, located in the test-data folder:

    • GRANULE.A2017025.jpg
    • GRANULE.A2017025.hdf

    Archive buckets should already be created and mapped to public / private / protected in the Cumulus deployment.

    For example:

buckets = {
  private = {
    name = "discovery-bucket"
    type = "private"
  },
  protected = {
    name = "archive-protected"
    type = "protected"
  }
  public = {
    name = "archive-public"
    type = "public"
  }
}

    Create a provider

    Create a new provider. Set protocol to S3 and Host to discovery-bucket.

    Screenshot of adding a sample S3 provider
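
As a reference only, the equivalent provider record created through the API might look like the following sketch; the id is a placeholder, and the exact field names should be checked against the Cumulus provider schema:

{
  "id": "s3-discovery-provider",
  "protocol": "s3",
  "host": "discovery-bucket"
}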

    Create a collection

    Create a new collection. Configure the collection to extract the granule id from the filenames and configure where to store the granule files.

The configuration below will store hdf files in the protected bucket and jpg files in the public bucket. The bucket types correspond to the buckets variable shown in the Setup section above.

{
  "name": "test-collection",
  "version": "001",
  "granuleId": "^GRANULE\\.A[\\d]{7}$",
  "granuleIdExtraction": "(GRANULE\\..*)(\\.hdf|\\.jpg)",
  "reportToEms": false,
  "sampleFileName": "GRANULE.A2017025.hdf",
  "files": [
    {
      "bucket": "protected",
      "regex": "^GRANULE\\.A[\\d]{7}\\.hdf$",
      "sampleFileName": "GRANULE.A2017025.hdf"
    },
    {
      "bucket": "public",
      "regex": "^GRANULE\\.A[\\d]{7}\\.jpg$",
      "sampleFileName": "GRANULE.A2017025.jpg"
    }
  ]
}

    Create a rule

    Create a rule to trigger the workflow to discover your granule data and ingest your granule.

    Select the previously created provider and collection. See the Cumulus Discover Granules workflow for a workflow example of using Cumulus tasks to discover and queue data for ingest.

    In the rule meta, set the provider_path to test-data, so the test-data folder will be used to discover new granules.

    Screenshot of adding a Discover Granules rule
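
In other words, the rule's additional (meta) field would contain something like this sketch:

{
  "provider_path": "test-data"
}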

    A onetime rule will run your workflow on-demand and you can view it on the dashboard Executions page. The Cumulus Discover Granules workflow will trigger an ingest workflow and your ingested granules will be visible on the dashboard Granules page.

    - + \ No newline at end of file diff --git a/docs/v9.0.0/tasks/index.html b/docs/v9.0.0/tasks/index.html index 18ed8b8d423..9fca6daf67d 100644 --- a/docs/v9.0.0/tasks/index.html +++ b/docs/v9.0.0/tasks/index.html @@ -5,13 +5,13 @@ Cumulus Tasks | Cumulus Documentation - +
    Version: v9.0.0

    Cumulus Tasks

    A list of reusable Cumulus tasks. Add your own.

    Tasks

    @cumulus/add-missing-file-checksums

    Add checksums to files in S3 which don't have one


    @cumulus/discover-granules

    Discover Granules in FTP/HTTP/HTTPS/SFTP/S3 endpoints


    @cumulus/discover-pdrs

    Discover PDRs in FTP and HTTP endpoints


    @cumulus/files-to-granules

    Converts array-of-files input into a granules object by extracting granuleId from filename


    @cumulus/hello-world

    Example task


    @cumulus/hyrax-metadata-updates

    Update granule metadata with hooks to OPeNDAP URL


    @cumulus/lzards-backup

    Run LZARDS backup


    @cumulus/move-granules

    Move granule files from staging to final location


    @cumulus/parse-pdr

    Download and Parse a given PDR


    @cumulus/pdr-status-check

    Checks execution status of granules in a PDR


    @cumulus/post-to-cmr

    Post a given granule to CMR


    @cumulus/queue-granules

    Add discovered granules to the queue


    @cumulus/queue-pdrs

    Add discovered PDRs to a queue


    @cumulus/queue-workflow

    Add workflow to the queue


    @cumulus/sf-sqs-report

    Sends an incoming Cumulus message to SQS


    @cumulus/sync-granule

    Download a given granule


    @cumulus/test-processing

    Fake processing task used for integration tests


    @cumulus/update-cmr-access-constraints

    Updates CMR metadata to set access constraints


    Update CMR metadata files with correct online access urls and etags and transfer etag info to granules' CMR files

    - + \ No newline at end of file diff --git a/docs/v9.0.0/team/index.html b/docs/v9.0.0/team/index.html index 92071ca597b..1c4b9a139da 100644 --- a/docs/v9.0.0/team/index.html +++ b/docs/v9.0.0/team/index.html @@ -5,13 +5,13 @@ Cumulus Team | Cumulus Documentation - + - + \ No newline at end of file diff --git a/docs/v9.0.0/troubleshooting/index.html b/docs/v9.0.0/troubleshooting/index.html index c349e94c876..c290e05d8b9 100644 --- a/docs/v9.0.0/troubleshooting/index.html +++ b/docs/v9.0.0/troubleshooting/index.html @@ -5,14 +5,14 @@ How to Troubleshoot and Fix Issues | Cumulus Documentation - +
    Version: v9.0.0

    How to Troubleshoot and Fix Issues

    While Cumulus is a complex system, there is a focus on maintaining the integrity and availability of the system and data. Should you encounter errors or issues while using this system, this section will help troubleshoot and solve those issues.

    Backup and Restore

    Cumulus has backup and restore functionality built-in to protect Cumulus data and allow recovery of a Cumulus stack. This is currently limited to Cumulus data and not full S3 archive data. Backup and restore is not enabled by default and must be enabled and configured to take advantage of this feature.

    For more information, read the Backup and Restore documentation.

    Elasticsearch reindexing

    If you run into issues with your Elasticsearch index, a reindex operation is available via the Cumulus API. See the Reindexing Guide.

    Information on how to reindex Elasticsearch is in the Cumulus API documentation.

    Troubleshooting Workflows

    Workflows are state machines comprised of tasks and services and each component logs to CloudWatch. The CloudWatch logs for all steps in the execution are displayed in the Cumulus dashboard or you can find them by going to CloudWatch and navigating to the logs for that particular task.

    Workflow Errors

    Visual representations of executed workflows can be found in the Cumulus dashboard or the AWS Step Functions console for that particular execution.

    If a workflow errors, the error will be handled according to the error handling configuration. The task that fails will have the exception field populated in the output, giving information about the error. Further information can be found in the CloudWatch logs for the task.

    Graph of AWS Step Function execution showing a failing workflow

    Workflow Did Not Start

    Generally, first check your rule configuration. If that is satisfactory, the answer will likely be in the CloudWatch logs for the schedule SF or SF starter lambda functions. See the workflow triggers page for more information on how workflows start.

    For Kinesis and SNS rules specifically, if an error occurs during the message consumer process, the fallback consumer lambda will be called and if the message continues to error, a message will be placed on the dead letter queue. Check the dead letter queue for a failure message. Errors can be traced back to the CloudWatch logs for the message consumer and the fallback consumer. Additionally, check that the name and version match those configured in your rule, as rules are filtered by the notification's collection name and version before scheduling executions.

    More information on kinesis error handling is here.

    Operator API Errors

    All operator API calls are funneled through the ApiEndpoints lambda. Each API call is logged to the ApiEndpoints CloudWatch log for your deployment.

    Lambda Errors

    KMS Exception: AccessDeniedException

    KMS Exception: AccessDeniedExceptionKMS Message: The ciphertext refers to a customer master key that does not exist, does not exist in this region, or you are not allowed to access.

The above error was being thrown by a Cumulus lambda function invocation. The KMS key is the encryption key used to encrypt lambda environment variables. The root cause of this error is unknown, but it is speculated to be caused by deleting and recreating, with the same name, the IAM role the lambda uses.

    This error can be resolved by switching the lambda's execution role to a different one and then back through the Lambda management console. Unfortunately, this approach doesn't scale well.
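
    If clicking through the console for many functions is tedious, the same role swap can be scripted with the AWS CLI. This is a hedged sketch: the function name and role ARNs are placeholders, and you should confirm the temporary role has equivalent permissions before swapping.

    # Swap the function's execution role to a temporary role...
    aws lambda update-function-configuration \
        --function-name "<prefix>-SomeLambdaFunction" \
        --role "arn:aws:iam::123456789012:role/temporary-role"

    # ...then swap it back to the original role
    aws lambda update-function-configuration \
        --function-name "<prefix>-SomeLambdaFunction" \
        --role "arn:aws:iam::123456789012:role/original-role"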

    The other resolution (that scales but takes some time) that was found is as follows:

    1. Comment out all lambda definitions (and dependent resources) in your Terraform configuration.
    2. terraform apply to delete the lambdas.
    3. Un-comment the definitions.
    4. terraform apply to recreate the lambdas.

    If this problem occurs with Core Lambdas and you are using the terraform-aws-cumulus.zip file source distributed in our release, we recommend using the non-scaling approach, as the number of Lambdas we distribute is in the low teens and they are likely to be easier and faster to reconfigure one-by-one than by editing our configs.

    Error: Unable to import module 'index': Error

    This error is shown in the CloudWatch logs for a Lambda function.

    One possible cause is that the Lambda definition in the .tf file defining the lambda is not pointing to the correct packaged lambda source file. In order to resolve this issue, update the lambda definition to point directly to the packaged (e.g. .zip) lambda source file.

    resource "aws_lambda_function" "discover_granules_task" {
    function_name = "${var.prefix}-DiscoverGranules"
    filename = "${path.module}/../../tasks/discover-granules/dist/lambda.zip"
    handler = "index.handler"
    }

    If you are seeing this error when using the Lambda as a step in a Cumulus workflow, then inspect the output for this Lambda step in the AWS Step Function console. If you see the error Cannot find module 'node_modules/@cumulus/cumulus-message-adapter-js', then you need to ensure the lambda's packaged dependencies include cumulus-message-adapter-js.
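
    For Node.js Lambdas, that usually means declaring the adapter as a regular dependency so it gets bundled into the zip you deploy. A minimal sketch, assuming your own build produces the zip path shown (the path is illustrative, not prescribed):

    # Add the message adapter to the task's bundled dependencies
    npm install --save @cumulus/cumulus-message-adapter-js

    # Re-package the task so node_modules (including the adapter) ends up in the zip
    zip -r dist/lambda.zip index.js node_modules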

    - + \ No newline at end of file diff --git a/docs/v9.0.0/troubleshooting/reindex-elasticsearch/index.html b/docs/v9.0.0/troubleshooting/reindex-elasticsearch/index.html index ca945d1a5b4..d52e6c8824e 100644 --- a/docs/v9.0.0/troubleshooting/reindex-elasticsearch/index.html +++ b/docs/v9.0.0/troubleshooting/reindex-elasticsearch/index.html @@ -5,7 +5,7 @@ Reindexing Elasticsearch Guide | Cumulus Documentation - + @@ -14,7 +14,7 @@ current index, or the mappings for an index have been updated (they do not update automatically). Any reindexing that will be required when upgrading Cumulus will be in the Migration Steps section of the changelog.

    Switch to a new index and Reindex

    There are two operations needed: reindex and change-index to switch over to the new index. A Change Index/Reindex can be done in either order, but both have their trade-offs.

    If you decide to point Cumulus to a new (empty) index first (with a change index operation), and then Reindex the data to the new index, data ingested while reindexing will automatically be sent to the new index. As reindexing operations can take a while, not all the data will show up on the Cumulus Dashboard right away. The advantage is you do not have to turn off any ingest operations. This approach is recommended.

    If you decide to Reindex data to a new index first, and then point Cumulus to that new index, it is not guaranteed that data that is sent to the old index while reindexing will show up in the new index. If you prefer this way, it is recommended to turn off any ingest operations. This order will keep your dashboard data from seeing any interruption.

    Change Index

    This will point Cumulus to the index in Elasticsearch that will be used when retrieving data. Performing a change index operation to an index that does not exist yet will create the index for you. The change index operation can be found here.

    Reindex from the old index to the new index

    The reindex operation will take the data from one index and copy it into another index. The reindex operation can be found here.

    Reindex status

    Reindexing is a long-running operation. The reindex-status endpoint can be used to monitor the progress of the operation.

    Index from database

    If you want to just grab the data straight from the database you can perform an Index from Database Operation. After the data is indexed from the database, a Change Index operation will need to be performed to ensure Cumulus is pointing to the right index. It is strongly recommended to turn off workflow rules when performing this operation so any data ingested to the database is not lost.
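
    The operations above are all plain HTTP calls against the Cumulus API. The sketch below shows roughly what they look like with curl; the endpoint paths and body field names are written as assumptions from memory of the API, so confirm them against the Cumulus API documentation for your version. CUMULUS_API and TOKEN are placeholders for your archive API URL and an access token.

    # Kick off a reindex from the current index to a new one (body fields are assumptions)
    curl -X POST "$CUMULUS_API/elasticsearch/reindex" \
        -H "Authorization: Bearer $TOKEN" \
        -H "Content-Type: application/json" \
        -d '{"sourceIndex": "cumulus-2020-11-3", "destIndex": "cumulus-2021-3-4"}'

    # Check on the reindex
    curl -H "Authorization: Bearer $TOKEN" "$CUMULUS_API/elasticsearch/reindex-status"

    # Point Cumulus at the new index once the copy is complete
    curl -X POST "$CUMULUS_API/elasticsearch/change-index" \
        -H "Authorization: Bearer $TOKEN" \
        -H "Content-Type: application/json" \
        -d '{"currentIndex": "cumulus-2020-11-3", "newIndex": "cumulus-2021-3-4"}'

    # Or rebuild the index straight from the database
    curl -X POST "$CUMULUS_API/elasticsearch/index-from-database" \
        -H "Authorization: Bearer $TOKEN"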

    Validate reindex

    To validate the reindex, use the reindex-status endpoint. The doc count can be used to verify that the reindex was successful. In the below example the reindex from cumulus-2020-11-3 to cumulus-2021-3-4 was not fully successful as they show different doc counts.

    "indices": {
    "cumulus-2020-11-3": {
    "primaries": {
    "docs": {
    "count": 21096512,
    "deleted": 176895
    }
    },
    "total": {
    "docs": {
    "count": 21096512,
    "deleted": 176895
    }
    }
    },
    "cumulus-2021-3-4": {
    "primaries": {
    "docs": {
    "count": 715949,
    "deleted": 140191
    }
    },
    "total": {
    "docs": {
    "count": 715949,
    "deleted": 140191
    }
    }
    }
    }

    To further drill down into what is missing, log in to the Kibana instance (found in the Elasticsearch section of the AWS console) and run the following command replacing <index> with your index name.

    GET <index>/_search
    {
      "aggs": {
        "count_by_type": {
          "terms": {
            "field": "_type"
          }
        }
      },
      "size": 0
    }

    which will produce a result like

    "aggregations": {
    "count_by_type": {
    "doc_count_error_upper_bound": 0,
    "sum_other_doc_count": 0,
    "buckets": [
    {
    "key": "logs",
    "doc_count": 483955
    },
    {
    "key": "execution",
    "doc_count": 4966
    },
    {
    "key": "deletedgranule",
    "doc_count": 4715
    },
    {
    "key": "pdr",
    "doc_count": 1822
    },
    {
    "key": "granule",
    "doc_count": 740
    },
    {
    "key": "asyncOperation",
    "doc_count": 616
    },
    {
    "key": "provider",
    "doc_count": 108
    },
    {
    "key": "collection",
    "doc_count": 87
    },
    {
    "key": "reconciliationReport",
    "doc_count": 48
    },
    {
    "key": "rule",
    "doc_count": 7
    }
    ]
    }
    }

    Resuming a reindex

    If a reindex operation did not fully complete it can be resumed using the following command run from the Kibana instance.

    POST _reindex?wait_for_completion=false
    {
      "conflicts": "proceed",
      "source": {
        "index": "cumulus-2020-11-3"
      },
      "dest": {
        "index": "cumulus-2021-3-4",
        "op_type": "create"
      }
    }

    The Cumulus API reindex-status endpoint can be used to monitor completion of this operation.
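
    If you would rather watch the copy from the Elasticsearch side, the standard Elasticsearch tasks API reports running _reindex operations. This is generic Elasticsearch behavior rather than anything Cumulus-specific; ES_HOST is a placeholder for your domain endpoint, and the request must be made from somewhere your domain's access policy allows (or run the equivalent GET from the Kibana Dev Tools console).

    # List in-flight reindex tasks on the Elasticsearch domain
    curl -s "$ES_HOST/_tasks?detailed=true&actions=*reindex"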

    - + \ No newline at end of file diff --git a/docs/v9.0.0/troubleshooting/rerunning-workflow-executions/index.html b/docs/v9.0.0/troubleshooting/rerunning-workflow-executions/index.html index abf40a44542..6892ce2491f 100644 --- a/docs/v9.0.0/troubleshooting/rerunning-workflow-executions/index.html +++ b/docs/v9.0.0/troubleshooting/rerunning-workflow-executions/index.html @@ -5,13 +5,13 @@ Re-running workflow executions | Cumulus Documentation - +
    Version: v9.0.0

    Re-running workflow executions

    To re-run a Cumulus workflow execution from the AWS console:

    1. Visit the page for an individual workflow execution

    2. Click the "New execution" button at the top right of the screen

      Screenshot of the AWS console for a Step Function execution highlighting the &quot;New execution&quot; button at the top right of the screen

    3. In the "New execution" modal that appears, replace the cumulus_meta.execution_name value in the default input with the value of the new execution ID as seen in the screenshot below

      Screenshot of the AWS console showing the modal window for entering input when running a new Step Function execution

    4. Click the "Start execution" button

    - + \ No newline at end of file diff --git a/docs/v9.0.0/troubleshooting/troubleshooting-deployment/index.html b/docs/v9.0.0/troubleshooting/troubleshooting-deployment/index.html index 8c4fd6c05e5..e879352027d 100644 --- a/docs/v9.0.0/troubleshooting/troubleshooting-deployment/index.html +++ b/docs/v9.0.0/troubleshooting/troubleshooting-deployment/index.html @@ -5,7 +5,7 @@ Troubleshooting Deployment | Cumulus Documentation - + @@ -16,7 +16,7 @@ data-persistence modules, but your config is only creating one Elasticsearch instance. To fix the issue, update the elasticsearch_config variable for your data-persistence module to increase the number of instances:

    {
      domain_name    = "es"
      instance_count = 2
      instance_type  = "t2.small.elasticsearch"
      version        = "5.3"
      volume_size    = 10
    }

    Install dashboard

    Dashboard configuration

    Issues:

    • Problem clearing the cache: EACCES: permission denied, rmdir '/tmp/gulp-cache/default'", this probably means the files at that location, and/or the folder, are owned by someone else (or some other factor prevents you from writing there).

    It's possible to work around this by editing the file cumulus-dashboard/node_modules/gulp-cache/index.js and altering the value of the line var fileCache = new Cache({cacheDirName: 'gulp-cache'}); to something like var fileCache = new Cache({cacheDirName: '<prefix>-cache'});. Now gulp-cache will be able to write to /tmp/<prefix>-cache/default, and the error should resolve.
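
    If you script your dashboard builds, that one-line edit can be applied non-interactively, for example with sed (GNU sed shown; the file path comes from the text above, the prefix value is yours to choose):

    # Rewrite the gulp-cache directory name so it lands somewhere this user can write
    sed -i "s/cacheDirName: 'gulp-cache'/cacheDirName: '<prefix>-cache'/" \
        cumulus-dashboard/node_modules/gulp-cache/index.js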

    Dashboard deployment

    Issues:

    • If the dashboard sends you to an Earthdata Login page that shows the error "Invalid request, please verify the client status or redirect_uri before resubmitting", it means one of the following: you've forgotten to update one or more of your EARTHDATA_CLIENT_ID and EARTHDATA_CLIENT_PASSWORD environment variables (from your app/.env file) and re-deploy Cumulus, you've placed incorrect values in them, or you've forgotten to add both the "redirect" and "token" URLs to the Earthdata Application.
    • There is odd caching behavior associated with the dashboard and Earthdata Login at this point in time that can cause the above error to reappear on the Earthdata Login page loaded by the dashboard even after fixing the cause of the error. If you experience this, attempt to access the dashboard in a new browser window, and it should work.
    - + \ No newline at end of file diff --git a/docs/v9.0.0/upgrade-notes/migrate_tea_standalone/index.html b/docs/v9.0.0/upgrade-notes/migrate_tea_standalone/index.html index 196c9bf4de3..e7a8bd62f2c 100644 --- a/docs/v9.0.0/upgrade-notes/migrate_tea_standalone/index.html +++ b/docs/v9.0.0/upgrade-notes/migrate_tea_standalone/index.html @@ -5,13 +5,13 @@ Migrate TEA deployment to standalone module | Cumulus Documentation - +
    Version: v9.0.0

    Migrate TEA deployment to standalone module

    Background

    This document is only relevant for upgrades of Cumulus from versions < 3.x.x to versions > 3.x.x

    Previous versions of Cumulus included deployment of the Thin Egress App (TEA) by default in the distribution module. As a result, Cumulus users who wanted to deploy a new version of TEA had to wait for a new release of Cumulus that incorporated that version.

    In order to give Cumulus users the flexibility to deploy newer versions of TEA whenever they want, deployment of TEA has been removed from the distribution module and Cumulus users must now add the TEA module to their deployment. Guidance on integrating the TEA module into your deployment is provided, or you can refer to the Cumulus core example deployment code for the thin_egress_app module.

    By default, when upgrading Cumulus and moving from TEA deployed via the distribution module to TEA deployed as a separate module, your API gateway for TEA would be destroyed and re-created, which could cause outages for any CloudFront endpoints pointing at that API gateway.

    These instructions outline how to modify your state to preserve your existing Thin Egress App (TEA) API gateway when upgrading Cumulus and moving deployment of TEA to a standalone module. If you do not care about preserving your API gateway for TEA when upgrading your Cumulus deployment, you can skip these instructions.

    Prerequisites

    Notes about state management

    These instructions will involve manipulating your Terraform state via terraform state mv commands. These operations are extremely dangerous, since a mistake in editing your Terraform state can leave your stack in a corrupted state where deployment may be impossible or may result in unanticipated resource deletion.

    Since bucket versioning preserves a separate version of your state file each time it is written, and the Terraform state modification commands overwrite the state file, we can mitigate the risk of these operations by downloading the most recent state file before starting the upgrade process. Then, if anything goes wrong during the upgrade, we can restore that previous state version. Guidance on how to perform both operations is provided below.

    Download your most recent state version

    Run this command to download the most recent cumulus deployment state file, replacing BUCKET and KEY with the correct values from cumulus-tf/terraform.tf:

     aws s3 cp s3://BUCKET/KEY /path/to/terraform.tfstate
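
    If you skipped this step, or later need to go back further than the most recent version, the versioned state bucket still retains earlier copies that you can list and fetch. These are standard AWS CLI commands; BUCKET, KEY, and the version id are placeholders.

    # List the saved versions of the state file
    aws s3api list-object-versions --bucket BUCKET --prefix KEY

    # Download a specific older version by its VersionId
    aws s3api get-object --bucket BUCKET --key KEY \
        --version-id "<version-id>" /path/to/terraform.tfstate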

    Restore a previous state version

    Upload the state file that was previously downloaded to the bucket/key for your state file, replacing BUCKET and KEY with the correct values from cumulus-tf/terraform.tf:

     aws s3 cp /path/to/terraform.tfstate s3://BUCKET/KEY

    Then run terraform plan, which will give an error because we manually overwrote the state file and it is now out of sync with the lock table Terraform uses to track your state file:

    Error: Error loading state: state data in S3 does not have the expected content.

    This may be caused by unusually long delays in S3 processing a previous state
    update. Please wait for a minute or two and try again. If this problem
    persists, and neither S3 nor DynamoDB are experiencing an outage, you may need
    to manually verify the remote state and update the Digest value stored in the
    DynamoDB table to the following value: <some-digest-value>

    To resolve this error, run this command and replace DYNAMO_LOCK_TABLE, BUCKET and KEY with the correct values from cumulus-tf/terraform.tf, and use the digest value from the previous error output:

     aws dynamodb put-item \
        --table-name DYNAMO_LOCK_TABLE \
        --item '{
            "LockID": {"S": "BUCKET/KEY-md5"},
            "Digest": {"S": "some-digest-value"}
        }'

    Now, if you re-run terraform plan, it should work as expected.

    Migration instructions

    Please note: These instructions assume that you are deploying the thin_egress_app module as shown in the Cumulus core example deployment code

    1. Ensure that you have downloaded the latest version of your state file for your cumulus deployment

    2. Find the URL for your <prefix>-thin-egress-app-EgressGateway API gateway (one way to look it up is sketched after these instructions). Confirm that you can access it in the browser and that it is functional.

    3. Run terraform plan. You should see output like (edited for readability):

      # module.thin_egress_app.aws_cloudformation_stack.thin_egress_app will be created
      + resource "aws_cloudformation_stack" "thin_egress_app" {

      # module.thin_egress_app.aws_s3_bucket.lambda_source will be created
      + resource "aws_s3_bucket" "lambda_source" {

      # module.thin_egress_app.aws_s3_bucket_object.cloudformation_template will be created
      + resource "aws_s3_bucket_object" "cloudformation_template" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive will be created
      + resource "aws_s3_bucket_object" "lambda_code_dependency_archive" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_source will be created
      + resource "aws_s3_bucket_object" "lambda_source" {

      # module.thin_egress_app.aws_security_group.egress_lambda[0] will be created
      + resource "aws_security_group" "egress_lambda" {

      ...

      # module.cumulus.module.distribution.module.thin_egress_app.aws_cloudformation_stack.thin_egress_app will be destroyed
      - resource "aws_cloudformation_stack" "thin_egress_app" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket.lambda_source will be destroyed
      - resource "aws_s3_bucket" "lambda_source" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.cloudformation_template will be destroyed
      - resource "aws_s3_bucket_object" "cloudformation_template" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive will be destroyed
      - resource "aws_s3_bucket_object" "lambda_code_dependency_archive" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_source will be destroyed
      - resource "aws_s3_bucket_object" "lambda_source" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_security_group.egress_lambda[0] will be destroyed
      - resource "aws_security_group" "egress_lambda" {
    4. Run the state modification commands. The commands must be run in exactly this order:

       # Move security group
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_security_group.egress_lambda module.thin_egress_app.aws_security_group.egress_lambda

      # Move TEA storage bucket
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket.lambda_source module.thin_egress_app.aws_s3_bucket.lambda_source

      # Move TEA lambda source code
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_source module.thin_egress_app.aws_s3_bucket_object.lambda_source

      # Move TEA lambda dependency code
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive

      # Move TEA Cloudformation template
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.cloudformation_template module.thin_egress_app.aws_s3_bucket_object.cloudformation_template

      # Move URS creds secret version
      terraform state mv module.cumulus.module.distribution.aws_secretsmanager_secret_version.thin_egress_urs_creds aws_secretsmanager_secret_version.thin_egress_urs_creds

      # Move URS creds secret
      terraform state mv module.cumulus.module.distribution.aws_secretsmanager_secret.thin_egress_urs_creds aws_secretsmanager_secret.thin_egress_urs_creds

      # Move TEA Cloudformation stack
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_cloudformation_stack.thin_egress_app module.thin_egress_app.aws_cloudformation_stack.thin_egress_app

      Depending on how you were supplying a bucket map to TEA, there may be an additional step. If you were specifying the bucket_map_key variable to the cumulus module to use a custom bucket map, then you can ignore this step and just ensure that the bucket_map_file variable to the TEA module uses that same S3 key. Otherwise, if you were letting Cumulus generate a bucket map for you, then you need to take this step to migrate that bucket map:

      # Move bucket map
      terraform state mv module.cumulus.module.distribution.aws_s3_bucket_object.bucket_map_yaml[0] aws_s3_bucket_object.bucket_map_yaml
    5. Run terraform plan again. You may still see a few additions/modifications pending like below, but you should not see any deletion of Thin Egress App resources pending:

      # module.thin_egress_app.aws_cloudformation_stack.thin_egress_app will be updated in-place
      ~ resource "aws_cloudformation_stack" "thin_egress_app" {

      # module.thin_egress_app.aws_s3_bucket_object.cloudformation_template will be updated in-place
      ~ resource "aws_s3_bucket_object" "cloudformation_template" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive will be updated in-place
      ~ resource "aws_s3_bucket_object" "lambda_code_dependency_archive" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_source will be updated in-place
      ~ resource "aws_s3_bucket_object" "lambda_source" {

      If you still see deletion of module.cumulus.module.distribution.module.thin_egress_app.aws_cloudformation_stack.thin_egress_app pending, then something went wrong and you should restore the previously downloaded state file version and start over from step 1. Otherwise, proceed to step 6.

    6. Once you have confirmed that everything looks as expected, run terraform apply.

    7. Visit the same API gateway from step 1 and confirm that it still works.

    Your TEA deployment has now been migrated to a standalone module, which gives you the ability to upgrade the deployed version of TEA independently of Cumulus releases.
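
    For step 2 above, if you are unsure where to find the API gateway URL, something like the following can help locate it. The name filter and URL pattern are assumptions based on the <prefix>-thin-egress-app-EgressGateway naming and the standard execute-api URL format; the stage name in particular may differ in your deployment.

    # Find the REST API id for the TEA egress gateway
    aws apigateway get-rest-apis \
        --query "items[?contains(name, 'EgressGateway')].[name,id]" \
        --output text

    # The invoke URL then follows the standard pattern (stage name is an assumption):
    # https://<rest-api-id>.execute-api.<region>.amazonaws.com/<stage>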

    - + \ No newline at end of file diff --git a/docs/v9.0.0/upgrade-notes/upgrade-rds/index.html b/docs/v9.0.0/upgrade-notes/upgrade-rds/index.html index 23a21e87854..f9a5d702cf8 100644 --- a/docs/v9.0.0/upgrade-notes/upgrade-rds/index.html +++ b/docs/v9.0.0/upgrade-notes/upgrade-rds/index.html @@ -5,7 +5,7 @@ Upgrade to RDS release | Cumulus Documentation - + @@ -21,7 +21,7 @@ | cutoffSeconds | number | Number of seconds prior to this execution to 'cutoff' reconciliation queries. This allows in-progress/other in-flight operations time to complete and propagate to Elasticsearch/Dynamo/postgres. | 3600 | | dbConcurrency | number | Sets max number of parallel collections reports the script will run at a time. | 20 | | dbMaxPool | number | Sets the maximum number of connections the database pool has available. Modifying this may result in unexpected failures. | 20 |

    - + \ No newline at end of file diff --git a/docs/v9.0.0/upgrade-notes/upgrade_tf_version_0.13.6/index.html b/docs/v9.0.0/upgrade-notes/upgrade_tf_version_0.13.6/index.html index 04433841675..608cfcbbf12 100644 --- a/docs/v9.0.0/upgrade-notes/upgrade_tf_version_0.13.6/index.html +++ b/docs/v9.0.0/upgrade-notes/upgrade_tf_version_0.13.6/index.html @@ -5,13 +5,13 @@ Upgrade to TF version 0.13.6 | Cumulus Documentation - +
    Version: v9.0.0

    Upgrade to TF version 0.13.6

    Background

    Cumulus pins its support to a specific version of Terraform; see the deployment documentation. The reason for only supporting one specific Terraform version at a time is to avoid deployment errors that can be caused by deploying to the same target with different Terraform versions.

    Cumulus is upgrading its supported version of Terraform from 0.12.12 to 0.13.6. This document contains instructions on how to perform the upgrade for your deployments.

    Prerequisites

    • Follow the Terraform guidance for what to do before upgrading, notably ensuring that you have no pending changes to your Cumulus deployments before proceeding.
      • You should do a terraform plan to see if you have any pending changes for your deployment (for both the data-persistence-tf and cumulus-tf modules), and if so, run a terraform apply before doing the upgrade to Terraform 0.13.6
    • Review the Terraform v0.13 release notes to prepare for any breaking changes that may affect your custom deployment code. Cumulus' deployment code has already been updated for compatibility with version 0.13.
    • Install Terraform version 0.13.6. We recommend using Terraform Version Manager tfenv to manage your installed versions of Terraform, but this is not required.

    Upgrade your deployment code

    Terraform 0.13 does not support some of the syntax from previous Terraform versions, so you need to upgrade your deployment code for compatibility.

    Terraform provides a 0.13upgrade command as part of version 0.13 to handle automatically upgrading your code. Make sure to check out the documentation on batch usage of 0.13upgrade, which will allow you to upgrade all of your Terraform code with one command.

    Run the 0.13upgrade command until you have no more necessary updates to your deployment code.
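
    As a rough sketch of what the batch form can look like (adapted from the Terraform upgrade guidance; double-check the flags against the documentation linked above before running it, and commit or back up your code first):

    # Run the upgrade tool in every directory that contains .tf files,
    # answering the confirmation prompt automatically with -yes
    find . -name '*.tf' -not -path '*/.terraform/*' \
        | xargs -n1 dirname \
        | sort -u \
        | xargs -n1 terraform 0.13upgrade -yes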

    Upgrade your deployment

    1. Ensure that you are running Terraform 0.13.6 by running terraform --version. If you are using tfenv, you can switch versions by running tfenv use 0.13.6.

    2. For the data-persistence-tf and cumulus-tf directories, take the following steps:

      1. Run terraform init --reconfigure. The --reconfigure flag is required, otherwise you might see an error like:

        Error: Failed to decode current backend config

        The backend configuration created by the most recent run of "terraform init"
        could not be decoded: unsupported attribute "lock_table". The configuration
        may have been initialized by an earlier version that used an incompatible
        configuration structure. Run "terraform init -reconfigure" to force
        re-initialization of the backend.
      2. Run terraform apply to perform a deployment.

        WARNING: Even if Terraform says that no resource changes are pending, running the apply using Terraform version 0.13.6 will modify your backend state from version 0.12.12 to version 0.13.6 without requiring approval. Updating the backend state is a necessary part of the version 0.13.6 upgrade, but it is not completely transparent.

    - + \ No newline at end of file diff --git a/docs/v9.0.0/workflow_tasks/discover_granules/index.html b/docs/v9.0.0/workflow_tasks/discover_granules/index.html index d3ff1965bc1..9b1af3e0fea 100644 --- a/docs/v9.0.0/workflow_tasks/discover_granules/index.html +++ b/docs/v9.0.0/workflow_tasks/discover_granules/index.html @@ -5,7 +5,7 @@ Discover Granules | Cumulus Documentation - + @@ -21,7 +21,7 @@ included in a granule's file list. That is, no such filtering based on filename occurs as described above.

    When set on the task configuration, the value applies to all collections during discovery. Otherwise, this property may be set on individual collections.

    Task Outputs

    This task outputs an assembled array of Cumulus granule objects as the payload for the next task, and returns only the expected payload for the next task.

    - + \ No newline at end of file diff --git a/docs/v9.0.0/workflow_tasks/files_to_granules/index.html b/docs/v9.0.0/workflow_tasks/files_to_granules/index.html index 8867eeef5a1..d9ffd09032f 100644 --- a/docs/v9.0.0/workflow_tasks/files_to_granules/index.html +++ b/docs/v9.0.0/workflow_tasks/files_to_granules/index.html @@ -5,13 +5,13 @@ Files To Granules | Cumulus Documentation - +
    Version: v9.0.0

    Files To Granules

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions and more can be found on the auto-generated Cumulus Tasks page.

    Summary

    This task utilizes the incoming config.inputGranules and the task input list of s3 URIs along with the rest of the configuration objects to take the list of incoming files and sort them into a list of granule objects.

    Please note: Files passed in without metadata previously defined for config.inputGranules will have the following keys added:

    • name
    • bucket
    • filename
    • fileStagingDir

    It is primarily intended to support compatibility with the standard output of a processing task, and convert that output into a granule object accepted as input by the majority of other Cumulus tasks.

    Task Inputs

    Input

    This task expects an incoming input that contains an array of 'staged' S3 URIs to move to their final archive location.

    For the specifics, see the Cumulus Tasks page entry for the schema.

    Configuration

    This task does expect values to be set in the workflow_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Below are expanded descriptions of selected config keys:

    inputGranules

    An array of Cumulus granule objects.

    This object will be used to define metadata values for the move granules task, and is the basis for the updated object that will be added to the output.

    Task Outputs

    This task outputs an assembled array of Cumulus granule objects as the payload for the next task, and returns only the expected payload for the next task.

    - + \ No newline at end of file diff --git a/docs/v9.0.0/workflow_tasks/move_granules/index.html b/docs/v9.0.0/workflow_tasks/move_granules/index.html index a934a730239..7a63792a8f7 100644 --- a/docs/v9.0.0/workflow_tasks/move_granules/index.html +++ b/docs/v9.0.0/workflow_tasks/move_granules/index.html @@ -5,13 +5,13 @@ Move Granules | Cumulus Documentation - +
    Version: v9.0.0

    Move Granules

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions and more can be found on the auto-generated Cumulus Tasks page.

    Summary

    This task utilizes the incoming event.input array of Cumulus granule objects to do the following:

    • Move granules from their 'staging' location to the final location (as configured in the Sync Granules task)

    • Update the event.input object with the new file locations.

    • If the granule has an ECHO10/UMM CMR file (.cmr.xml or .cmr.json) included in the event.input:

      • Update that file's access locations

      • Add it to the appropriate access URL category for the CMR filetype as defined by granule CNM filetype.

      • Set the CMR file to 'metadata' in the output granules object and add it to the granule files if it's not already present.

        Please note: Granules without a valid CNM type set in the granule file type field in event.input will be treated as "data" in the updated CMR metadata file

    • Task then outputs an updated list of granule objects.

    Task Inputs

    Input

    This task expects an incoming input that contains a list of 'staged' S3 URIs to move to their final archive location. If CMR metadata is to be updated for a granule, it must also be included in the input.

    For the specifics, see the Cumulus Tasks page entry for the schema.

    Configuration

    This task does expect values to be set in the workflow_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Input

    This task expects event.input to provide an array of Cumulus granule objects. The files listed for each granule represent the files to be acted upon as described in summary.

    Task Outputs

    This task outputs an assembled array of Cumulus granule objects with post-move file locations as the payload for the next task, and returns only the expected payload for the next task. If a CMR file has been specified for a granule object, the CMR resources related to the granule files will be updated according to the updated granule file metadata.

    Examples

    See the SIPS workflow cookbook for an example of this task in a workflow

    - + \ No newline at end of file diff --git a/docs/v9.0.0/workflow_tasks/parse_pdr/index.html b/docs/v9.0.0/workflow_tasks/parse_pdr/index.html index fc2911f47f6..db4fdc58c98 100644 --- a/docs/v9.0.0/workflow_tasks/parse_pdr/index.html +++ b/docs/v9.0.0/workflow_tasks/parse_pdr/index.html @@ -5,13 +5,13 @@ Parse PDR | Cumulus Documentation - +
    Version: v9.0.0

    Parse PDR

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions and more can be found on the auto-generated Cumulus Tasks page.

    Summary

    The purpose of this task is to do the following with the incoming PDR object:

    • Stage it to an internal S3 bucket

    • Parse the PDR

    • Archive the PDR and remove the staged file if successful

    • Outputs a payload object containing metadata about the parsed PDR (e.g. total size of all files, file counts, etc.) and a granules object

    The constructed granules object is created using PDR metadata to determine values like data type and version, and collection definitions to determine a file storage location based on the extracted data type and version number.

    Granule file types are converted from the PDR spec types to CNM types according to the following translation table:

    HDF: 'data',
    HDF-EOS: 'data',
    SCIENCE: 'data',
    BROWSE: 'browse',
    METADATA: 'metadata',
    BROWSE_METADATA: 'metadata',
    QA_METADATA: 'metadata',
    PRODHIST: 'qa',
    QA: 'metadata',
    TGZ: 'data',
    LINKAGE: 'data'

    Files missing file types will have none assigned; files with invalid types will result in a PDR parse failure.

    Task Inputs

    Input

    This task expects an incoming input that contains name and path information about the PDR to be parsed. For the specifics, see the Cumulus Tasks page entry for the schema.

    Configuration

    This task does expect values to be set in the workflow_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Below are expanded descriptions of selected config keys:

    Provider

    A Cumulus provider object. Used to define connection information for retrieving the PDR.

    Bucket

    Defines the bucket where the 'pdrs' folder for parsed PDRs will be stored.

    Collection

    A Cumulus collection object. Used to define granule file groupings and granule metadata for discovered files.

    Task Outputs

    This task outputs a single payload output object containing metadata about the parsed PDR (e.g. filesCount, totalSize, etc.), a pdr object with information for later steps, and the generated array of granule objects.

    Examples

    See the SIPS workflow cookbook for an example of this task in a workflow

    - + \ No newline at end of file diff --git a/docs/v9.0.0/workflows/cumulus-task-message-flow/index.html b/docs/v9.0.0/workflows/cumulus-task-message-flow/index.html index cb3d0cbfc9e..413a5de92df 100644 --- a/docs/v9.0.0/workflows/cumulus-task-message-flow/index.html +++ b/docs/v9.0.0/workflows/cumulus-task-message-flow/index.html @@ -5,14 +5,14 @@ Cumulus Tasks: Message Flow | Cumulus Documentation - +
    Version: v9.0.0

    Cumulus Tasks: Message Flow

    Cumulus Tasks comprise Cumulus Workflows and are either AWS Lambda tasks or AWS Elastic Container Service (ECS) activities. Cumulus Tasks permit a payload as input to the main task application code. The task payload is additionally wrapped by the Cumulus Message Adapter. The Cumulus Message Adapter supplies additional information supporting message templating and metadata management of these workflows.

    Diagram showing how incoming and outgoing Cumulus messages for workflow steps are handled by the Cumulus Message Adapter

    The steps in this flow are detailed in sections below.

    Cumulus Message Format

    A full Cumulus Message has the following keys:

    • cumulus_meta: System runtime information that should generally not be touched outside of Cumulus library code or the Cumulus Message Adapter. Stores meta information about the workflow such as the state machine name and the current workflow execution's name. This information is used to look up the current active task. The name of the current active task is used to look up the corresponding task's config in task_config.
    • meta: Runtime information captured by the workflow operators. Stores execution-agnostic variables.
    • payload: Payload is runtime information for the tasks.

    In addition to the above keys, it may contain the following keys:

    • replace: A key generated in conjunction with the Cumulus Message adapter. It contains the location on S3 for a message payload and a Target JSON path in the message to extract it to.
    • exception: A key used to track workflow exceptions, should not be modified outside of Cumulus library code.

    Here's a simple example of a Cumulus Message:

    {
      "task_config": {
        "inlinestr": "prefix{meta.foo}suffix",
        "array": "{[$.meta.foo]}",
        "object": "{$.meta}"
      },
      "cumulus_meta": {
        "message_source": "sfn",
        "state_machine": "arn:aws:states:us-east-1:1234:stateMachine:MySfn",
        "execution_name": "MyExecution__id-1234",
        "id": "id-1234"
      },
      "meta": {
        "foo": "bar"
      },
      "payload": {
        "anykey": "anyvalue"
      }
    }

    A message utilizing the Cumulus Remote message functionality must have at least the keys replace and cumulus_meta. Depending on configuration other portions of the message may be present, however the cumulus_meta, meta, and payload keys must be present once extraction is complete.

    {
      "replace": {
        "Bucket": "cumulus-bucket",
        "Key": "my-large-event.json",
        "TargetPath": "$"
      },
      "cumulus_meta": {}
    }

    Cumulus Message Preparation

    The event coming into a Cumulus Task is assumed to be a Cumulus Message and should first be handled by the functions described below before being passed to the task application code.

    Preparation Step 1: Fetch remote event

    Fetch remote event will fetch the full event from S3 if the cumulus message includes a replace key.

    Once "my-large-event.json" is fetched from S3, it's returned from the fetch remote event function. If no "replace" key is present, the event passed to the fetch remote event function is assumed to be a complete Cumulus Message and returned as-is.

    Preparation Step 2: Parse step function config from CMA configuration parameters

    This step determines what current task is being executed. Note this is different from what lambda or activity is being executed, because the same lambda or activity can be used for different tasks. The current task name is used to load the appropriate configuration from the Cumulus Message's 'task_config' configuration parameter.

    Preparation Step 3: Load nested event

    Using the config returned from the previous step, load nested event resolves templates for the final config and input to send to the task's application code.

    Task Application Code

    After message prep, the message passed to the task application code is of the form:

    {
      "input": {},
      "config": {}
    }

    Create Next Message functions

    Whatever comes out of the task application code is used to construct an outgoing Cumulus Message.

    Create Next Message Step 1: Assign outputs

    The config loaded from the Fetch step function config step may have a cumulus_message key. This can be used to "dispatch" fields from the task's application output to a destination in the final event output (via URL templating). Here's an example where the value of input.anykey would be dispatched as the value of payload.out in the final cumulus message:

    {
      "task_config": {
        "bar": "baz",
        "cumulus_message": {
          "input": "{$.payload.input}",
          "outputs": [
            {
              "source": "{$.input.anykey}",
              "destination": "{$.payload.out}"
            }
          ]
        }
      },
      "cumulus_meta": {
        "task": "Example",
        "message_source": "local",
        "id": "id-1234"
      },
      "meta": {
        "foo": "bar"
      },
      "payload": {
        "input": {
          "anykey": "anyvalue"
        }
      }
    }

    Create Next Message Step 2: Store remote event

    If the ReplaceConfiguration parameter is set, the configured key's value will be stored in S3 and the final output of the task will include a replace key that contains configuration for a future step to extract the payload on S3 back into the Cumulus Message. The replace key identifies where the large event node has been stored in S3.

    - + \ No newline at end of file diff --git a/docs/v9.0.0/workflows/developing-a-cumulus-workflow/index.html b/docs/v9.0.0/workflows/developing-a-cumulus-workflow/index.html index d616ecc2ef5..451120dd377 100644 --- a/docs/v9.0.0/workflows/developing-a-cumulus-workflow/index.html +++ b/docs/v9.0.0/workflows/developing-a-cumulus-workflow/index.html @@ -5,13 +5,13 @@ Creating a Cumulus Workflow | Cumulus Documentation - +
    Version: v9.0.0

    Creating a Cumulus Workflow

    The Cumulus workflow module

    To facilitate adding workflows to your deployment, Cumulus provides a workflow module.

    In combination with the Cumulus message, the workflow module provides a way to easily turn a Step Function definition into a Cumulus workflow, complete with:

    Using the module also ensures that your workflows will continue to be compatible with future versions of Cumulus.

    For more on the full set of current available options for the module, please consult the module README.

    Adding a new Cumulus workflow to your deployment

    To add a new Cumulus workflow to your deployment that is using the cumulus module, add a new workflow resource to your deployment directory, either in a new .tf file, or to an existing file.

    The workflow should follow a syntax similar to:

    module "my_workflow" {
    source = "https://github.com/nasa/cumulus/releases/download/vx.x.x/terraform-aws-cumulus-workflow.zip"

    prefix = "my-prefix"
    name = "MyWorkflowName"
    system_bucket = "my-internal-bucket"

    workflow_config = module.cumulus.workflow_config

    tags = { Deployment = var.prefix }

    state_machine_definition = <<JSON
    {}
    JSON
    }

    In the above example, you would add your state_machine_definition using the Amazon States Language, using tasks you've developed and Cumulus core tasks that are made available as part of the cumulus terraform module.

    Please note: Cumulus follows the convention of tagging resources with the prefix variable { Deployment = var.prefix } that you pass to the cumulus module. For resources defined outside of Core, it's recommended that you adopt this convention as it makes resources and/or deployment recovery scenarios much easier to manage.

    Examples

    For a functional example of a basic workflow, please take a look at the hello_world_workflow.

    For more complete/advanced examples, please read the following cookbook entries/topics:

    - + \ No newline at end of file diff --git a/docs/v9.0.0/workflows/developing-workflow-tasks/index.html b/docs/v9.0.0/workflows/developing-workflow-tasks/index.html index e865ace5b78..1a969695db7 100644 --- a/docs/v9.0.0/workflows/developing-workflow-tasks/index.html +++ b/docs/v9.0.0/workflows/developing-workflow-tasks/index.html @@ -5,13 +5,13 @@ Developing Workflow Tasks | Cumulus Documentation - +
    Version: v9.0.0

    Developing Workflow Tasks

    Workflow tasks can be either AWS Lambda Functions or ECS Activities.

    Lambda functions

    The full set of available core Lambda functions can be found in the deployed cumulus module zipfile at /tasks, as well as reference documentation here. These Lambdas can be referenced in workflows via the outputs from that module (see the cumulus-template-deploy repo for an example).

    The tasks source is located in the Cumulus repository at cumulus/tasks.

    You can also develop your own Lambda function. See the Lambda Functions page to learn more.

    ECS Activities

    ECS activities are supported via the cumulus_ecs_module available from the Cumulus release page.

    Please read the module README for configuration details.

    For assistance in creating a task definition within the module read the AWS Task Definition Docs.

    For a step-by-step example of using the cumulus_ecs_module, please see the related cookbook entry.

    Cumulus Docker Image

    ECS activities require a Docker image. Cumulus provides a Docker image (source) for Node 12.x+ Lambdas on Docker Hub: cumuluss/cumulus-ecs-task.

    Alternate Docker Images

    Custom docker images/runtimes are supported as are private registries. For details on configuring a private registry/image see the AWS documentation on Private Registry Authentication for Tasks.

    - + \ No newline at end of file diff --git a/docs/v9.0.0/workflows/docker/index.html b/docs/v9.0.0/workflows/docker/index.html index 33dda9eb076..2b81794fdc6 100644 --- a/docs/v9.0.0/workflows/docker/index.html +++ b/docs/v9.0.0/workflows/docker/index.html @@ -5,7 +5,7 @@ Dockerizing Data Processing | Cumulus Documentation - + @@ -14,7 +14,7 @@ 2) validate the output (in this case just check for existence) 3) use 'ncatted' to update the resulting file to be CF-compliant 4) write out metadata generated for this file

    Process Testing

    It is important to have tests for data processing, however in many cases datafiles can be large so it is not practical to store the test data in the repository. Instead, test data is currently stored on AWS S3, and can be retrieved using the AWS CLI.

    aws s3 sync s3://cumulus-ghrc-logs/sample-data/collection-name data

    Where collection-name is the name of the data collection, such as 'avaps', or 'cpl'. For example, an abridged version of the data for CPL includes:

    ├── cpl
    │   ├── input
    │   │   ├── HS3_CPL_ATB_12203a_20120906.hdf5
    │   │   ├── HS3_CPL_OP_12203a_20120906.hdf5
    │   └── output
    │       ├── HS3_CPL_ATB_12203a_20120906.nc
    │       ├── HS3_CPL_ATB_12203a_20120906.nc.meta.xml
    │       ├── HS3_CPL_OP_12203a_20120906.nc
    │       ├── HS3_CPL_OP_12203a_20120906.nc.meta.xml

    Contained in the input directory are all possible sets of data files, while the output directory is the expected result of processing. In this case the hdf5 files are converted to NetCDF files and XML metadata files are generated.

    The docker image for a process can be used on the retrieved test data. First create a test-output directory in the newly created data directory.

    mkdir data/test-output

    Then run the docker image using docker-compose.

    docker-compose run test

    This will process the data in the data/input directory and put the output into data/test-output. Repositories also include Python-based tests which will validate this newly created output against the contents of data/output. Use Python's Nose tool to run the included tests.

    nosetests

    If the data/test-output directory validated against the contents of data/output the tests will be successful, otherwise an error will be reported.

    - + \ No newline at end of file diff --git a/docs/v9.0.0/workflows/index.html b/docs/v9.0.0/workflows/index.html index 9491d893282..6ffbb5feb08 100644 --- a/docs/v9.0.0/workflows/index.html +++ b/docs/v9.0.0/workflows/index.html @@ -5,13 +5,13 @@ Workflows | Cumulus Documentation - +
    Version: v9.0.0

    Workflows

    Workflows are comprised of one or more AWS Lambda Functions and ECS Activities to discover, ingest, process, manage and archive data.

    Provider data ingest and GIBS have a set of common needs in getting data from a source system and into the cloud where they can be distributed to end users. These common needs are:

    • Data Discovery - Crawling, polling, or detecting changes from a variety of sources.
    • Data Transformation - Taking data files in their original format and extracting and transforming them into another desired format such as visible browse images.
    • Archival - Storage of the files in a location that's accessible to end users.

    The high level view of the architecture and many of the individual steps are the same but the details of ingesting each type of collection differs. Different collection types and different providers have different needs. The individual boxes of a workflow are not only different. The branching, error handling, and multiplicity of the arrows connecting the boxes are also different. Some need visible images rendered from component data files from multiple collections. Some need to contact the CMR with updated metadata. Some will have different retry strategies to handle availability issues with source data systems.

    AWS and other cloud vendors provide an ideal solution for parts of these problems but there needs to be a higher level solution to allow the composition of AWS components into a full featured solution. The Ingest Workflow Architecture is designed to meet the needs for Earth Science data ingest and transformation.

    Goals

    Flexibility and Composability

    The steps to ingest and process data are different for each collection within a provider. Ingest should be as flexible as possible in the rearranging of steps and configuration.

    We want to use lego-like individual steps that can be composed by an operator.

    Individual steps should ...

    • Be as ignorant as possible of the overall flow. They should not be aware of previous steps.
    • Be runnable on their own.
    • Define their input and output in simple data structures.
    • Be domain agnostic.
    • Not make assumptions about specifics, for example what goes into a granule.

    Scalable

    The ingest architecture needs to be scalable, both to handle ingesting hundreds of millions of granules and to interpret dozens of different workflows.

    Data Provenance

    • We should have traceability for how data was produced and where it comes from.
    • Use immutable representations of data. Data once received is not overwritten. Data can be removed for cleanup.
    • All software is versioned. We can trace transformation of data by tracking the immutable source data and the versioned software applied to it.

    Operator Visibility and Control

    • Operators should be able to see and understand everything that is happening in the system.
    • It should be obvious why things are happening and straightforward to diagnose problems.
    • We generally assume that the operators know best in terms of the limits on a provider's infrastructure, how often things need to be done, and details of a collection. The architecture should defer to their decisions and knowledge while providing safety nets to prevent problems.

    A Reconfigurable Workflow Architecture

    The Ingest Workflow Architecture is defined by two entity types, Workflows and Tasks. A Workflow is a set of composed Tasks to complete an objective such as ingesting a granule. Tasks are the individual steps of a Workflow that perform one job. The workflow is responsible for executing the right task based on the current state and response from the last task executed. Tasks are completely decoupled in that they don't call each other or even need to know about the presence of other tasks.

    Workflows and tasks are configured as Terraform resources, which are triggered via configured rules within Cumulus.

    Diagram showing the Step Function execution path through workflow tasks for a collection ingest

    See the Example GIBS Ingest Architecture showing how workflows and tasks are used to define the GIBS Ingest Architecture.

    Workflows

    A workflow is a provider-configured set of steps that describe the process to ingest data. Workflows are defined using AWS Step Functions.

    Benefits of AWS Step Functions

    AWS Step Functions are described in detail in the AWS documentation, but they provide several benefits applicable to this architecture.

    • Prebuilt solution
    • Operations Visibility
      • Visual diagram
      • Every execution is recorded with both inputs and output for every step.
    • Composability
      • Allow composing AWS Lambdas and code running in other steps. Code can be run in EC2 to interface with it or even on premise if desired.
      • Step functions allow specifying when steps run in parallel or choices between steps based on data from the previous step.
    • Flexibility
      • Step functions are designed to be easy to build new applications and reconfigure. We're exposing that flexibility directly to the provider.
    • Reliability and Error Handling
      • Step functions allow configuration of retries and adding handling of error conditions.
    • Described via data
      • This makes it easy to save the step function in configuration management solutions.
      • We can build simple interfaces on top of the flexibility provided.

    Workflow Scheduler

    The scheduler is responsible for initiating a step function and passing in the relevant data for a collection. This is currently configured as an interval for each collection. The scheduler service creates the initial event by combining the collection configuration with the AWS execution context defined via the cumulus terraform module.

    Tasks

    A workflow is composed of tasks. Each task is responsible for performing a discrete step of the ingest process. These can be activities like:

    • Crawling a provider website for new data.
    • Uploading data from a provider to S3.
    • Executing a process to transform data.

    AWS Step Functions permit tasks to be code running anywhere, even on premise. We expect most tasks will be written as Lambda functions in order to take advantage of the easy deployment, scalability, and cost benefits provided by AWS Lambda.

    • Leverages Existing Work
      • The design leverages the existing work of Amazon by defining workflows using the AWS Step Function State Language. This is the language that was created for describing the state machines used in AWS Step Functions.
    • Open for Extension
      • Both meta and task_config which are used for configuring at the collection and task levels do not dictate the fields and structure of the configuration. Additional task specific JSON schemas can be used for extending the validation of individual steps.
    • Data-centric Configuration
      • The use of a single JSON configuration file allows this to be added to a workflow. We build additional support on top of the configuration file for simpler domain specific configuration or interactive GUIs.

    For more details on Task Messages and Configuration, visit Cumulus configuration and message protocol documentation.

    Ingest Deploy

    To view deployment documentation, please see the Cumulus deployment documentation.

    Tradeoffs, and Benefits

    This section documents various tradeoffs and benefits of the Ingest Workflow Architecture.

    Tradeoffs

    Workflow execution is handled completely by AWS

    This means we can't add our own code into the orchestration of the workflow. We can't add new features not supported by Step Functions. We can't do things like enforce that the responses from tasks always conform to a schema or extract the configuration for a task ahead of its execution.

    If we implemented our own orchestration we'd be able to add all of these. We save significant amounts of development effort and gain all the features of Step Functions for this trade-off. One workaround is providing a library of common task capabilities. These would optionally be available to tasks that can be implemented with Node.js and are able to include the library.

    Workflow Configuration is specified in AWS Step Function States Language

    The current design combines the states language defined by AWS with Ingest specific configuration. This means our representation has a tight coupling with their standard. If they make backwards incompatible changes in the future we will have to deal with existing projects written against that.

    We avoid having to develop our own standard and code to process it. The design can support new features in AWS Step Functions without needing to update the Ingest library code changes. It is unlikely they will make a backwards incompatible change at this point. One mitigation for this is writing data transformations to a new format if that were to happen.

    Collection Configuration Flexibility vs Complexity

    The Collections Configuration File is very flexible but requires more knowledge of AWS Step Functions to configure. A person modifying this file directly would need to be comfortable editing a JSON file and configuring AWS Step Functions state transitions which address AWS resources.

    The configuration file itself is not necessarily meant to be edited by a human directly. Since we are developing a reconfigurable, composable architecture that is specified entirely in data, additional tools can be developed on top of it. The existing recipes.json files can be mapped to this format. Operational tools like a GUI can be built that provide a usable interface for customizing workflows, but it will take time to develop these tools.

    Benefits

    This section describes benefits of the Ingest Workflow Architecture.

    Simplicity

    The concepts of Workflows and Tasks are simple ones that should make sense to providers. Additionally, the implementation will only consist of a few components because the design leverages existing services and capabilities of AWS. The Ingest implementation will only consist of some reusable task code to make task implementation easier, Ingest deployment, and the Workflow Scheduler.

    Composability

    The design aims to satisfy the need for ingest to integrate different workflows for providers. It is flexible in its ability to arrange tasks to meet the needs of a collection. Providers have developed and incorporated open source tools over the years, all of which are easily integrable into the workflows as tasks.

    There is low coupling between task steps. Failures of one component don't bring the whole system down. Individual tasks can be deployed separately.

    Scalability

    AWS Step Functions scale up as needed and aren't limited by a set number of servers. They also easily allow you to leverage the inherent scalability of serverless functions.

    Monitoring and Auditing

    • Every execution is captured.
    • Every task run has its inputs and outputs captured.
    • CloudWatch Metrics can be used to monitor many of the events within the Step Functions and can generate alarms for the whole process.
    • Visual report of the entire configuration.
      • Errors and success states are highlighted visually in the flow.

    Data Provenance

    • Monitoring and auditing ensures we know the data that was given to a task.
    • Workflows are versioned and the state machines stored in AWS Step Functions are immutable. Once created they cannot change.
    • Versioning of data in S3 or using immutable records in S3 will mean we always know what data was created as the result of a step or fed into a step.

    Appendix

    Example GIBS Ingest Architecture

    This shows the GIBS Ingest Architecture as an example of the use of the Ingest Workflow Architecture.

    • The GIBS Ingest Architecture consists of two workflows per collection type. There is one for discovery and one for ingest. The final stage of discovery triggers multiple ingest workflows for each MRF granule that needs to be generated.
    • It demonstrates both lambdas as tasks and a container used for MRF generation.

    GIBS Ingest Workflows

    Diagram showing the AWS Step Function execution path for a GIBS ingest workflow

    GIBS Ingest Granules Workflow

    This shows a visualization of an execution of the ingest granules workflow in step functions. The steps highlighted in green are the ones that executed and completed successfully.

    Diagram showing the AWS Step Function execution path for a GIBS ingest granules workflow

    - + \ No newline at end of file diff --git a/docs/v9.0.0/workflows/input_output/index.html b/docs/v9.0.0/workflows/input_output/index.html index 9cf5885f437..02e81f2148e 100644 --- a/docs/v9.0.0/workflows/input_output/index.html +++ b/docs/v9.0.0/workflows/input_output/index.html @@ -5,14 +5,14 @@ Workflow Inputs & Outputs | Cumulus Documentation - +
    Version: v9.0.0

    Workflow Inputs & Outputs

    General Structure

    Cumulus uses a common format for all inputs and outputs to workflows. The same format is used for input and output from workflow steps. The common format consists of a JSON object which holds all necessary information about the task execution and AWS environment. Tasks return objects identical in format to their input with the exception of a task-specific payload field. Tasks may also augment their execution metadata.
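
    As a rough sketch (the field values below are borrowed from the examples later on this page), the common format looks like:

    {
      "cumulus_meta": {
        "state_machine": "arn:aws:states:us-east-1:1234:stateMachine:MySfn",
        "execution_name": "MyExecution__id-1234"
      },
      "meta": {
        "foo": "bar"
      },
      "exception": {},
      "payload": {
        "anykey": "anyvalue"
      }
    }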

    Cumulus Message Adapter

    The Cumulus Message Adapter and Cumulus Message Adapter libraries help task developers integrate their tasks into a Cumulus workflow. These libraries adapt input and outputs from tasks into the Cumulus Message format. The Scheduler service creates the initial event message by combining the collection configuration, external resource configuration, workflow configuration, and deployment environment settings. The subsequent workflow messages between tasks must conform to the message schema. By using the Cumulus Message Adapter, individual task Lambda functions only receive the input and output specifically configured for the task, and not non-task-related message fields.

    The Cumulus Message Adapter libraries are called by the tasks with a callback function containing the business logic of the task as a parameter. They first adapt the incoming message to a format more easily consumable by Cumulus tasks, then invoke the task, and then adapt the task response back to the Cumulus message protocol to be sent to the next task.

    A task's Lambda function can be configured to include a Cumulus Message Adapter library which constructs input/output messages and resolves task configurations. The CMA can then be included in one of several ways:

    Lambda Layer

    In order to make use of this configuration, a Lambda layer must be uploaded to your account. Due to platform restrictions, Core cannot currently support sharable public layers; however, you can deploy the appropriate version from the release page in two ways:

    Once you've deployed the layer, integrate the CMA layer with your Lambdas:

    • If using the cumulus module, set the cumulus_message_adapter_lambda_layer_version_arn in your .tfvars file to integrate the CMA layer with all core Cumulus lambdas.
    • If including your own Lambda or ECS task Terraform modules, specify the CMA layer ARN in the Terraform resource definitions. Also, make sure to set the CUMULUS_MESSAGE_ADAPTER_DIR environment variable for the task to /opt for the CMA integration to work properly.

    In the future if you wish to update/change the CMA version you will need to update the deployed CMA, and update the layer configuration for the impacted Lambdas as needed.

    Please Note: Updating/removing a layer does not change a deployed Lambda, so to update the CMA you should deploy a new version of the CMA layer, update the associated Lambda configuration to reference the new CMA version, and re-deploy your Lambdas.

    Manual Addition

    You can include the CMA package in the Lambda code in the cumulus-message-adapter sub-directory of your Lambda .zip, for any Lambda runtime that includes a Python runtime. Python 2 is included in Lambda runtimes that use Amazon Linux; however, Amazon Linux 2 will not support this directly.

    Please note: It is expected that upcoming Cumulus releases will update the CMA layer to include a Python runtime.

    If you are manually adding the message adapter to your source and utilizing the CMA, you should set the Lambda's CUMULUS_MESSAGE_ADAPTER_DIR environment variable to target the installation path for the CMA.

    CMA Input/Output

    Input to the task application code is a JSON object with the following keys:

    • input: By default, the incoming payload is the payload output from the previous task, or it can be a portion of the payload as configured for the task in the corresponding .tf workflow definition file.
    • config: Task-specific configuration object with URL templates resolved.

    Output from the task application code is returned and placed in the payload key by default, but the config key can also be used to return just a portion of the task output.
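
    For illustration, a task's application code might therefore receive an object shaped like the following (the specific values are only placeholders drawn from the other examples in these docs):

    {
      "input": {
        "anykey": "anyvalue"
      },
      "config": {
        "bucket": "sample-internal-bucket",
        "provider": {
          "id": "FOO_DAAC"
        }
      }
    }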

    CMA configuration

    As of Cumulus > 1.15 and CMA > v1.1.1, configuration of the CMA is expected to be driven by AWS Step Function Parameters.

    Using the CMA package with the Lambda by any of the above mentioned methods (Lambda Layers, manual) requires configuration for its various features via a specific Step Function Parameters configuration format (see sample workflows in the examples cumulus-tf source for more examples):

    {
      "cma": {
        "event.$": "$",
        "ReplaceConfig": "{some config}",
        "task_config": "{some config}"
      }
    }

    The "event.$": "$" parameter is required as it passes the entire incoming message to the CMA client library for parsing, and to the CMA itself to convert the incoming message into a Cumulus message for use in the function.

    The following are the CMA's current configuration settings:

    ReplaceConfig (Cumulus Remote Message)

    Because of the potential size of a Cumulus message (mainly the payload field), a task can be configured to store a portion of its output on S3, leaving in its place an empty JSON object {} and a message key (Remote Message) that defines how to retrieve it. If the portion of the message targeted exceeds the configured MaxSize (defaults to 0 bytes) it will be written to S3.

    The CMA remote message functionality can be configured using parameters in several ways:

    Partial Message

    Setting the Path/Target path in the ReplaceConfig parameter (and optionally a non-default MaxSize)

    {
      "DiscoverGranules": {
        "Parameters": {
          "cma": {
            "event.$": "$",
            "ReplaceConfig": {
              "MaxSize": 1,
              "Path": "$.payload",
              "TargetPath": "$.payload"
            }
          }
        }
      }
    }

    will result in any payload output larger than the MaxSize (in bytes) being written to S3. The CMA will then mark that the key has been replaced via a replace key on the event. When the CMA picks up the replace key in future steps, it will attempt to retrieve the output from S3 and write it back to payload.

    Note that you can optionally use a different TargetPath than Path; however, as the target is a JSON path, there must be a key to target for replacement in the output of that step. Also note that the JSON path specified must target one node, otherwise the CMA will error, as it does not support multiple replacement targets.

    If TargetPath is omitted, it will default to the value for Path.
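
    With the partial-message configuration above, the message leaving the step would look something like the sketch below (the bucket and key values are illustrative only; the actual values are generated by the CMA):

    {
      "cumulus_meta": {...},
      "meta": {...},
      "payload": {},
      "replace": {
        "Bucket": "some-internal-bucket",
        "Key": "events/some-event-id",
        "TargetPath": "$.payload"
      }
    }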

    Full Message

    Setting the following parameters for a lambda:

    DiscoverGranules:
      Parameters:
        cma:
          event.$: '$'
          ReplaceConfig:
            FullMessage: true

    will result in the CMA assuming the entire inbound message should be stored to S3 if it exceeds the default max size.

    This is effectively the same as doing:

    {
      "DiscoverGranules": {
        "Parameters": {
          "cma": {
            "event.$": "$",
            "ReplaceConfig": {
              "MaxSize": 0,
              "Path": "$",
              "TargetPath": "$"
            }
          }
        }
      }
    }

    Cumulus Message example

    {
      "task_config": {
        "inlinestr": "prefix{meta.foo}suffix",
        "array": "{[$.meta.foo]}",
        "object": "{$.meta}"
      },
      "cumulus_meta": {
        "message_source": "sfn",
        "state_machine": "arn:aws:states:us-east-1:1234:stateMachine:MySfn",
        "execution_name": "MyExecution__id-1234",
        "id": "id-1234"
      },
      "meta": {
        "foo": "bar"
      },
      "payload": {
        "anykey": "anyvalue"
      }
    }

    Cumulus Remote Message example

    The message may contain a reference to an S3 Bucket, Key and TargetPath as follows:

    {
      "replace": {
        "Bucket": "cumulus-bucket",
        "Key": "my-large-event.json",
        "TargetPath": "$"
      },
      "cumulus_meta": {}
    }

    task_config

    This configuration key contains the input/output configuration values for definition of inputs/outputs via URL paths. Important: These values are all relative to the JSON object configured for event.$.

    This configuration's behavior is outlined in the CMA step description below.

    The configuration should follow the format:

    {
      "FunctionName": {
        "Parameters": {
          "cma": {
            "event.$": "$",
            "other_cma_configuration": "<config object>",
            "task_config": "<task config>"
          }
        }
      }
    }

    Example:

    {
      "StepFunction": {
        "Parameters": {
          "cma": {
            "event.$": "$",
            "task_config": {
              "sfnEnd": true,
              "stack": "{$.meta.stack}",
              "bucket": "{$.meta.buckets.internal.name}",
              "stateMachine": "{$.cumulus_meta.state_machine}",
              "executionName": "{$.cumulus_meta.execution_name}",
              "cumulus_message": {
                "input": "{$}"
              }
            }
          }
        }
      }
    }

    Cumulus Message Adapter Steps

    1. Reformat AWS Step Function message into Cumulus Message

    Due to the way AWS handles Parameterized messages, when Parameters are used the CMA takes an inbound message:

    {
      "resource": "arn:aws:lambda:us-east-1:<lambda arn values>",
      "input": {
        "Other Parameter": {},
        "cma": {
          "ConfigKey": {
            "config values": "some config values"
          },
          "event": {
            "cumulus_meta": {},
            "payload": {},
            "meta": {},
            "exception": {}
          }
        }
      }
    }

    and takes the following actions:

    • Takes the object at input.cma.event and makes it the full input
    • Merges all of the keys except event under input.cma into the parent input object

    This results in the incoming message (presumably a Cumulus message), with any cma configuration parameters merged in, being passed to the CMA. All other parameterized values defined outside of the cma key are ignored.
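
    Using the inbound message above, the resulting Cumulus message the CMA works with would look roughly like:

    {
      "ConfigKey": {
        "config values": "some config values"
      },
      "cumulus_meta": {},
      "payload": {},
      "meta": {},
      "exception": {}
    }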

    2. Resolve Remote Messages

    If the incoming Cumulus message has a replace key value, the CMA will attempt to pull the payload from S3.

    For example, if the incoming message contains the following:

    "meta": {
      "foo": {}
    },
    "replace": {
      "TargetPath": "$.meta.foo",
      "Bucket": "some_bucket",
      "Key": "events/some-event-id"
    }

    The CMA will attempt to pull the file stored at Bucket/Key and replace the value at TargetPath, then remove the replace object entirely and continue.
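
    For instance, if the object stored at events/some-event-id contained {"anykey": "anyvalue"} (a hypothetical value), the resolved message fragment would become the following, with the replace key removed:

    "meta": {
      "foo": {
        "anykey": "anyvalue"
      }
    }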

    3. Resolve URL templates in the task configuration

    In the workflow configuration (defined under the task_config key), each task has its own configuration, and it can use URL templates as values, either for simplicity or for values only available at execution time. The Cumulus Message Adapter resolves the URL templates (relative to the event configuration key) and then passes the message to the next task. For example, given a task which has the following configuration:

    {
      "Parameters": {
        "cma": {
          "event.$": "$",
          "task_config": {
            "provider": "{$.meta.provider}",
            "inlinestr": "prefix{meta.foo}suffix",
            "array": "{[$.meta.foo]}",
            "object": "{$.meta}"
          }
        }
      }
    }

    and an incoming message that contains:

    {
      "meta": {
        "foo": "bar",
        "provider": {
          "id": "FOO_DAAC",
          "anykey": "anyvalue"
        }
      }
    }

    The corresponding Cumulus Message would contain:

    "meta": {
      "foo": "bar",
      "provider": {
        "id": "FOO_DAAC",
        "anykey": "anyvalue"
      }
    },
    "task_config": {
      "provider": "{$.meta.provider}",
      "inlinestr": "prefix{meta.foo}suffix",
      "array": "{[$.meta.foo]}",
      "object": "{$.meta}"
    }

    The message sent to the task would be:

    "config": {
      "provider": {
        "id": "FOO_DAAC",
        "anykey": "anyvalue"
      },
      "inlinestr": "prefixbarsuffix",
      "array": ["bar"],
      "object": {
        "foo": "bar",
        "provider": {
          "id": "FOO_DAAC",
          "anykey": "anyvalue"
        }
      }
    },
    "input": "{...}"

    URL template variables replace dotted paths inside curly brackets with their corresponding value. If the Cumulus Message Adapter cannot resolve a value, it will ignore the template, leaving it verbatim in the string. While seemingly complex, this allows significant decoupling of Tasks from one another and the data that drives them. Tasks are able to easily receive runtime configuration produced by previously run tasks and domain data.

    4. Resolve task input

    By default, the incoming payload is the payload from the previous task. The task can also be configured to use a portion of the payload as its input message. For example, given that a task specifies cma.task_config.cumulus_message.input:

    ExampleTask:
      Parameters:
        cma:
          event.$: '$'
          task_config:
            cumulus_message:
              input: '{$.payload.foo}'

    The task configuration in the message would be:

    {
      "task_config": {
        "cumulus_message": {
          "input": "{$.payload.foo}"
        }
      },
      "payload": {
        "foo": {
          "anykey": "anyvalue"
        }
      }
    }

    The Cumulus Message Adapter will resolve the task input; instead of sending the whole payload as task input, the task input would be:

    {
      "input": {
        "anykey": "anyvalue"
      },
      "config": {...}
    }

    5. Resolve task output

    By default, the task's return value is the next payload. However, the workflow task configuration can specify a portion of the return value as the next payload, and can also augment values to other fields. Based on the task configuration under cma.task_config.cumulus_message.outputs, the Message Adapter uses a task's return value to output a message as configured by the task-specific config defined under cma.task_config. The Message Adapter dispatches a "source" to a "destination" as defined by URL templates stored in the task-specific cumulus_message.outputs. The value of the task's return value at the "source" URL is used to create or replace the value of the task's return value at the "destination" URL. For example, given a task that specifies cumulus_message.outputs in its workflow configuration as follows:

    {
      "ExampleTask": {
        "Parameters": {
          "cma": {
            "event.$": "$",
            "task_config": {
              "cumulus_message": {
                "outputs": [
                  {
                    "source": "{$}",
                    "destination": "{$.payload}"
                  },
                  {
                    "source": "{$.output.anykey}",
                    "destination": "{$.meta.baz}"
                  }
                ]
              }
            }
          }
        }
      }
    }

    The corresponding Cumulus Message would be:

    {
      "task_config": {
        "cumulus_message": {
          "outputs": [
            {
              "source": "{$}",
              "destination": "{$.payload}"
            },
            {
              "source": "{$.output.anykey}",
              "destination": "{$.meta.baz}"
            }
          ]
        }
      },
      "meta": {
        "foo": "bar"
      },
      "payload": {
        "anykey": "anyvalue"
      }
    }

    Given the response from the task is:

    {
      "output": {
        "anykey": "boo"
      }
    }

    The Cumulus Message Adapter would output the following Cumulus Message:

    {
      "task_config": {
        "cumulus_message": {
          "outputs": [
            {
              "source": "{$}",
              "destination": "{$.payload}"
            },
            {
              "source": "{$.output.anykey}",
              "destination": "{$.meta.baz}"
            }
          ]
        }
      },
      "meta": {
        "foo": "bar",
        "baz": "boo"
      },
      "payload": {
        "output": {
          "anykey": "boo"
        }
      }
    }

    6. Apply Remote Message Configuration

    If the ReplaceConfig configuration parameter is defined, the CMA will evaluate the configuration options provided, and if required write a portion of the Cumulus Message to S3, and add a replace key to the message for future steps to utilize.

    Please Note: the non-user-modifiable field cumulus_meta will always be retained, regardless of the configuration.

    For example, if the output message (post output configuration) from a Cumulus task looks like:

    {
      "cumulus_meta": {
        "some_key": "some_value"
      },
      "ReplaceConfig": {
        "FullMessage": true
      },
      "task_config": {
        "cumulus_message": {
          "outputs": [
            {
              "source": "{$}",
              "destination": "{$.payload}"
            },
            {
              "source": "{$.output.anykey}",
              "destination": "{$.meta.baz}"
            }
          ]
        }
      },
      "meta": {
        "foo": "bar",
        "baz": "boo"
      },
      "payload": {
        "output": {
          "anykey": "boo"
        }
      }
    }

    the resultant output would look like:

    {
      "cumulus_meta": {
        "some_key": "some_value"
      },
      "replace": {
        "TargetPath": "$",
        "Bucket": "some-internal-bucket",
        "Key": "events/some-event-id"
      }
    }

    Additional features

    Validate task input, output and configuration messages against the schemas provided

    The Cumulus Message Adapter has the capability to validate task input, output and configuration messages against their schemas. The default location of the schemas is the schemas folder in the top level of the task and the default filenames are input.json, output.json, and config.json. The task can also configure a different schema location. If no schema can be found, the Cumulus Message Adapter will not validate the messages.
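
    As a minimal sketch (the schema contents are hypothetical and task-specific), a task might ship a schemas/input.json similar to the following, which the CMA would use to validate the resolved task input:

    {
      "title": "ExampleTask Input",
      "type": "object",
      "properties": {
        "granules": {
          "type": "array"
        }
      },
      "required": ["granules"]
    }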

    - + \ No newline at end of file diff --git a/docs/v9.0.0/workflows/lambda/index.html b/docs/v9.0.0/workflows/lambda/index.html index ed54c97e32c..bdb7f8a5a68 100644 --- a/docs/v9.0.0/workflows/lambda/index.html +++ b/docs/v9.0.0/workflows/lambda/index.html @@ -5,13 +5,13 @@ Develop Lambda Functions | Cumulus Documentation - +
    Version: v9.0.0

    Develop Lambda Functions

    Develop a new Cumulus Lambda

    AWS provides a great getting started guide for building Lambdas in the developer guide.

    Cumulus currently supports the following environments for Cumulus Message Adapter enabled functions:

    Additionally, you may choose to include any of the other languages AWS supports as a resource with reduced feature support.

    Deploy a Lambda

    Node.js Lambda

    For a new Node.js Lambda, create a new function and add an aws_lambda_function resource to your Cumulus deployment (for examples, see example/lambdas.tf and ingest/lambda-functions.tf in the Cumulus source), either as a new .tf file or added to an existing .tf file:

    resource "aws_lambda_function" "myfunction" {
      function_name    = "${var.prefix}-function"
      filename         = "/path/to/zip/lambda.zip"
      source_code_hash = filebase64sha256("/path/to/zip/lambda.zip")
      handler          = "index.handler"
      role             = module.cumulus.lambda_processing_role_arn
      runtime          = "nodejs10.x"

      vpc_config {
        subnet_ids         = var.subnet_ids
        security_group_ids = var.security_group_ids
      }
    }

    Please note: This example contains the minimum set of required configuration.

    Make sure to include a vpc_config that matches the information you've provided to the cumulus module if you intend to integrate the Lambda with a Cumulus deployment.

    Java Lambda

    Java Lambdas are created in much the same way as the Node.js example above.

    The source points to a folder with the compiled .class files and dependency libraries in the Lambda Java zip folder structure (details here), not an uber-jar.

    The deploy folder referenced here would contain a folder 'test_task/task/' which contains Task.class and TaskLogic.class as well as a lib folder containing dependency jars.

    Python Lambda

    Python Lambdas are created the same way as the Node.js example above.

    Cumulus Message Adapter

    For Lambdas wishing to utilize the Cumulus Message Adapter (CMA), you should define a layers key on your Lambda resource with the CMA you wish to include. See the input_output docs for more on how to create/use the CMA.

    Other Lambda Options

    Cumulus supports all of the options available to you via the aws_lambda_function Terraform resource. For more information on what's available, check out the Terraform resource docs.

    Cloudwatch log groups

    If you want to enable Cloudwatch logging for your Lambda resource, you'll need to add an aws_cloudwatch_log_group resource to your Lambda definition:

    resource "aws_cloudwatch_log_group" "myfunction_log_group" {
      name              = "/aws/lambda/${aws_lambda_function.myfunction.function_name}"
      retention_in_days = 30
      tags              = { Deployment = var.prefix }
    }
    - + \ No newline at end of file diff --git a/docs/v9.0.0/workflows/protocol/index.html b/docs/v9.0.0/workflows/protocol/index.html index ca57fc3d750..98be536f006 100644 --- a/docs/v9.0.0/workflows/protocol/index.html +++ b/docs/v9.0.0/workflows/protocol/index.html @@ -5,13 +5,13 @@ Workflow Protocol | Cumulus Documentation - +
    Version: v9.0.0

    Workflow Protocol

    Configuration and Message Use Diagram

    A diagram showing at which point in a workflow the Cumulus message is checked for conformity with the message schema and where the configuration is checked for conformity with the configuration schema

    • Configuration - The Cumulus workflow configuration defines everything needed to describe an instance of Cumulus.
    • Scheduler - This starts ingest of a collection on configured intervals.
    • Input to Step Functions - The Scheduler uses the Configuration as source data to construct the input to the Workflow.
    • AWS Step Functions - Run the workflows as kicked off by the scheduler or other processes.
    • Input to Task - The input for each task is a JSON document that conforms to the message schema.
    • Output from Task - The output of each task must conform to the message schemas as well and is used as the input for the subsequent task.
    - + \ No newline at end of file diff --git a/docs/v9.0.0/workflows/workflow-configuration-how-to/index.html b/docs/v9.0.0/workflows/workflow-configuration-how-to/index.html index e86e665d180..320e6b42e4a 100644 --- a/docs/v9.0.0/workflows/workflow-configuration-how-to/index.html +++ b/docs/v9.0.0/workflows/workflow-configuration-how-to/index.html @@ -5,7 +5,7 @@ Workflow Configuration How To's | Cumulus Documentation - + @@ -13,7 +13,7 @@
    Version: v9.0.0

    Workflow Configuration How To's

    How to specify a bucket for granules

    Bucket configuration

    Buckets configured in your deployment for the cumulus module's inputs will ultimately become part of the workflow configuration. The type property of a bucket depends on how that bucket will be used:

    • public indicates a completely public bucket.
    • internal type is for system use.
    • protected buckets are for any information that should be behind Earthdata Login authentication.
    • private buckets are for private data.

    Consider the following buckets configuration variable for the cumulus module for all following examples:

    buckets = {
      internal = {
        name = "sample-internal-bucket",
        type = "internal"
      },
      private = {
        name = "sample-private-bucket",
        type = "private"
      },
      protected = {
        name = "sample-protected-bucket",
        type = "protected"
      },
      public = {
        name = "sample-public-bucket",
        type = "public"
      },
      protected-2 = {
        name = "sample-protected-bucket-2",
        type = "protected"
      }
    }

    Point to buckets in the workflow configuration

    Buckets specified in the buckets input variable to the cumulus module will be available in the meta object of the Cumulus message.

    To use the buckets specified in the configuration, you can do the following:

    {
      "DiscoverGranules": {
        "Parameters": {
          "cma": {
            "event.$": "$",
            "task_config": {
              "provider": "{$.meta.provider}",
              "provider_path": "{$.meta.provider_path}",
              "collection": "{$.meta.collection}",
              "buckets": "{$.meta.buckets}"
            }
          }
        }
      }
    }

    Or, to map a specific bucket to a config value for a task:

    {
      "MoveGranules": {
        "Parameters": {
          "cma": {
            "event.$": "$",
            "task_config": {
              "bucket": "{$.meta.buckets.internal.name}",
              "buckets": "{$.meta.buckets}"
            }
          }
        }
      }
    }

    Hardcode a bucket

    Bucket names can be hardcoded in your workflow configuration, for example:

    {
      "DiscoverGranules": {
        "Parameters": {
          "cma": {
            "event.$": "$",
            "task_config": {
              "provider": "{$.meta.provider}",
              "provider_path": "{$.meta.provider_path}",
              "collection": "{$.meta.collection}",
              "buckets": {
                "internal": "sample-internal-bucket",
                "protected": "sample-protected-bucket-2"
              }
            }
          }
        }
      }
    }

    Or you can use a combination of meta bucket references and hardcoded values:

    {
      "DiscoverGranules": {
        "Parameters": {
          "cma": {
            "event.$": "$",
            "task_config": {
              "provider": "{$.meta.provider}",
              "provider_path": "{$.meta.provider_path}",
              "collection": "{$.meta.collection}",
              "buckets": {
                "internal": "sample-internal-bucket",
                "private": "{$.meta.buckets.private.name}"
              }
            }
          }
        }
      }
    }

    Using meta and hardcoding

    Bucket names can be configured using a mixture of hardcoded values and values from the meta. For example, to configure the bucket based on the collection name you could do something like:

    {
      "DiscoverGranules": {
        "Parameters": {
          "cma": {
            "event.$": "$",
            "task_config": {
              "provider": "{$.meta.provider}",
              "provider_path": "{$.meta.provider_path}",
              "collection": "{$.meta.collection}",
              "buckets": {
                "internal": "{$.meta.collection.name}-bucket"
              }
            }
          }
        }
      }
    }

    How to specify a file location in a bucket

    Granule files can be placed in folders and subfolders in buckets for better organization. This is done by setting a url_path at the base level of a collection configuration, which applies to all files. To affect the placement of only a single file, the url_path variable can be set on that specific file in the collection configuration. There are a number of different ways to populate url_path.

    Hardcoding file placement

    A file path can be added as the url_path in the collection configuration to specify the final location of the files. For example, take the following collection configuration

    {
      "name": "MOD09GQ",
      "version": "006",
      "url_path": "example-path",
      "files": [
        {
          "bucket": "protected",
          "regex": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\.hdf$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
          "url_path": "file-example-path"
        },
        {
          "bucket": "private",
          "regex": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\.hdf\\.met$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.met"
        }
      ]
    }

    The first file, MOD09GQ.A2017025.h21v00.006.2017034065104.hdf, has its own url_path so the resulting file path might look like s3://sample-protected-bucket/file-example-path/MOD09GQ.A2017025.h21v00.006.2017034065104.hdf. The second file, MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.met, does not have its own url_path so it will use the collection url_path and have a final file path of s3://sample-private-bucket/example-path/MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.met.

    Using a template for file placement

    Instead of hardcoding the placement, the url_path can be a template to be populated with metadata during the move-granules step. For example:

    "url_path": "{cmrMetadata.Granule.Collection.ShortName}"

    This url_path will be populated with the collection short name, "MOD09GQ". To take a subset of any given metadata value, use the substring option.

    "url_path": "{cmrMetadata.Granule.Collection.ShortName}/{substring(file.name, 0, 3)}"

    This example will populate to "MOD09GQ/MOD".

    Note: the move-granules step needs to be in the workflow for this template to be populated and the file moved. This cmrMetadata or CMR granule XML needs to have been generated and stored on S3. From there any field could be retrieved and used for a url_path.

    Adding Metadata dates and times to the URL Path

    There are a number of options to pull dates from the CMR file metadata. With this metadata:

    <Granule>
      <Temporal>
        <RangeDateTime>
          <BeginningDateTime>2003-02-19T00:00:00Z</BeginningDateTime>
          <EndingDateTime>2003-02-19T23:59:59Z</EndingDateTime>
        </RangeDateTime>
      </Temporal>
    </Granule>

    The following examples of url_path could be used.

    {extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the year from the full date: 2003.

    {extractMonth(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the month: 2.

    {extractDate(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the day: 19.

    {extractHour(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the hour: 0.

    Different values can be combined to create the url_path. For example

    {
      "bucket": "sample-protected-bucket",
      "name": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
      "url_path": "{cmrMetadata.Granule.Collection.ShortName}/{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}/{extractDate(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}"
    }

    The final file location for the above would be s3://sample-protected-bucket/MOD09GQ/2003/19/MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.

    - + \ No newline at end of file diff --git a/docs/v9.0.0/workflows/workflow-triggers/index.html b/docs/v9.0.0/workflows/workflow-triggers/index.html index bedf1b7b8f0..b41e8f4dd6c 100644 --- a/docs/v9.0.0/workflows/workflow-triggers/index.html +++ b/docs/v9.0.0/workflows/workflow-triggers/index.html @@ -5,13 +5,13 @@ Workflow Triggers | Cumulus Documentation - +
    Version: v9.0.0

    Workflow Triggers

    For a workflow to run, it needs to be associated with a rule (see rule configuration). The rule configuration determines how and when a workflow execution is triggered. Rules can be triggered one time, on a schedule, or by new data written to a kinesis stream.
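
    As a rough sketch only (the field values below are hypothetical, and the full set of rule fields is defined by the rule configuration documentation referenced above), a scheduled rule might look like:

    {
      "name": "MOD09GQ_daily_ingest",
      "workflow": "IngestAndPublish",
      "provider": "PODAAC_SWOT",
      "collection": {
        "name": "MOD09GQ",
        "version": "006"
      },
      "rule": {
        "type": "scheduled",
        "value": "rate(1 day)"
      },
      "state": "ENABLED"
    }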

    There are three lambda functions in the API package responsible for scheduling and starting workflows: SF scheduler, message consumer, and SF starter. Each Cumulus instance comes with a Start SF SQS queue.

    The SF scheduler lambda puts a message onto the Start SF queue. This message is picked up by the Start SF lambda and an execution is started with the body of the message as the input.

    When a one time rule is created, the schedule SF lambda is triggered. Rules that are not one time are associated with a CloudWatch event, which manages triggering the lambdas that in turn trigger the workflows.

    For a scheduled rule, the Cloudwatch event is triggered on the given schedule which calls directly to the schedule SF lambda.

    For a kinesis rule, when data is added to the kinesis stream, the Cloudwatch event is triggered, which calls the message consumer lambda. The message consumer lambda parses the kinesis message and finds all of the rules associated with that message. For each rule (which corresponds to one workflow), the schedule SF lambda is triggered to queue a message to start the workflow.

    For an sns rule, when a message is published to the SNS topic, the message consumer receives the SNS message (JSON expected), parses it into an object, starts a new execution of the workflow associated with the rule and passes the object in the payload field of the Cumulus message.

    Diagram showing how workflows are scheduled via rules

    - + \ No newline at end of file diff --git a/docs/v9.9.0/adding-a-task/index.html b/docs/v9.9.0/adding-a-task/index.html index e679a21216c..7455e4a589f 100644 --- a/docs/v9.9.0/adding-a-task/index.html +++ b/docs/v9.9.0/adding-a-task/index.html @@ -5,13 +5,13 @@ Contributing a Task | Cumulus Documentation - +
    Version: v9.9.0

    Contributing a Task

    We're tracking reusable Cumulus tasks in this list and, if you've got one you'd like to share with others, you can add it!

    Right now we're focused on tasks distributed via npm, but are open to including others. For now the script that pulls all the data for each package only supports npm.

    The tasks.md file is generated in the build process

    The tasks list in docs/tasks.md is generated from the list of task package names from the tasks folder.

    Do not edit the docs/tasks.md file directly.

    - + \ No newline at end of file diff --git a/docs/v9.9.0/api/index.html b/docs/v9.9.0/api/index.html index 98e7df1522d..e998a332606 100644 --- a/docs/v9.9.0/api/index.html +++ b/docs/v9.9.0/api/index.html @@ -5,13 +5,13 @@ Cumulus API | Cumulus Documentation - + - + \ No newline at end of file diff --git a/docs/v9.9.0/architecture/index.html b/docs/v9.9.0/architecture/index.html index 3c2ad9bee83..4cb8bcafcf2 100644 --- a/docs/v9.9.0/architecture/index.html +++ b/docs/v9.9.0/architecture/index.html @@ -5,14 +5,14 @@ Architecture | Cumulus Documentation - +
    Version: v9.9.0

    Architecture

    Architecture

    Below, find a diagram with the components that comprise an instance of Cumulus.

    Architecture diagram of a Cumulus deployment

    This diagram details all of the major architectural components of a Cumulus deployment.

    While the diagram can feel complex, it can be broken down into several major components:

    Data Distribution

    End Users can access data via Cumulus's distribution submodule, which includes ASF's thin egress application. This provides authenticated data egress, temporary S3 links, and other statistics features.

    End user exposure of Cumulus's holdings is expected to be provided by an external service.

    For NASA use, this is assumed to be CMR in this diagram.

    Data ingest

    Workflows

    The core of the ingest and processing capabilities in Cumulus is built into the deployed AWS Step Function workflows. Cumulus rules trigger workflows via CloudWatch rules, Kinesis streams, SNS topics, or SQS queues. The workflows then run with a configured Cumulus message, utilizing built-in processes to report the status of granules, PDRs, executions, etc. to the Data Persistence components.

    Workflows can optionally report granule metadata to CMR, and workflow steps can report metrics information to a shared SNS topic, which could be subscribed to for near real time granule, execution, and PDR status. This could be used for metrics reporting using an external ELK stack, for example.

    Data persistence

    Cumulus entity state data is stored in a set of DynamoDB database tables and is exported to an Elasticsearch instance to provide non-authoritative query/state data for the API and other applications that require more complex queries.

    Data discovery

    Discovering data for ingest is handled via workflow step components using Cumulus provider and collection configurations and various triggers. Data can be ingested from AWS S3, FTP, HTTPS and more.

    Database

    Cumulus utilizes a user-provided PostgreSQL database backend. For improved API search query efficiency Cumulus provides data replication to an Elasticsearch instance. For legacy reasons, Cumulus is currently also deploying a DynamoDB datastore, and writes are replicated in parallel with the PostgreSQL database writes. The DynamoDB replicated tables and parallel writes will be removed in future releases.

    PostgreSQL Database Schema Diagram

    ERD of the Cumulus Database

    Maintenance

    System maintenance personnel have access to manage ingest and various portions of Cumulus via an AWS API gateway, as well as the operator dashboard.

    Deployment Structure

    Cumulus is deployed via Terraform and is organized internally into two separate top-level modules, as well as several external modules.

    Cumulus

    The Cumulus module, which contains multiple internal submodules, deploys all of the Cumulus components that are not part of the Data Persistence portion of this diagram.

    Data persistence

    The data persistence module provides the Data Persistence portion of the diagram.

    Other modules

    Other modules are provided as artifacts on the release page for use by users configuring their own deployment, and contain extracted subcomponents of the cumulus module. For more on these components see the components documentation.

    For more on the specific structure, examples of use and how to deploy and more, please see the deployment docs as well as the cumulus-template-deploy repo .

    - + \ No newline at end of file diff --git a/docs/v9.9.0/configuration/cloudwatch-retention/index.html b/docs/v9.9.0/configuration/cloudwatch-retention/index.html index 4817ebbb675..dd327bf016c 100644 --- a/docs/v9.9.0/configuration/cloudwatch-retention/index.html +++ b/docs/v9.9.0/configuration/cloudwatch-retention/index.html @@ -5,13 +5,13 @@ Cloudwatch Retention | Cumulus Documentation - +
    Version: v9.9.0

    Cloudwatch Retention

    Our lambdas dump logs to AWS CloudWatch. By default, these logs exist indefinitely. However, there are ways to specify a duration for log retention.

    aws-cli

    In addition to getting your aws-cli set-up, there are two values you'll need to acquire.

    1. log-group-name: the name of the log group whose retention policy (retention time) you'd like to change. We'll use /aws/lambda/KinesisInboundLogger in our examples.
    2. retention-in-days: the number of days you'd like to retain the logs in the specified log group for. There is a list of possible values available in the aws logs documentation.

    For example, if we wanted to set log retention to 30 days on our KinesisInboundLogger lambda, we would write:

    aws logs put-retention-policy --log-group-name "/aws/lambda/KinesisInboundLogger" --retention-in-days 30

    Note: The aws-cli log command that we're using is explained in detail here.

    AWS Management Console

    Changing the log retention policy in the AWS Management Console is a fairly simple process:

    1. Navigate to the CloudWatch service in the AWS Management Console.
    2. Click on the Logs entry on the sidebar.
    3. Find the Log Group whose retention policy you're interested in changing.
    4. Click on the value in the Expire Events After column.
    5. Enter/Select the number of days you'd like to retain logs in that log group for.

    Screenshot of AWS console showing how to configure the retention period for Cloudwatch logs

    - + \ No newline at end of file diff --git a/docs/v9.9.0/configuration/collection-storage-best-practices/index.html b/docs/v9.9.0/configuration/collection-storage-best-practices/index.html index 3536e1b3f05..a5c9f2182c6 100644 --- a/docs/v9.9.0/configuration/collection-storage-best-practices/index.html +++ b/docs/v9.9.0/configuration/collection-storage-best-practices/index.html @@ -5,13 +5,13 @@ Collection Cost Tracking and Storage Best Practices | Cumulus Documentation - +
    Version: v9.9.0

    Collection Cost Tracking and Storage Best Practices

    Organizing your data is important for metrics you may want to collect. AWS S3 storage and cost metrics are calculated at the bucket level, so it is easy to get metrics by bucket. You can get storage metrics at the key prefix level, but that is done through the CLI, which can be very slow for large buckets. It is very difficult to estimate costs at the prefix level.

    Calculating Storage By Collection

    By bucket

    Usage by bucket can be obtained in your AWS Billing Dashboard via an S3 Usage Report. You can download your usage report for a period of time and review your storage and requests at the bucket level.

    Bucket metrics can also be found in the AWS CloudWatch Metrics Console (also see Using Amazon CloudWatch Metrics).

    Navigate to Storage Metrics and select the BucketName for all buckets you are interested in. The available metrics are BucketSizeInBytes and NumberOfObjects.

    In the Graphed metrics tab, you can select the type of statistic (i.e. average, minimum, maximum) and the period for the stats. At the top, it's useful to select from the dropdown to view the metrics as a number. You can also select the time period for which you want to see stats.

    Alternatively you can query CloudWatch using the CLI.

    This command will return the average number of bytes in the bucket test-bucket for 7/31/2019:

    aws cloudwatch get-metric-statistics --namespace AWS/S3 --start-time 2019-07-31T00:00:00 --end-time 2019-08-01T00:00:00 --period 86400 --statistics Average --region us-east-1 --metric-name BucketSizeBytes --dimensions Name=BucketName,Value=test-bucket Name=StorageType,Value=StandardStorage

    The result looks like:

    {
      "Datapoints": [
        {
          "Timestamp": "2019-07-31T00:00:00Z",
          "Average": 150996467959.0,
          "Unit": "Bytes"
        }
      ],
      "Label": "BucketSizeBytes"
    }

    By key prefix

    AWS does not offer storage and usage statistics at a key prefix level. Via the AWS CLI, you can get the total storage for a bucket or folder. The following command would get the storage for folder example-folder in bucket sample-bucket:

    aws s3 ls --summarize --human-readable --recursive s3://sample-bucket/example-folder | grep 'Total'

    Note that this can be a long-running operation for large buckets.

    Calculating Cost By Collection

    NASA NGAP Environment

    If using an NGAP account, the cost per bucket can be found in your CloudTamer console, in the Financials section of your account information. This is calculated on a monthly basis.

    There is no easy way to get the cost by folder in the buckets. You could calculate an estimate using the storage per prefix vs. the storage of the bucket.

    Outside of NGAP

    You can enable S3 Cost Allocation Tags and tag your buckets. From there, you can view the cost breakdown in your AWS Billing Dashboard via the Cost Explorer. Cost Allocation Tagging is available at the bucket level.

    There is no easy way to get the cost by folder in the buckets. You could calculate an estimate using the storage per prefix vs. the storage of the bucket.

    Storage Configuration

    Cumulus allows for the configuration of many buckets for your files. Buckets are created and added to your deployment as part of the deployment process.

    In your Cumulus collection configuration, you specify where you want the files to be stored post-processing. This is done by matching a regular expression on the file with the configured bucket.

    Note that in the collection configuration, the bucket field is the key to the buckets variable in the deployment's .tfvars file.

    Organizing By Bucket

    You can specify separate groups of buckets for each collection, which could look like the example below.

    {
      "name": "MOD09GQ",
      "version": "006",
      "granuleId": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$",
      "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
      "files": [
        {
          "bucket": "MOD09GQ-006-protected",
          "regex": "^.*\\.hdf$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf"
        },
        {
          "bucket": "MOD09GQ-006-private",
          "regex": "^.*\\.hdf\\.met$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.met"
        },
        {
          "bucket": "MOD09GQ-006-protected",
          "regex": "^.*\\.cmr\\.xml$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.cmr.xml"
        },
        {
          "bucket": "MOD09GQ-006-public",
          "regex": "^.*\\.jpg$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_ndvi.jpg"
        }
      ]
    }

    Additional collections would go to different buckets.

    Organizing by Key Prefix

    Different collections can be organized into different folders in the same bucket, using the key prefix, which is specified as the url_path in the collection configuration. In this simplified collection configuration example, the url_path field is set at the top level so that all files go to a path prefixed with the collection name and version.

    {
      "name": "MOD09GQ",
      "version": "006",
      "granuleId": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$",
      "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}",
      "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
      "files": [
        {
          "bucket": "protected",
          "regex": "^.*\\.hdf$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf"
        },
        {
          "bucket": "private",
          "regex": "^.*\\.hdf\\.met$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.met"
        },
        {
          "bucket": "protected",
          "regex": "^.*\\.cmr\\.xml$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.cmr.xml"
        },
        {
          "bucket": "public",
          "regex": "^.*\\.jpg$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_ndvi.jpg"
        }
      ]
    }

    In this case, the path to all the files would be: MOD09GQ___006/<filename> in their respective buckets.

    The url_path can be overridden directly on the file configuration. The example below produces the same result.

    {
      "name": "MOD09GQ",
      "version": "006",
      "granuleId": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$",
      "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
      "files": [
        {
          "bucket": "protected",
          "regex": "^.*\\.hdf$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
          "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
        },
        {
          "bucket": "private",
          "regex": "^.*\\.hdf\\.met$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.met",
          "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
        },
        {
          "bucket": "protected-2",
          "regex": "^.*\\.cmr\\.xml$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.cmr.xml",
          "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
        },
        {
          "bucket": "public",
          "regex": "^.*\\.jpg$",
          "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_ndvi.jpg",
          "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}"
        }
      ]
    }
    - + \ No newline at end of file diff --git a/docs/v9.9.0/configuration/data-management-types/index.html b/docs/v9.9.0/configuration/data-management-types/index.html index 185c2d1c141..cf298ba23d5 100644 --- a/docs/v9.9.0/configuration/data-management-types/index.html +++ b/docs/v9.9.0/configuration/data-management-types/index.html @@ -5,13 +5,13 @@ Cumulus Data Management Types | Cumulus Documentation - +
    Version: v9.9.0

    Cumulus Data Management Types

    What Are The Cumulus Data Management Types

    • Collections: Collections are logical sets of data objects of the same data type and version. They provide contextual information used by Cumulus ingest.
    • Granules: Granules are the smallest aggregation of data that can be independently managed. They are always associated with a collection, which is a grouping of granules.
    • Providers: Providers generate and distribute input data that Cumulus obtains and sends to workflows.
    • Rules: Rules tell Cumulus how to associate providers and collections and when/how to start processing a workflow.
    • Workflows: Workflows are composed of one or more AWS Lambda Functions and ECS Activities to discover, ingest, process, manage, and archive data.
    • Executions: Executions are records of a workflow.
    • Reconciliation Reports: Reports are a comparison of data sets to check to see if they are in agreement and to help Cumulus users detect conflicts.

    Interaction

    • Providers tell Cumulus where to get new data - i.e. S3, HTTPS
    • Collections tell Cumulus where to store the data files
    • Rules tell Cumulus when to trigger a workflow execution and tie providers and collections together

    Managing Data Management Types

    The following are created via the dashboard or API:

    • Providers
    • Collections
    • Rules
    • Reconciliation reports

    Granules are created by workflow executions and then can be managed via the dashboard or API.

    An execution record is created for each workflow execution triggered and can be viewed in the dashboard or data can be retrieved via the API.

    Workflows are created and managed via the Cumulus deployment.

    Configuration Fields

    Schemas

    Looking at our API schema definitions can provide us with some insight into collections, providers, rules, and their attributes (and whether those are required or not). The schemas for the different concepts will be referenced throughout this document.

    The schemas are extremely useful for understanding which attributes are configurable and which of those are required. Cumulus uses these schemas for validation.

    Providers

    Please note:

    • While connection configuration is defined here, settings that are specific to a particular ingest setup (e.g. 'What target directory should we be pulling from?' or 'How is duplicate handling configured?') are generally defined in a Rule or Collection, not the Provider.
    • There is some provider behavior which is controlled by task-specific configuration and not the provider definition. This configuration has to be set on a per-workflow basis. For example, see the httpListTimeout configuration on the discover-granules task.

    Provider Configuration

    The Provider configuration is defined by a JSON object that takes different configuration keys depending on the provider type. The following are definitions of typical configuration values relevant for the various providers:

    Configuration by provider type
    S3

    Key | Type | Required | Description
    id | string | Yes | Unique identifier for the provider
    globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited
    protocol | string | Yes | The protocol for this provider. Must be s3 for this provider type.
    host | string | Yes | S3 Bucket to pull data from

    http

    Key | Type | Required | Description
    id | string | Yes | Unique identifier for the provider
    globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited
    protocol | string | Yes | The protocol for this provider. Must be http for this provider type
    host | string | Yes | The host to pull data from (e.g. nasa.gov)
    username | string | No | Configured username for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication
    password | string | Only if username is specified | Configured password for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication
    port | integer | No | Port to connect to the provider on. Defaults to 80

    https

    Key | Type | Required | Description
    id | string | Yes | Unique identifier for the provider
    globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited
    protocol | string | Yes | The protocol for this provider. Must be https for this provider type
    host | string | Yes | The host to pull data from (e.g. nasa.gov)
    username | string | No | Configured username for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication
    password | string | Only if username is specified | Configured password for basic authentication. Cumulus encrypts this using KMS and uses it in a Basic auth header if needed for authentication
    port | integer | No | Port to connect to the provider on. Defaults to 443

    ftp

    Key | Type | Required | Description
    id | string | Yes | Unique identifier for the provider
    globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited
    protocol | string | Yes | The protocol for this provider. Must be ftp for this provider type
    host | string | Yes | The ftp host to pull data from (e.g. nasa.gov)
    username | string | No | Username to use to connect to the ftp server. Cumulus encrypts this using KMS. Defaults to anonymous if not defined
    password | string | No | Password to use to connect to the ftp server. Cumulus encrypts this using KMS. Defaults to password if not defined
    port | integer | No | Port to connect to the provider on. Defaults to 21

    sftp

    Key | Type | Required | Description
    id | string | Yes | Unique identifier for the provider
    globalConnectionLimit | integer | No | Integer specifying the connection limit for the provider. This is the maximum number of connections Cumulus compatible ingest lambdas are expected to make to a provider. Defaults to unlimited
    protocol | string | Yes | The protocol for this provider. Must be sftp for this provider type
    host | string | Yes | The ftp host to pull data from (e.g. nasa.gov)
    username | string | No | Username to use to connect to the sftp server.
    password | string | No | Password to use to connect to the sftp server.
    port | integer | No | Port to connect to the provider on. Defaults to 22
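
    For example, a minimal S3 provider definition built from the fields above (the id and host values are only placeholders) might look like:

    {
      "id": "SAMPLE_S3_PROVIDER",
      "protocol": "s3",
      "host": "sample-internal-bucket",
      "globalConnectionLimit": 10
    }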

    Collections

    Breakdown of s3_MOD09GQ_006.json (https://github.com/nasa/cumulus/blob/master/example/data/collections/s3_MOD09GQ_006/s3_MOD09GQ_006.json)

    Key | Value | Required | Description
    name | "MOD09GQ" | Yes | The name attribute designates the name of the collection. This is the name under which the collection will be displayed on the dashboard
    version | "006" | Yes | A version tag for the collection
    granuleId | "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}$" | Yes | The regular expression used to validate the granule ID extracted from filenames according to the granuleIdExtraction
    granuleIdExtraction | "(MOD09GQ\..*)(\.hdf|\.cmr|_ndvi\.jpg)" | Yes | The regular expression used to extract the granule ID from filenames. The first capturing group extracted from the filename by the regex will be used as the granule ID.
    sampleFileName | "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf" | Yes | An example filename belonging to this collection
    files | <JSON Object> of files defined here | Yes | Describe the individual files that will exist for each granule in this collection (size, browse, meta, etc.)
    dataType | "MOD09GQ" | No | Can be specified, but this value will default to the collection_name if not
    duplicateHandling | "replace" | No | ("replace"|"version"|"skip") determines granule duplicate handling scheme
    ignoreFilesConfigForDiscovery | false (default) | No | By default, during discovery only files that match one of the regular expressions in this collection's files attribute (see above) are ingested. Setting this to true will ignore the files attribute during discovery, meaning that all files for a granule (i.e., all files with filenames matching granuleIdExtraction) will be ingested even when they don't match a regular expression in the files attribute at discovery time. (NOTE: this attribute does not appear in the example file, but is listed here for completeness.)
    process | "modis" | No | Example options for this are found in the ChooseProcess step definition in the IngestAndPublish workflow definition
    meta | <JSON Object> of MetaData for the collection | No | MetaData for the collection. This metadata will be available to workflows for this collection via the Cumulus Message Adapter.
    url_path | "{cmrMetadata.Granule.Collection.ShortName}/{substring(file.name, 0, 3)}" | No | Filename without extension

    files-object

Key | Value | Required | Description
regex | "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\.hdf$" | Yes | Regular expression used to identify the file
sampleFileName | "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf" | Yes | Filename used to validate the provided regex
type | "data" | No | Value to be assigned to the Granule File Type. CNM types are used by Cumulus CMR steps, non-CNM values will be treated as 'data' type. Currently only utilized in DiscoverGranules task
bucket | "internal" | Yes | Name of the bucket where the file will be stored
url_path | "${collectionShortName}/{substring(file.name, 0, 3)}" | No | Folder used to save the granule in the bucket. Defaults to the collection url_path
checksumFor | "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\.hdf$" | No | If this is a checksum file, set checksumFor to the regex of the target file.
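
Putting the files-object fields together, a single entry in a collection's files array might look like the following sketch, using the sample values from the table above:

{
  "regex": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\.hdf$",
  "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
  "type": "data",
  "bucket": "internal",
  "url_path": "${collectionShortName}/{substring(file.name, 0, 3)}"
}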

    Rules

Rules are used to start processing workflows and the transformation process. Rules can be invoked manually, based on a schedule, or can be configured to be triggered by either events in Kinesis, SNS messages, or SQS messages.

    Rule configuration
Key | Value | Required | Description
name | "L2_HR_PIXC_kinesisRule" | Yes | Name of the rule. This is the name under which the rule will be listed on the dashboard
workflow | "CNMExampleWorkflow" | Yes | Name of the workflow to be run. A list of available workflows can be found on the Workflows page
provider | "PODAAC_SWOT" | No | Configured provider's ID. This can be found on the Providers dashboard page
collection | <JSON Object> collection object shown below | Yes | Name and version of the collection this rule will moderate. Relates to a collection configured and found in the Collections page
payload | <JSON Object or Array> | No | The payload to be passed to the workflow
meta | <JSON Object> of MetaData for the rule | No | MetaData for the rule. This metadata will be available to workflows for this rule via the Cumulus Message Adapter.
rule | <JSON Object> rule type and associated values - discussed below | Yes | Object defining the type and subsequent attributes of the rule
state | "ENABLED" | No | ("ENABLED"|"DISABLED") whether or not the rule will be active. Defaults to "ENABLED".
queueUrl | https://sqs.us-east-1.amazonaws.com/1234567890/queue-name | No | URL for SQS queue that will be used to schedule workflows for this rule
tags | ["kinesis", "podaac"] | No | An array of strings that can be used to simplify search

    collection-object

Key | Value | Required | Description
name | "L2_HR_PIXC" | Yes | Name of a collection defined/configured in the Collections dashboard page
version | "000" | Yes | Version number of a collection defined/configured in the Collections dashboard page

    meta-object

Key | Value | Required | Description
retries | 3 | No | Number of retries on errors, for sqs-type rule only. Defaults to 3.
visibilityTimeout | 900 | No | VisibilityTimeout in seconds for the inflight messages, for sqs-type rule only. Defaults to the visibility timeout of the SQS queue when the rule is created.

    rule-object

Key | Value | Required | Description
type | "kinesis" | Yes | ("onetime"|"scheduled"|"kinesis"|"sns"|"sqs") type of scheduling/workflow kick-off desired
value | <String> Object | Depends | Discussion of valid values is below

    rule-value

The rule value entry depends on the rule type; a complete rule example follows the list below:

    • If this is a onetime rule this can be left blank. Example
    • If this is a scheduled rule this field must hold a valid cron-type expression or rate expression.
    • If this is a kinesis rule, this must be a configured ${Kinesis_stream_ARN}. Example
    • If this is an sns rule, this must be an existing ${SNS_Topic_Arn}. Example
    • If this is an sqs rule, this must be an existing ${SQS_QueueUrl} that your account has permissions to access, and also you must configure a dead-letter queue for this SQS queue. Example
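
As an example of how the tables above fit together, a scheduled rule might look like the following sketch (the rule name, workflow name, provider ID, and cron expression are placeholders for your own values):

{
  "name": "MOD09GQ_daily_scheduledRule",
  "workflow": "DiscoverGranules",
  "provider": "MY_HTTPS_PROVIDER",
  "collection": {
    "name": "MOD09GQ",
    "version": "006"
  },
  "rule": {
    "type": "scheduled",
    "value": "cron(0 6 * * ? *)"
  },
  "state": "ENABLED",
  "tags": ["scheduled", "modis"]
}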

    sqs-type rule features

    • When an SQS rule is triggered, the SQS message remains on the queue.
    • The SQS message is not processed multiple times in parallel when visibility timeout is properly set. You should set the visibility timeout to the maximum expected length of the workflow with padding. Longer is better to avoid parallel processing.
    • The SQS message visibility timeout can be overridden by the rule.
    • Upon successful workflow execution, the SQS message is removed from the queue.
• Upon failed execution(s), the workflow is run 3 times by default, or the configured number of times.
    • Upon failed execution(s), the visibility timeout will be set to 5s to allow retries.
    • After configured number of failed retries, the SQS message is moved to the dead-letter queue configured for the SQS queue.
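
A hypothetical sqs-type rule that uses the meta fields described above to tune retry and visibility behavior could look like the following sketch (the queue URL and names are placeholders; the queue must already exist and have a dead-letter queue configured):

{
  "name": "MOD09GQ_sqsRule",
  "workflow": "IngestGranule",
  "collection": {
    "name": "MOD09GQ",
    "version": "006"
  },
  "rule": {
    "type": "sqs",
    "value": "https://sqs.us-east-1.amazonaws.com/1234567890/queue-name"
  },
  "meta": {
    "retries": 3,
    "visibilityTimeout": 900
  },
  "state": "ENABLED"
}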

    Configuration Via Cumulus Dashboard

    Create A Provider

    • In the Cumulus dashboard, go to the Provider page.

    Screenshot of Create Provider form

    • Click on Add Provider.
    • Fill in the form and then submit it.

    Screenshot of Create Provider form

    Create A Collection

    • Go to the Collections page.

    Screenshot of the Collections page

    • Click on Add Collection.
    • Copy and paste or fill in the collection JSON object form.

    Screenshot of Add Collection form

    • Once you submit the form, you should be able to verify that your new collection is in the list.

    Create A Rule

    1. Go To Rules Page
    • Go to the Cumulus dashboard, click on Rules in the navigation.
    • Click Add Rule.

    Screenshot of Rules page

2. Complete Form
    • Fill out the template form.

    Screenshot of a Rules template for adding a new rule

    For more details regarding the field definitions and required information go to Data Cookbooks.

    Note: If the state field is left blank, it defaults to false.

    Rule Examples

    • A rule form with completed required fields:

    Screenshot of a completed rule form

    • A successfully added Rule:

    Screenshot of created rule

    - + \ No newline at end of file diff --git a/docs/v9.9.0/configuration/lifecycle-policies/index.html b/docs/v9.9.0/configuration/lifecycle-policies/index.html index 162e965f005..5eb3b39a859 100644 --- a/docs/v9.9.0/configuration/lifecycle-policies/index.html +++ b/docs/v9.9.0/configuration/lifecycle-policies/index.html @@ -5,13 +5,13 @@ Setting S3 Lifecycle Policies | Cumulus Documentation - +
    Version: v9.9.0

    Setting S3 Lifecycle Policies

    This document will outline, in brief, how to set data lifecycle policies so that you are more easily able to control data storage costs while keeping your data accessible. For more information on why you might want to do this, see the 'Additional Information' section at the end of the document.

    Requirements

    • The AWS CLI installed and configured (if you wish to run the CLI example). See AWS's guide to setting up the AWS CLI for more on this. Please ensure the AWS CLI is in your shell path.
    • You will need a S3 bucket on AWS. You are strongly encouraged to use a bucket without voluminous amounts of data in it for experimenting/learning.
    • An AWS user with the appropriate roles to access the target bucket as well as modify bucket policies.

    Examples

    Walkthrough on setting time-based S3 Infrequent Access (S3IA) bucket policy

    This example will give step-by-step instructions on updating a bucket's lifecycle policy to move all objects in the bucket from the default storage to S3 Infrequent Access (S3IA) after a period of 90 days. Below are instructions for walking through configuration via the command line and the management console.

    Command Line

    Please ensure you have the AWS CLI installed and configured for access prior to attempting this example.

    Create policy

From any directory you choose, open an editor and add the following to a file named exampleRule.json:

{
  "Rules": [
    {
      "Status": "Enabled",
      "Filter": {
        "Prefix": ""
      },
      "Transitions": [
        {
          "Days": 90,
          "StorageClass": "STANDARD_IA"
        }
      ],
      "NoncurrentVersionTransitions": [
        {
          "NoncurrentDays": 90,
          "StorageClass": "STANDARD_IA"
        }
      ],
      "ID": "90DayS3IAExample"
    }
  ]
}

    Set policy

    On the command line run the following command (with the bucket you're working with substituted in place of yourBucketNameHere).

    aws s3api put-bucket-lifecycle-configuration --bucket yourBucketNameHere --lifecycle-configuration file://exampleRule.json

    Verify policy has been set

    To obtain all of the existing policies for a bucket, run the following command (again substituting the correct bucket name):

     $ aws s3api get-bucket-lifecycle-configuration --bucket yourBucketNameHere
{
  "Rules": [
    {
      "Status": "Enabled",
      "Filter": {
        "Prefix": ""
      },
      "Transitions": [
        {
          "Days": 90,
          "StorageClass": "STANDARD_IA"
        }
      ],
      "NoncurrentVersionTransitions": [
        {
          "NoncurrentDays": 90,
          "StorageClass": "STANDARD_IA"
        }
      ],
      "ID": "90DayS3IAExample"
    }
  ]
}

    You have set a policy that transitions any version of an object in the bucket to S3IA after each object version has not been modified for 90 days.

    Management Console

    Create Policy

    To create the example policy on a bucket via the management console, go to the following URL (replacing 'yourBucketHere' with the bucket you intend to update):

    https://s3.console.aws.amazon.com/s3/buckets/yourBucketHere/?tab=overview

    You should see a screen similar to:

    Screenshot of AWS console for an S3 bucket

    Click the "Management" Tab, then lifecycle button and press + Add lifecycle rule:

    Screenshot of &quot;Management&quot; tab of AWS console for an S3 bucket

    Give the rule a name (e.g. '90DayRule'), leaving the filter blank:

    Screenshot of window for configuring the name and scope of a lifecycle rule on an S3 bucket in the AWS console

    Click next, and mark Current Version and Previous Versions.

Then for each, click + Add transition and select Transition to Standard-IA after for the Object creation field, and set 90 for the Days after creation/Days after objects become noncurrent field. Your screen should look similar to:

    Screenshot of window for configuring the storage class transitions of a lifecycle rule on an S3 bucket in the AWS console

    Click next, then next past the Configure expiration screen (we won't be setting this), and on the fourth page, click Save:

    Screenshot of window for reviewing the configuration of a lifecycle rule on an S3 bucket in the AWS console

    You should now see you have a rule configured for your bucket:

    Screenshot of lifecycle rule appearing in the &quot;Management&quot; tab of AWS console for an S3 bucket

    You have now set a policy that transitions any version of an object in the bucket to S3IA after each object has not been modified for 90 days.

    Additional Information

    This section lists information you may want prior to enacting lifecycle policies. It is not required content for working through the examples.

    Strategy Overview

    For a discussion of overall recommended strategy, please review the Methodology for Data Lifecycle Management on the EarthData wiki.

    AWS Documentation

    The examples shown in this document are obviously fairly basic cases. By using object tags, filters and other configuration options you can enact far more complicated policies for various scenarios. For more reading on the topics presented on this page see:

    - + \ No newline at end of file diff --git a/docs/v9.9.0/configuration/monitoring-readme/index.html b/docs/v9.9.0/configuration/monitoring-readme/index.html index 08800220064..1e83094d014 100644 --- a/docs/v9.9.0/configuration/monitoring-readme/index.html +++ b/docs/v9.9.0/configuration/monitoring-readme/index.html @@ -5,14 +5,14 @@ Monitoring Best Practices | Cumulus Documentation - +
    Version: v9.9.0

    Monitoring Best Practices

    This document intends to provide a set of recommendations and best practices for monitoring the state of a deployed Cumulus and diagnosing any issues.

    Cumulus-provided resources and integrations for monitoring

Cumulus provides a number of resources that are useful for monitoring the system and its operation.

    Cumulus Dashboard

    The primary tool for monitoring the Cumulus system is the Cumulus Dashboard. The dashboard is hosted on Github and includes instructions on how to deploy and link it into your core Cumulus deployment.

    The dashboard displays workflow executions, their status, inputs, outputs, and some diagnostic information such as logs. For further information on the dashboard, its usage, and the information it provides, see the documentation.

    Cumulus-provided AWS resources

    Cumulus sets up CloudWatch log groups for all Core-provided tasks.

    Monitoring Lambda Functions

    Logging for each Lambda Function is available in Lambda-specific CloudWatch log groups.

    Monitoring ECS services

    Each deployed cumulus_ecs_service module also includes a CloudWatch log group for the processes running on ECS.

    Monitoring workflows

    For advanced debugging, we also configure dead letter queues on critical system functions. These will allow you to monitor and debug invalid inputs to the functions we use to start workflows, which can be helpful if you find that you are not seeing workflows being started as expected. More information on these can be found in the dead letter queue documentation

    AWS recommendations

    AWS has a number of recommendations on system monitoring. Rather than reproduce those here and risk providing outdated guidance, we've documented the following links which will take you to available AWS docs on monitoring recommendations and best practices for the services used in Cumulus:

    Example: Setting up email notifications for CloudWatch logs

    Cumulus does not provide out-of-the-box support for email notifications at this time. However, setting up email notifications on AWS is fairly straightforward in that the operative components are an AWS SNS topic and a subscribed email address.

    In terms of Cumulus integration, forwarding CloudWatch logs requires creating a mechanism, most likely a Lambda Function subscribed to the log group that will receive, filter and forward these messages to the SNS topic.

    As a very simple example, we could create a function that filters CloudWatch logs created by the @cumulus/logger package and sends email notifications for error and fatal log levels, adapting the example linked above:

    const zlib = require('zlib');
    const aws = require('aws-sdk');
    const { promisify } = require('util');

    const gunzip = promisify(zlib.gunzip);
    const sns = new aws.SNS();

    exports.handler = async (event) => {
    const payload = Buffer.from(event.awslogs.data, 'base64');
    const decompressedData = await gunzip(payload);
    const logData = JSON.parse(decompressedData.toString('ascii'));
    return await Promise.all(logData.logEvents.map(async (logEvent) => {
    const logMessage = JSON.parse(logEvent.message);
    if (['error', 'fatal'].includes(logMessage.level)) {
    return sns.publish({
    TopicArn: process.env.EmailReportingTopicArn,
    Message: logEvent.message
    }).promise();
    }
    return Promise.resolve();
    }));
    };

After creating the SNS topic, we can deploy this code as a lambda function, following the setup steps from Amazon. Make sure to include your SNS topic ARN as an environment variable on the lambda function by using the --environment option on aws lambda create-function.
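
For example, the deployment could be done with a command along these lines (the function name, runtime, role ARN, topic ARN, and zip file are placeholders for your own values):

aws lambda create-function \
  --function-name cloudwatch-log-email-forwarder \
  --runtime nodejs14.x \
  --handler index.handler \
  --zip-file fileb://function.zip \
  --role arn:aws:iam::123456789012:role/your-lambda-execution-role \
  --environment "Variables={EmailReportingTopicArn=arn:aws:sns:us-east-1:123456789012:your-email-topic}"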

    You will need to create subscription filters for each log group you want to receive emails for. We recommend automating this as much as possible, and you could very well handle this via Terraform, such as using a module to deploy filters alongside log groups, or exporting the log group names to an all-in-one email notification module.
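
As a sketch of the Terraform approach, a subscription filter for a single log group might look like the following (the filter name, log group name, and Lambda resource reference are placeholders; you will also need an aws_lambda_permission resource allowing CloudWatch Logs to invoke the function):

resource "aws_cloudwatch_log_subscription_filter" "email_forwarder" {
  name            = "email-forwarder-filter"
  log_group_name  = "/aws/lambda/some-cumulus-task"
  filter_pattern  = ""
  destination_arn = aws_lambda_function.cloudwatch_log_email_forwarder.arn
}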

    - + \ No newline at end of file diff --git a/docs/v9.9.0/configuration/server_access_logging/index.html b/docs/v9.9.0/configuration/server_access_logging/index.html index c9cd2239b84..e1b1831d043 100644 --- a/docs/v9.9.0/configuration/server_access_logging/index.html +++ b/docs/v9.9.0/configuration/server_access_logging/index.html @@ -5,13 +5,13 @@ S3 Server Access Logging | Cumulus Documentation - +
    Version: v9.9.0

    S3 Server Access Logging

    Via AWS Console

    Enable server access logging for an S3 bucket

    Via AWS Command Line Interface

    1. Create a logging.json file with these contents, replacing <stack-internal-bucket> with your stack's internal bucket name, and <stack> with the name of your cumulus stack.

      {
      "LoggingEnabled": {
      "TargetBucket": "<stack-internal-bucket>",
      "TargetPrefix": "<stack>/ems-distribution/s3-server-access-logs/"
      }
      }
    2. Add the logging policy to each of your protected and public buckets by calling this command on each bucket.

      aws s3api put-bucket-logging --bucket <protected/public-bucket-name> --bucket-logging-status file://logging.json
    3. Verify the logging policy exists on your buckets.

      aws s3api get-bucket-logging --bucket <protected/public-bucket-name>
    - + \ No newline at end of file diff --git a/docs/v9.9.0/configuration/task-configuration/index.html b/docs/v9.9.0/configuration/task-configuration/index.html index 02016e4f1a8..71abdc523f3 100644 --- a/docs/v9.9.0/configuration/task-configuration/index.html +++ b/docs/v9.9.0/configuration/task-configuration/index.html @@ -5,13 +5,13 @@ Configuration of Tasks | Cumulus Documentation - +
    Version: v9.9.0

    Configuration of Tasks

    The cumulus module exposes values for configuration for some of the provided archive and ingest tasks. Currently the following are available as configurable variables:

    elasticsearch_client_config

    Configuration parameters for Elasticsearch client for cumulus archive module tasks in the form:

    <lambda_identifier>_es_scroll_duration = <duration>
    <lambda_identifier>_es_scroll_size = <size>
    type = map(string)

    Currently the following values are supported:

    • create_reconciliation_report_es_scroll_duration
    • create_reconciliation_report_es_scroll_size

    Example

    elasticsearch_client_config = {
    create_reconciliation_report_es_scroll_duration = "15m"
    create_reconciliation_report_es_scroll_size = 2000
    }

    lambda_timeouts

    A configurable map of timeouts (in seconds) for cumulus ingest module task lambdas in the form:

    <lambda_identifier>_timeout: <timeout>
    type = map(string)

    Currently the following values are supported:

    • discover_granules_task_timeout
    • discover_pdrs_task_timeout
    • hyrax_metadata_update_tasks_timeout
    • lzards_backup_task_timeout
    • move_granules_task_timeout
    • parse_pdr_task_timeout
    • pdr_status_check_task_timeout
    • post_to_cmr_task_timeout
    • queue_granules_task_timeout
    • queue_pdrs_task_timeout
    • queue_workflow_task_timeout
    • sync_granule_task_timeout
    • update_granules_cmr_metadata_file_links_task_timeout

    Example

    lambda_timeouts = {
    discover_granules_task_timeout = 300
    }
    - + \ No newline at end of file diff --git a/docs/v9.9.0/data-cookbooks/about-cookbooks/index.html b/docs/v9.9.0/data-cookbooks/about-cookbooks/index.html index c0aa89c3615..df8bd235040 100644 --- a/docs/v9.9.0/data-cookbooks/about-cookbooks/index.html +++ b/docs/v9.9.0/data-cookbooks/about-cookbooks/index.html @@ -5,13 +5,13 @@ About Cookbooks | Cumulus Documentation - +
    Version: v9.9.0

    About Cookbooks

    Introduction

    The following data cookbooks are documents containing examples and explanations of workflows in the Cumulus framework. Additionally, the following data cookbooks should serve to help unify an institution/user group on a set of terms.

    Setup

The data cookbooks assume you can configure providers, collections, and rules to run workflows. Visit Cumulus data management types for information on how to configure Cumulus data management types.

    Adding a page

    As shown in detail in the "Add a New Page and Sidebars" section in Cumulus Docs: How To's, you can add a new page to the data cookbook by creating a markdown (.md) file in the docs/data-cookbooks directory. The new page can then be linked to the sidebar by adding it to the Data-Cookbooks object in the website/sidebar.json file as data-cookbooks/${id}.

    More about workflows

    Workflow general information

    Input & Output

    Developing Workflow Tasks

    Workflow Configuration How-to's

    - + \ No newline at end of file diff --git a/docs/v9.9.0/data-cookbooks/browse-generation/index.html b/docs/v9.9.0/data-cookbooks/browse-generation/index.html index afe96d495fb..1de234d6272 100644 --- a/docs/v9.9.0/data-cookbooks/browse-generation/index.html +++ b/docs/v9.9.0/data-cookbooks/browse-generation/index.html @@ -5,7 +5,7 @@ Ingest Browse Generation | Cumulus Documentation - + @@ -15,7 +15,7 @@ provider keys with the previously entered values) Note that you need to set the "provider_path" to the path on your bucket (e.g. "/data") that you've staged your mock/test data.:

    {
    "name": "TestBrowseGeneration",
    "workflow": "DiscoverGranulesBrowseExample",
    "provider": "{{provider_from_previous_step}}",
    "collection": {
    "name": "MOD09GQ",
    "version": "006"
    },
    "meta": {
    "provider_path": "{{path_to_data}}"
    },
    "rule": {
    "type": "onetime"
    },
    "state": "ENABLED",
    "updatedAt": 1553053438767
    }

    Run Workflows

    Once you've configured the Collection and Provider and added a onetime rule, you're ready to trigger your rule, and watch the ingest workflows process.

    Go to the Rules tab, click the rule you just created:

    Screenshot of the Rules overview page with a list of rules in the Cumulus dashboard

    Then click the gear in the upper right corner and click "Rerun":

    Screenshot of clicking the button to rerun a workflow rule from the rule edit page in the Cumulus dashboard

    Tab over to executions and you should see the DiscoverGranulesBrowseExample workflow run, succeed, and then moments later the CookbookBrowseExample should run and succeed.

    Screenshot of page listing executions in the Cumulus dashboard

    Results

    You can verify your data has ingested by clicking the successful workflow entry:

    Screenshot of individual entry from table listing executions in the Cumulus dashboard

    Select "Show Output" on the next page

    Screenshot of &quot;Show output&quot; button from individual execution page in the Cumulus dashboard

    and you should see in the payload from the workflow something similar to:

    "payload": {
    "process": "modis",
    "granules": [
    {
    "files": [
    {
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "filepath": "MOD09GQ___006/2017/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "type": "data",
    "bucket": "cumulus-test-sandbox-protected",
    "filename": "s3://cumulus-test-sandbox-protected/MOD09GQ___006/2017/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "time": 1553027415000,
    "path": "data",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}/{substring(file.name, 0, 3)}",
    "duplicate_found": true,
    "size": 1908635
    },
    {
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "filepath": "MOD09GQ___006/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "type": "metadata",
    "bucket": "cumulus-test-sandbox-private",
    "filename": "s3://cumulus-test-sandbox-private/MOD09GQ___006/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "time": 1553027412000,
    "path": "data",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{substring(file.name, 0, 3)}",
    "duplicate_found": true,
    "size": 21708
    },
    {
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
    "filepath": "MOD09GQ___006/2017/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
    "type": "browse",
    "bucket": "cumulus-test-sandbox-protected",
    "filename": "s3://cumulus-test-sandbox-protected/MOD09GQ___006/2017/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
    "time": 1553027415000,
    "path": "data",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}/{substring(file.name, 0, 3)}",
    "duplicate_found": true,
    "size": 1908635
    },
    {
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.cmr.xml",
    "filepath": "MOD09GQ___006/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.cmr.xml",
    "type": "metadata",
    "bucket": "cumulus-test-sandbox-protected-2",
    "filename": "s3://cumulus-test-sandbox-protected-2/MOD09GQ___006/MOD/MOD09GQ.A2016358.h13v04.006.2016360104606.cmr.xml",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{substring(file.name, 0, 3)}"
    }
    ],
    "cmrLink": "https://cmr.uat.earthdata.nasa.gov/search/granules.json?concept_id=G1222231611-CUMULUS",
    "cmrConceptId": "G1222231611-CUMULUS",
    "granuleId": "MOD09GQ.A2016358.h13v04.006.2016360104606",
    "cmrMetadataFormat": "echo10",
    "dataType": "MOD09GQ",
    "version": "006",
    "published": true
    }
    ]
    }

You can verify the granules exist within your cumulus instance (search using the Granules interface, check the S3 buckets, etc.) and validate that the above CMR entry exists.


    Build Processing Lambda

    This section discusses the construction of a custom processing lambda to replace the contrived example from this entry for a real dataset processing task.

    To ingest your own data using this example, you will need to construct your own lambda to replace the source in ProcessingStep that will generate browse imagery and provide or update a CMR metadata export file.

    You will then need to add the lambda to your Cumulus deployment as a aws_lambda_function Terraform resource.

    The discussion below outlines requirements for this lambda.

    Inputs

    The incoming message to the task defined in the ProcessingStep as configured will have the following configuration values (accessible inside event.config courtesy of the message adapter):

    Configuration

    • event.config.bucket -- the name of the bucket configured in terraform.tfvars as your internal bucket.

    • event.config.collection -- The full collection object we will configure in the Configure Ingest section. You can view the expected collection schema in the docs here or in the source code on github. You need this as available input and output so you can update as needed.

    event.config.additionalUrls, generateFakeBrowse and event.config.cmrMetadataFormat from the example can be ignored as they're configuration flags for the provided example script.

    Payload

    The 'payload' from the previous task is accessible via event.input. The expected payload output schema from SyncGranules can be viewed here.

    In our example, the payload would look like the following. Note: The types are set per-file based on what we configured in our collection, and were initially added as part of the DiscoverGranules step in the DiscoverGranulesBrowseExample workflow.

     "payload": {
    "process": "modis",
    "granules": [
    {
    "granuleId": "MOD09GQ.A2016358.h13v04.006.2016360104606",
    "dataType": "MOD09GQ",
    "version": "006",
    "files": [
    {
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "type": "data",
    "bucket": "cumulus-test-sandbox-internal",
    "filename": "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "fileStagingDir": "file-staging/jk2/MOD09GQ___006",
    "time": 1553027415000,
    "path": "data",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}/{substring(file.name, 0, 3)}",
    "size": 1908635
    },
    {
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "type": "metadata",
    "bucket": "cumulus-test-sandbox-internal",
    "filename": "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "fileStagingDir": "file-staging/jk2/MOD09GQ___006",
    "time": 1553027412000,
    "path": "data",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{substring(file.name, 0, 3)}",
    "size": 21708
    }
    ]
    }
    ]
    }

    Generating Browse Imagery

The example script provided goes through all granules and adds a 'fake' .jpg browse file to the same staging location as the data staged by prior ingest tasks.

    The processing lambda you construct will need to do the following:

    • Create a browse image file based on the input data, and stage it to a location accessible to both this task and the FilesToGranules and MoveGranules tasks in a S3 bucket.
    • Add the browse file to the input granule files, making sure to set the granule file's type to browse.
    • Update meta.input_granules with the updated granules list, as well as provide the files to be integrated by FilesToGranules as output from the task.
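
A minimal sketch of such a processing Lambda, assuming the Cumulus Message Adapter has already placed the SyncGranule output at event.input and using a hypothetical generateBrowse helper to create and stage the image, might look like:

const path = require('path');

// Hypothetical helper: creates a browse image from the staged data file,
// uploads it to S3, and returns the resulting s3:// URI.
const generateBrowse = require('./generate-browse');

exports.handler = async (event) => {
  const granules = event.input.granules;
  const files = [];

  for (const granule of granules) {
    const dataFile = granule.files.find((f) => f.type === 'data');

    // Stage a browse image alongside the previously staged data file.
    const browseUri = await generateBrowse(dataFile.filename);
    granule.files.push({
      name: path.basename(browseUri),
      type: 'browse',
      bucket: dataFile.bucket,
      filename: browseUri,
      fileStagingDir: dataFile.fileStagingDir
    });

    // Collect every staged file location for FilesToGranules/MoveGranules.
    granule.files.forEach((f) => files.push(f.filename));
  }

  // "granules" is mapped to meta.input_granules and "files" to the payload
  // by the workflow's cumulus_message output configuration.
  return { granules, files };
};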

    Generating/updating CMR metadata

    If you do not already have a CMR file in the granules list, you will need to generate one for valid export. This example's processing script generates and adds it to the FilesToGranules file list via the payload but it can be present in the InputGranules from the DiscoverGranules task as well if you'd prefer to pre-generate it.

The downstream tasks MoveGranules, UpdateGranulesCmrMetadataFileLinks, and PostToCmr all expect a valid CMR file to be available if you want to export to CMR.

    Expected Outputs for processing task/tasks

    In the above example, the critical portion of the output to FilesToGranules is the payload and meta.input_granules.

In the example provided, the processing task is set up to return an object with the keys "files" and "granules". In the cumulus_message configuration, the files output is mapped to the payload and the granules output is mapped to meta.input_granules:

              "task_config": {
    "inputGranules": "{$.meta.input_granules}",
    "granuleIdExtraction": "{$.meta.collection.granuleIdExtraction}"
    }
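
For reference, a processing-step task_config that performs the mapping described above could look like the following sketch (the exact configuration keys in your own workflow may differ):

"task_config": {
  "bucket": "{$.meta.buckets.internal.name}",
  "collection": "{$.meta.collection}",
  "cumulus_message": {
    "outputs": [
      {
        "source": "{$.granules}",
        "destination": "{$.meta.input_granules}"
      },
      {
        "source": "{$.files}",
        "destination": "{$.payload}"
      }
    ]
  }
}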

    Their expected values from the example above may be useful in constructing a processing task:

    payload

    The payload includes a full list of files to be 'moved' into the cumulus archive. The FilesToGranules task will take this list, merge it with the information from InputGranules, then pass that list to the MoveGranules task. The MoveGranules task will then move the files to their targets. The UpdateGranulesCmrMetadataFileLinks task will update the CMR metadata file if it exists with the updated granule locations and update the CMR file etags.

    In the provided example, a payload being passed to the FilesToGranules task should be expected to look like:

      "payload": [
    "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
    "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.cmr.xml"
    ]

This list is the list of staged files that FilesToGranules will act upon to add/merge with the input_granules object.

    The pathing is generated from sync-granules, but in principle the files can be staged wherever you like so long as the processing/MoveGranules task's roles have access and the filename matches the collection configuration.

    input_granules

The FilesToGranules task utilizes the incoming payload to choose which files to move, but pulls all other metadata from meta.input_granules. As such, the output payload in the example would look like:

    "input_granules": [
    {
    "granuleId": "MOD09GQ.A2016358.h13v04.006.2016360104606",
    "dataType": "MOD09GQ",
    "version": "006",
    "files": [
    {
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "type": "data",
    "bucket": "cumulus-test-sandbox-internal",
    "filename": "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "fileStagingDir": "file-staging/jk2/MOD09GQ___006",
    "time": 1553027415000,
    "path": "data",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}/{substring(file.name, 0, 3)}",
    "duplicate_found": true,
    "size": 1908635
    },
    {
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "type": "metadata",
    "bucket": "cumulus-test-sandbox-internal",
    "filename": "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf.met",
    "fileStagingDir": "file-staging/jk2/MOD09GQ___006",
    "time": 1553027412000,
    "path": "data",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{substring(file.name, 0, 3)}",
    "duplicate_found": true,
    "size": 21708
    },
    {
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
    "type": "browse",
    "bucket": "cumulus-test-sandbox-internal",
    "filename": "s3://cumulus-test-sandbox-internal/file-staging/jk2/MOD09GQ___006/MOD09GQ.A2016358.h13v04.006.2016360104606.jpg",
    "fileStagingDir": "file-staging/jk2/MOD09GQ___006",
    "time": 1553027415000,
    "path": "data",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}___{cmrMetadata.Granule.Collection.VersionId}/{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}/{substring(file.name, 0, 3)}",
    "duplicate_found": true,
    }
    ]
    }
    ],
    - + \ No newline at end of file diff --git a/docs/v9.9.0/data-cookbooks/choice-states/index.html b/docs/v9.9.0/data-cookbooks/choice-states/index.html index 4b203f59815..627df1af621 100644 --- a/docs/v9.9.0/data-cookbooks/choice-states/index.html +++ b/docs/v9.9.0/data-cookbooks/choice-states/index.html @@ -5,13 +5,13 @@ Choice States | Cumulus Documentation - +
    Version: v9.9.0

    Choice States

    Cumulus supports AWS Step Function Choice states. A Choice state enables branching logic in Cumulus workflows.

    Choice state definitions include a list of Choice Rules. Each Choice Rule defines a logical operation which compares an input value against a value using a comparison operator. For available comparison operators, review the AWS docs.

    If the comparison evaluates to true, the Next state is followed.

    Example

    In examples/cumulus-tf/parse_pdr_workflow.tf the ParsePdr workflow uses a Choice state, CheckAgainChoice, to terminate the workflow once meta.isPdrFinished: true is returned by the CheckStatus state.

    The CheckAgainChoice state definition requires an input object of the following structure:

    {
    "meta": {
    "isPdrFinished": false
    }
    }

    Given the above input to the CheckAgainChoice state, the workflow would transition to the PdrStatusReport state.

    "CheckAgainChoice": {
    "Type": "Choice",
    "Choices": [
    {
    "Variable": "$.meta.isPdrFinished",
    "BooleanEquals": false,
    "Next": "PdrStatusReport"
    },
    {
    "Variable": "$.meta.isPdrFinished",
    "BooleanEquals": true,
    "Next": "WorkflowSucceeded"
    }
    ],
    "Default": "WorkflowSucceeded"
    }

    Advanced: Loops in Cumulus Workflows

    Understanding the complete ParsePdr workflow is not necessary to understanding how Choice states work, but ParsePdr provides an example of how Choice states can be used to create a loop in a Cumulus workflow.

    In the complete ParsePdr workflow definition, the state QueueGranules is followed by CheckStatus. From CheckStatus a loop starts: Given CheckStatus returns meta.isPdrFinished: false, CheckStatus is followed by CheckAgainChoice is followed by PdrStatusReport is followed by WaitForSomeTime, which returns to CheckStatus. Once CheckStatus returns meta.isPdrFinished: true, CheckAgainChoice proceeds to WorkflowSucceeded.

    Execution graph of SIPS ParsePdr workflow in AWS Step Functions console

    Further documentation

    For complete details on Choice state configuration options, see the Choice state documentation.

    - + \ No newline at end of file diff --git a/docs/v9.9.0/data-cookbooks/cnm-workflow/index.html b/docs/v9.9.0/data-cookbooks/cnm-workflow/index.html index 04661fda9a8..2c3483e13a4 100644 --- a/docs/v9.9.0/data-cookbooks/cnm-workflow/index.html +++ b/docs/v9.9.0/data-cookbooks/cnm-workflow/index.html @@ -5,7 +5,7 @@ CNM Workflow | Cumulus Documentation - + @@ -13,7 +13,7 @@
    Version: v9.9.0

    CNM Workflow

    This entry documents how to setup a workflow that utilizes the built-in CNM/Kinesis functionality in Cumulus.

    Prior to working through this entry you should be familiar with the Cloud Notification Mechanism.

    Sections


    Prerequisites

    Cumulus

    This entry assumes you have a deployed instance of Cumulus (version >= 1.16.0). The entry assumes you are deploying Cumulus via the cumulus terraform module sourced from the release page.

    AWS CLI

    This entry assumes you have the AWS CLI installed and configured. If you do not, please take a moment to review the documentation - particularly the examples relevant to Kinesis - and install it now.

    Kinesis

This entry assumes you already have two Kinesis data streams created for use as CNM notification and response data streams.

    If you do not have two streams setup, please take a moment to review the Kinesis documentation and setup two basic single-shard streams for this example:

    Using the "Create Data Stream" button on the Kinesis Dashboard, work through the dialogue.

    You should be able to quickly use the "Create Data Stream" button on the Kinesis Dashboard, and setup streams that are similar to the following example:

    Screenshot of AWS console page for creating a Kinesis stream
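
If you prefer the CLI, single-shard streams like these can also be created with commands similar to the following (the stream names are placeholders):

aws kinesis create-stream --stream-name my-cnm-notification-stream --shard-count 1
aws kinesis create-stream --stream-name my-cnm-response-stream --shard-count 1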

    Please bear in mind that your {{prefix}}-lambda-processing IAM role will need permissions to write to the response stream for this workflow to succeed if you create the Kinesis stream with a dashboard user. If you are using the cumulus top-level module for your deployment this should be set properly.

If not, the most straightforward approach is to attach the AmazonKinesisFullAccess policy for the stream resource to whatever role your Lambdas are using; however, your environment/security policies may require an approach specific to your deployment environment.

    In operational environments it's likely science data providers would typically be responsible for providing a Kinesis stream with the appropriate permissions.

    For more information on how this process works and how to develop a process that will add records to a stream, read the Kinesis documentation and the developer guide.

    Source Data

    This entry will run the SyncGranule task against a single target data file. To that end it will require a single data file to be present in an S3 bucket matching the Provider configured in the next section.

    Collection and Provider

    Cumulus will need to be configured with a Collection and Provider entry of your choosing. The provider should match the location of the source data from the Ingest Source Data section.

    This can be done via the Cumulus Dashboard if installed or the API. It is strongly recommended to use the dashboard if possible.


    Configure the Workflow

    Provided the prerequisites have been fulfilled, you can begin adding the needed values to your Cumulus configuration to configure the example workflow.

    The following are steps that are required to set up your Cumulus instance to run the example workflow:

    Example CNM Workflow

    In this example, we're going to trigger a workflow by creating a Kinesis rule and sending a record to a Kinesis stream.

    The following workflow definition should be added to a new .tf workflow resource (e.g. cnm_workflow.tf) in your deployment directory. For the complete CNM workflow example, see examples/cumulus-tf/kinesis_trigger_test_workflow.tf.

    Add the following to the new terraform file in your deployment directory, updating the following:

    • Set the response-endpoint key in the CnmResponse task in the workflow JSON to match the name of the Kinesis response stream you configured in the prerequisites section
    • Update the source key to the workflow module to match the Cumulus release associated with your deployment.
    module "cnm_workflow" {
    source = "https://github.com/nasa/cumulus/releases/download/{version}/terraform-aws-cumulus-workflow.zip"

    prefix = var.prefix
    name = "CNMExampleWorkflow"
    workflow_config = module.cumulus.workflow_config
    system_bucket = var.system_bucket

state_machine_definition = <<JSON
{
    "Comment": "CNMExampleWorkflow",
    "StartAt": "TranslateMessage",
    "States": {
    "TranslateMessage": {
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_to_cma_task.arn}",
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "collection": "{$.meta.collection}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.cnm}",
    "destination": "{$.meta.cnm}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "CnmResponse"
    }
    ],
    "Next": "SyncGranule"
    },
    "SyncGranule": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "provider": "{$.meta.provider}",
    "buckets": "{$.meta.buckets}",
    "collection": "{$.meta.collection}",
    "downloadBucket": "{$.meta.buckets.private.name}",
    "stack": "{$.meta.stack}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.granules}",
    "destination": "{$.meta.input_granules}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${module.cumulus.sync_granule_task.task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "IntervalSeconds": 10,
    "MaxAttempts": 3
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "CnmResponse"
    }
    ],
    "Next": "CnmResponse"
    },
    "CnmResponse": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "OriginalCNM": "{$.meta.cnm}",
    "distribution_endpoint": "{$.meta.distribution_endpoint}",
    "response-endpoint": "ADD YOUR RESPONSE STREAM NAME HERE",
    "region": "us-east-1",
    "type": "kinesis",
    "WorkflowException": "{$.exception}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.cnm}",
    "destination": "{$.meta.cnmResponse}"
    },
    {
    "source": "{$.input.input}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_response_task.arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "IntervalSeconds": 5,
    "MaxAttempts": 3
    }
    ],
    "End": true
    }
    }
    }
JSON
}

    Again, please make sure to modify the value response-endpoint to match the stream name (not ARN) for your Kinesis response stream.

    Lambda Configuration

To execute this workflow, you're required to include several Lambda resources in your deployment. To do this, add the CNMToCMA and CnmResponse task (Lambda) definitions described below to your deployment along with the workflow you created above.

    Please note: To utilize these tasks you need to ensure you have a compatible CMA layer. See the deployment instructions for more details on how to deploy a CMA layer.

    Below is a description of each of these tasks:

    CNMToCMA

    CNMToCMA is meant for the beginning of a workflow: it maps CNM granule information to a payload for downstream tasks. For other CNM workflows, you would need to ensure that downstream tasks in your workflow either understand the CNM message or include a translation task like this one.

    You can also manipulate the data sent to downstream tasks using task_config for various states in your workflow resource configuration. Read more about how to configure data on the Workflow Input & Output page.

    CnmResponse

    The CnmResponse Lambda generates a CNM response message and puts it on the response-endpoint Kinesis stream.

    You can read more about the expected schema of a CnmResponse record in the Cloud Notification Mechanism schema repository.

    Additional Tasks

    Lastly, this entry also makes use of the SyncGranule task from the cumulus module.

    Redeploy

    Once the above configuration changes have been made, redeploy your stack.

    Please refer to Update Cumulus resources in the deployment documentation if you are unfamiliar with redeployment.

    Rule Configuration

    Cumulus includes a messageConsumer Lambda function (message-consumer). Cumulus kinesis-type rules create the event source mappings between Kinesis streams and the messageConsumer Lambda. The messageConsumer Lambda consumes records from one or more Kinesis streams, as defined by enabled kinesis-type rules. When new records are pushed to one of these streams, the messageConsumer triggers workflows associated with the enabled kinesis-type rules.

    To add a rule via the dashboard (if you'd like to use the API, see the docs here), navigate to the Rules page and click Add a rule, then configure the new rule using the following template (substituting correct values for parameters denoted by ${}):

    {
    "collection": {
    "name": "L2_HR_PIXC",
    "version": "000"
    },
    "name": "L2_HR_PIXC_kinesisRule",
    "provider": "PODAAC_SWOT",
    "rule": {
    "type": "kinesis",
    "value": "arn:aws:kinesis:{{awsRegion}}:{{awsAccountId}}:stream/{{streamName}}"
    },
    "state": "ENABLED",
    "workflow": "CNMExampleWorkflow"
    }

    Please Note:

• The rule's value attribute must match the Amazon Resource Name (ARN) for the Kinesis data stream you've preconfigured. You should be able to obtain this ARN from the Kinesis Dashboard entry for the selected stream.
    • The collection and provider should match the collection and provider you setup in the Prerequisites section.

    Once you've clicked on 'submit' a new rule should appear in the dashboard's Rule Overview.


    Execute the Workflow

    Once Cumulus has been redeployed and a rule has been added, we're ready to trigger the workflow and watch it execute.

    How to Trigger the Workflow

    To trigger matching workflows, you will need to put a record on the Kinesis stream that the message-consumer Lambda will recognize as a matching event. Most importantly, it should include a collection name that matches a valid collection.

    For the purpose of this example, the easiest way to accomplish this is using the AWS CLI.

    Create Record JSON

    Construct a JSON file containing an object that matches the values that have been previously setup. This JSON object should be a valid Cloud Notification Mechanism message.

    Please note: this example is somewhat contrived, as the downstream tasks don't care about most of these fields. A 'real' data ingest workflow would.

    The following values (denoted by ${} in the sample below) should be replaced to match values we've previously configured:

    • TEST_DATA_FILE_NAME: The filename of the test data that is available in the S3 (or other) provider we created earlier.
    • TEST_DATA_URI: The full S3 path to the test data (e.g. s3://bucket-name/path/granule)
    • COLLECTION: The collection name defined in the prerequisites for this product
    {
    "product": {
    "files": [
    {
    "checksumType": "md5",
    "name": "${TEST_DATA_FILE_NAME}",
    "checksum": "bogus_checksum_value",
    "uri": "${TEST_DATA_URI}",
    "type": "data",
    "size": 12345678
    }
    ],
    "name": "${TEST_DATA_FILE_NAME}",
    "dataVersion": "006"
    },
    "identifier ": "testIdentifier123456",
    "collection": "${COLLECTION}",
    "provider": "TestProvider",
    "version": "001",
    "submissionTime": "2017-09-30T03:42:29.791198"
    }

    Add Record to Kinesis Data Stream

    Using the JSON file you created, push it to the Kinesis notification stream:

    aws kinesis put-record --stream-name YOUR_KINESIS_NOTIFICATION_STREAM_NAME_HERE --partition-key 1 --data file:///path/to/file.json

    Please note: The above command uses the stream name, not the ARN.

    The command should return output similar to:

    {
    "ShardId": "shardId-000000000000",
    "SequenceNumber": "42356659532578640215890215117033555573986830588739321858"
    }

    This command will put a record containing the JSON from the --data flag onto the Kinesis data stream. The messageConsumer Lambda will consume the record and construct a valid CMA payload to trigger workflows. For this example, the record will trigger the CNMExampleWorkflow workflow as defined by the rule previously configured.

    You can view the current running executions on the Executions dashboard page which presents a list of all executions, their status (running, failed, or completed), to which workflow the execution belongs, along with other information.

    Verify Workflow Execution

As detailed above, once the record is added to the Kinesis data stream, the messageConsumer Lambda will trigger the CNMExampleWorkflow.

    TranslateMessage

    TranslateMessage (which corresponds to the CNMToCMA Lambda) will take the CNM object payload and add a granules object to the CMA payload that's consistent with other Cumulus ingest tasks, and add a meta.cnm key (as well as the payload) to store the original message.

    For more on the Message Adapter, please see the Message Flow documentation.

    An example of what is happening in the CNMToCMA Lambda is as follows:

    Example Input Payload:

    "payload": {
    "identifier ": "testIdentifier123456",
    "product": {
    "files": [
    {
    "checksumType": "md5",
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "checksum": "bogus_checksum_value",
    "uri": "s3://some_bucket/cumulus-test-data/pdrs/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "type": "data",
    "size": 12345678
    }
    ],
    "name": "TestGranuleUR",
    "dataVersion": "006"
    },
    "version": "123456",
    "collection": "MOD09GQ",
    "provider": "TestProvider",
    "submissionTime": "2017-09-30T03:42:29.791198"
    }

    Example Output Payload:

      "payload": {
    "cnm": {
    "identifier ": "testIdentifier123456",
    "product": {
    "files": [
    {
    "checksumType": "md5",
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "checksum": "bogus_checksum_value",
    "uri": "s3://some-bucket/cumulus-test-data/data/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "type": "data",
    "size": 12345678
    }
    ],
    "name": "TestGranuleUR",
    "dataVersion": "006"
    },
    "version": "123456",
    "collection": "MOD09GQ",
    "provider": "TestProvider",
    "submissionTime": "2017-09-30T03:42:29.791198",
    "receivedTime": "2017-09-30T03:42:31.634552"
    },
    "output": {
    "granules": [
    {
    "granuleId": "TestGranuleUR",
    "files": [
    {
    "path": "some-bucket/data",
    "url_path": "s3://some-bucket/cumulus-test-data/data/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "bucket": "some-bucket",
    "name": "MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
    "size": 12345678
    }
    ]
    }
    ]
    }
    }

    SyncGranules

    This Lambda will take the files listed in the payload and move them to s3://{deployment-private-bucket}/file-staging/{deployment-name}/{COLLECTION}/{file_name}.

    CnmResponse

    Assuming a successful execution of the workflow, this task will recover the meta.cnm key from the CMA output, and add a "SUCCESS" record to the notification Kinesis stream.

    If a prior step in the workflow has failed, this will add a "FAILURE" record to the stream instead.

    The data written to the response-endpoint should adhere to the Response Message Fields schema.

    Example CNM Success Response:

    {
    "provider": "PODAAC_SWOT",
    "collection": "SWOT_Prod_l2:1",
    "processCompleteTime": "2017-09-30T03:45:29.791198",
    "submissionTime": "2017-09-30T03:42:29.791198",
    "receivedTime": "2017-09-30T03:42:31.634552",
    "identifier": "1234-abcd-efg0-9876",
    "response": {
    "status": "SUCCESS"
    }
    }

    Example CNM Error Response:

    {
    "provider": "PODAAC_SWOT",
    "collection": "SWOT_Prod_l2:1",
    "processCompleteTime": "2017-09-30T03:45:29.791198",
    "submissionTime": "2017-09-30T03:42:29.791198",
    "receivedTime": "2017-09-30T03:42:31.634552",
    "identifier": "1234-abcd-efg0-9876",
    "response": {
    "status": "FAILURE",
    "errorCode": "PROCESSING_ERROR",
    "errorMessage": "File [cumulus-dev-a4d38f59-5e57-590c-a2be-58640db02d91/prod_20170926T11:30:36/production_file.nc] did not match gve checksum value."
    }
    }

    Note the CnmResponse state defined in the .tf workflow definition above configures $.exception to be passed to the CnmResponse Lambda keyed under config.WorkflowException. This is required for the CnmResponse code to deliver a failure response.

    To test the failure scenario, send a record missing the product.name key.


    Verify results

    Check for successful execution on the dashboard

    Following the successful execution of this workflow, you should expect to see the workflow complete successfully on the dashboard:

    Screenshot of a successful CNM workflow appearing on the executions page of the Cumulus dashboard

    Check the test granule has been delivered to S3 staging

    The test granule identified in the Kinesis record should be moved to the deployment's private staging area.

    Check for Kinesis records

    A SUCCESS notification should be present on the response-endpoint Kinesis stream.

    You should be able to validate that the notification and response streams have the expected records with the following steps (the AWS CLI Kinesis Basic Stream Operations documentation is useful to review before proceeding):

    Get a shard iterator (substituting your stream name as appropriate):

    aws kinesis get-shard-iterator \
    --shard-id shardId-000000000000 \
    --shard-iterator-type LATEST \
    --stream-name NOTIFICATION_OR_RESPONSE_STREAM_NAME

    which should return output similar to:

    {
    "ShardIterator": "VeryLongString=="
    }
    • Re-trigger the workflow by re-running the put-record command used earlier to trigger it.
    • As the workflow completes, use the output from the get-shard-iterator command to request data from the stream:
    aws kinesis get-records --shard-iterator SHARD_ITERATOR_VALUE

    This should result in output similar to:

    {
    "Records": [
    {
    "SequenceNumber": "49586720336541656798369548102057798835250389930873978882",
    "ApproximateArrivalTimestamp": 1532664689.128,
    "Data": "eyJpZGVudGlmaWVyICI6InRlc3RJZGVudGlmaWVyMTIzNDU2IiwidmVyc2lvbiI6IjAwNiIsImNvbGxlY3Rpb24iOiJNT0QwOUdRIiwicHJvdmlkZXIiOiJUZXN0UHJvdmlkZXIiLCJwcm9kdWN0U2l6ZSI6MTkwODYzNS4wLCJyZXNwb25zZSI6eyJzdGF0dXMiOiJTVUNDRVNTIn0sInByb2Nlc3NDb21wbGV0ZVRpbWUiOiIyMDE4LTA3LTI3VDA0OjExOjI4LjkxOSJ9",
    "PartitionKey": "1"
    },
    {
    "SequenceNumber": "49586720336541656798369548102059007761070005796999266306",
    "ApproximateArrivalTimestamp": 1532664707.149,
    "Data": "eyJpZGVudGlmaWVyICI6InRlc3RJZGVudGlmaWVyMTIzNDU2IiwidmVyc2lvbiI6IjAwNiIsImNvbGxlY3Rpb24iOiJNT0QwOUdRIiwicHJvdmlkZXIiOiJUZXN0UHJvdmlkZXIiLCJwcm9kdWN0U2l6ZSI6MTkwODYzNS4wLCJyZXNwb25zZSI6eyJzdGF0dXMiOiJTVUNDRVNTIn0sInByb2Nlc3NDb21wbGV0ZVRpbWUiOiIyMDE4LTA3LTI3VDA0OjExOjQ2Ljk1OCJ9",
    "PartitionKey": "1"
    }
    ],
    "NextShardIterator": "AAAAAAAAAAFo9SkF8RzVYIEmIsTN+1PYuyRRdlj4Gmy3dBzsLEBxLo4OU+2Xj1AFYr8DVBodtAiXbs3KD7tGkOFsilD9R5tA+5w9SkGJZ+DRRXWWCywh+yDPVE0KtzeI0andAXDh9yTvs7fLfHH6R4MN9Gutb82k3lD8ugFUCeBVo0xwJULVqFZEFh3KXWruo6KOG79cz2EF7vFApx+skanQPveIMz/80V72KQvb6XNmg6WBhdjqAA==",
    "MillisBehindLatest": 0
    }

    Note that the Data field is base64-encoded and must be decoded to be human readable. There are many options for building a Kinesis consumer, such as the KCL.
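For a quick look without building a consumer, the Data value can be decoded directly (the example below uses the first record from the output above and assumes a base64 tool that accepts --decode, such as GNU coreutils):

echo 'eyJpZGVudGlmaWVyICI6InRlc3RJZGVudGlmaWVyMTIzNDU2IiwidmVyc2lvbiI6IjAwNiIsImNvbGxlY3Rpb24iOiJNT0QwOUdRIiwicHJvdmlkZXIiOiJUZXN0UHJvdmlkZXIiLCJwcm9kdWN0U2l6ZSI6MTkwODYzNS4wLCJyZXNwb25zZSI6eyJzdGF0dXMiOiJTVUNDRVNTIn0sInByb2Nlc3NDb21wbGV0ZVRpbWUiOiIyMDE4LTA3LTI3VDA0OjExOjI4LjkxOSJ9' | base64 --decode

which yields a JSON CNM response record with a "SUCCESS" status.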

    For purposes of validating the workflow, it may be simpler to locate the execution in the Step Functions Management Console and confirm that the expected output is similar to the examples below.

    Successful CNM Response Object Example:

    {
    "cnmResponse": {
    "provider": "TestProvider",
    "collection": "MOD09GQ",
    "version": "123456",
    "processCompleteTime": "2017-09-30T03:45:29.791198",
    "submissionTime": "2017-09-30T03:42:29.791198",
    "receivedTime": "2017-09-30T03:42:31.634552",
    "identifier ": "testIdentifier123456",
    "response": {
    "status": "SUCCESS"
    }
    }
    }

    Kinesis Record Error Handling

    messageConsumer

    The default Kinesis stream processing in the Cumulus system is configured for record error tolerance.

    When the messageConsumer fails to process a record, the failure is captured and the record is published to the kinesisFallback SNS Topic. The kinesisFallback SNS topic broadcasts the record and a subscribed copy of the messageConsumer Lambda named kinesisFallback consumes these failures.

    At this point, the normal Lambda asynchronous invocation retry behavior will attempt to process the record 3 more times. After this, if the record cannot successfully be processed, it is written to a dead letter queue. Cumulus' dead letter queue is an SQS Queue named kinesisFailure. Operators can use this queue to inspect failed records.

    This system ensures that when the messageConsumer fails to process a record and trigger a workflow, the record is retried 3 times. This retry behavior improves system reliability in the case of an external service failure outside of Cumulus' control.

    The Kinesis error handling system - the kinesisFallback SNS topic, messageConsumer Lambda, and kinesisFailure SQS queue - comes with the API package and does not need to be configured by the operator.

    To examine records that could not be processed at any step, look at the dead letter queue {{prefix}}-kinesisFailure in the Simple Queue Service (SQS) console. Select your queue and, under the Queue Actions tab, choose View/Delete Messages. Start polling for messages and you will see the records that failed to process through the messageConsumer.
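The same inspection can be scripted with the AWS CLI (substitute your deployment prefix for {{prefix}}, and use the returned queue URL in the second command):

# Look up the dead letter queue URL, then poll it for failed records
aws sqs get-queue-url --queue-name {{prefix}}-kinesisFailure
aws sqs receive-message \
--queue-url <QUEUE_URL_FROM_PREVIOUS_COMMAND> \
--max-number-of-messages 10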

    Note that these are only failures that occurred while processing records from Kinesis streams. Workflow failures are handled differently.

    Kinesis Stream logging

    Notification Stream messages

    Cumulus includes two Lambdas (KinesisInboundEventLogger and KinesisOutboundEventLogger) that utilize the same code to take a Kinesis record event as input, deserialize the data field and output the modified event to the logs.

    When a Kinesis rule is created, in addition to the messageConsumer event mapping, an event mapping is created to trigger KinesisInboundEventLogger to record a log of the inbound record, allowing for analysis in case of unexpected failure.

    Response Stream messages

    Cumulus also supports this feature for all outbound messages. To take advantage of this feature, you will need to set an event mapping on the KinesisOutboundEventLogger Lambda that targets your response-endpoint. You can do this in the Lambda management page for KinesisOutboundEventLogger. Add a Kinesis trigger, and configure it to target the cnmResponseStream for your workflow:

    Screenshot of the AWS console showing configuration for Kinesis stream trigger on KinesisOutboundEventLogger Lambda

    Once this is done, all records sent to the response-endpoint will also be logged in CloudWatch. For more on configuring Lambdas to trigger on Kinesis events, please see creating an event source mapping.
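If you prefer to script this rather than use the console, the same trigger can be created with the AWS CLI; a sketch with placeholder function and stream names:

aws lambda create-event-source-mapping \
--function-name <prefix>-KinesisOutboundEventLogger \
--event-source-arn arn:aws:kinesis:us-east-1:123456789012:stream/<prefix>-cnmResponseStream \
--starting-position LATEST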

    Error Handling in Workflows

    Service Exception. See this documentation on configuring your workflow to handle transient lambda errors.

    Example state machine definition:

    {
    "Comment": "Tests Workflow from Kinesis Stream",
    "StartAt": "TranslateMessage",
    "States": {
    "TranslateMessage": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.cnm}",
    "destination": "{$.meta.cnm}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_to_cma_task.arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": ["States.ALL"],
    "ResultPath": "$.exception",
    "Next": "CnmResponseFail"
    }
    ],
    "Next": "SyncGranule"
    },
    "SyncGranule": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "Path": "$.payload",
    "TargetPath": "$.payload"
    },
    "task_config": {
    "provider": "{$.meta.provider}",
    "buckets": "{$.meta.buckets}",
    "collection": "{$.meta.collection}",
    "downloadBucket": "{$.meta.buckets.private.name}",
    "stack": "{$.meta.stack}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$.granules}",
    "destination": "{$.meta.input_granules}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${module.cumulus.sync_granule_task.task_arn}",
    "Retry": [
    {
    "ErrorEquals": ["States.ALL"],
    "IntervalSeconds": 10,
    "MaxAttempts": 3
    }
    ],
    "Catch": [
    {
    "ErrorEquals": ["States.ALL"],
    "ResultPath": "$.exception",
    "Next": "CnmResponseFail"
    }
    ],
    "Next": "CnmResponse"
    },
    "CnmResponse": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "OriginalCNM": "{$.meta.cnm}",
    "CNMResponseStream": "{$.meta.cnmResponseStream}",
    "region": "us-east-1",
    "WorkflowException": "{$.exception}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$}",
    "destination": "{$.meta.cnmResponse}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_response_task.arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": ["States.ALL"],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "Next": "WorkflowSucceeded"
    },
    "CnmResponseFail": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "OriginalCNM": "{$.meta.cnm}",
    "CNMResponseStream": "{$.meta.cnmResponseStream}",
    "region": "us-east-1",
    "WorkflowException": "{$.exception}",
    "cumulus_message": {
    "outputs": [
    {
    "source": "{$}",
    "destination": "{$.meta.cnmResponse}"
    },
    {
    "source": "{$}",
    "destination": "{$.payload}"
    }
    ]
    }
    }
    }
    },
    "Type": "Task",
    "Resource": "${aws_lambda_function.cnm_response_task.arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": ["States.ALL"],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "Next": "WorkflowFailed"
    },
    "WorkflowSucceeded": {
    "Type": "Succeed"
    },
    "WorkflowFailed": {
    "Type": "Fail",
    "Cause": "Workflow failed"
    }
    }
    }

    The above results in a workflow which is visualized in the diagram below:

    Screenshot of a visualization of an AWS Step Function workflow definition with branching logic for failures

    Summary

    Error handling should (mostly) be the domain of workflow configuration.

    Version: v9.9.0

    HelloWorld Workflow

    Example task meant to be a sanity check/introduction to the Cumulus workflows.

    Pre-Deployment Configuration

    Workflow Configuration

    A workflow definition can be found in the template repository hello_world_workflow module.

    {
    "Comment": "Returns Hello World",
    "StartAt": "HelloWorld",
    "States": {
    "HelloWorld": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "buckets": "{$.meta.buckets}",
    "provider": "{$.meta.provider}",
    "collection": "{$.meta.collection}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${module.cumulus.hello_world_task.task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "End": true
    }
    }
    }

    Workflow error-handling can be configured as discussed in the Error-Handling cookbook.

    Task Configuration

    The HelloWorld task is provided for you as part of the cumulus terraform module; no configuration is needed.

    If you want to manually deploy your own version of this Lambda for testing, you can copy the Lambda resource definition located in the Cumulus source code at cumulus/tf-modules/ingest/hello-world-task.tf. The Lambda source code itself is located at cumulus/tasks/hello-world.

    Execution

    We will focus on using the Cumulus dashboard to schedule the execution of a HelloWorld workflow.

    Our goal here is to create a rule through the Cumulus dashboard that will define the scheduling and execution of our HelloWorld workflow. Let's navigate to the Rules page and click Add a rule.

    {
    "collection": { # collection values can be configured and found on the Collections page
    "name": "${collection_name}",
    "version": "${collection_version}"
    },
    "name": "helloworld_rule",
    "provider": "${provider}", # found on the Providers page
    "rule": {
    "type": "onetime"
    },
    "state": "ENABLED",
    "workflow": "HelloWorldWorkflow" # This can be found on the Workflows page
    }

    Screenshot of AWS Step Function execution graph for the HelloWorld workflow, as seen in the AWS Console

    Output/Results

    The Executions page presents a list of all executions, their status (running, failed, or completed), to which workflow the execution belongs, along with other information. The rule defined in the previous section should start an execution of its own accord, and the status of that execution can be tracked here.

    To get some deeper information on the execution, click on the value in the Name column of your execution of interest. This should bring up a visual representation of the workflow similar to that shown above, execution details, and a list of events.

    Summary

    Setting up the HelloWorld workflow on the Cumulus dashboard is the tip of the iceberg, so to speak. The task and step-function need to be configured before Cumulus deployment. A compatible collection and provider must be configured and applied to the rule. Finally, workflow execution status can be viewed via the workflows tab on the dashboard.

    Version: v9.9.0

    Ingest Notification in Workflows

    On deployment, an SQS queue and three SNS topics are created and used for handling notification messages related to the workflow.

    The sfEventSqsToDbRecords Lambda function reads from the sfEventSqsToDbRecordsInputQueue queue and updates DynamoDB. The DynamoDB events for the ExecutionsTable, GranulesTable and PdrsTable are streamed on DynamoDBStreams, which are read by the publishExecutions, publishGranules and publishPdrs Lambda functions, respectively.

    These Lambda functions publish to the three SNS topics both when the workflow starts and when it reaches a terminal state (completion or failure). The following describes how many message(s) each topic receives both on workflow start and workflow completion/failure:

    • reportExecutions - Receives 1 message per workflow execution
    • reportGranules - Receives 1 message per granule in a workflow execution
    • reportPdrs - Receives 1 message per PDR

    Diagram of architecture for reporting workflow ingest notifications from AWS Step Functions

    The ingest notification reporting SQS queue is populated via a Cloudwatch rule for any Step Function execution state transitions. The sfEventSqsToDbRecords Lambda consumes this queue. The queue and Lambda are included in the cumulus module and the Cloudwatch rule in the workflow module and are included by default in a Cumulus deployment.

    Sending SQS messages to report status

    Publishing granule/PDR reports directly to the SQS queue

    If you have a non-Cumulus workflow or process ingesting data and would like to update the status of your granules or PDRs, you can publish directly to the reporting SQS queue. Publishing messages to this queue will result in those messages being stored as granule/PDR records in the Cumulus database, making the status of those granules/PDRs visible on the Cumulus dashboard. The queue does have certain expectations of the message format: it expects a Cumulus Message nested within a Cloudwatch Step Function Event object.

    Posting directly to the queue will require knowing the queue URL. Assuming that you are using the cumulus module for your deployment, you can get the queue URL (and the report topic ARNs) by adding them to outputs.tf for your Terraform deployment, as in our example deployment:

    output "stepfunction_event_reporter_queue_url" {
    value = module.cumulus.stepfunction_event_reporter_queue_url
    }

    output "report_executions_sns_topic_arn" {
    value = module.cumulus.report_executions_sns_topic_arn
    }
    output "report_granules_sns_topic_arn" {
    value = module.cumulus.report_granules_sns_topic_arn
    }
    output "report_pdrs_sns_topic_arn" {
    value = module.cumulus.report_pdrs_sns_topic_arn
    }

    Then, when you run terraform apply, you should see the queue URL and topic ARNs printed to your console:

    Outputs:
    ...
    stepfunction_event_reporter_queue_url = https://sqs.us-east-1.amazonaws.com/xxxxxxxxx/<prefix>-sfEventSqsToDbRecordsInputQueue
    report_executions_sns_topic_arn = arn:aws:sns:us-east-1:xxxxxxxxx:<prefix>-report-executions-topic
    report_granules_sns_topic_arn = arn:aws:sns:us-east-1:xxxxxxxxx:<prefix>-report-granules-topic
    report_pdrs_sns_topic_arn = arn:aws:sns:us-east-1:xxxxxxxxx:<prefix>-report-pdrs-topic

    Once you have the queue URL, you can use the AWS SDK for your language of choice to publish messages to the queue. The expected format of these messages is that of a Cloudwatch Step Function event containing a Cumulus message. For SUCCEEDED events, the Cumulus message is expected to be in detail.output. For all other event statuses, a Cumulus Message is expected in detail.input. The Cumulus Message populating these fields MUST be a JSON string, not an object. Messages that do not conform to the schemas will fail to be created as records.
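A minimal sketch of posting such an event with the AWS CLI (the queue URL is the output value from above, and status-event.json is a hypothetical local file containing a Cloudwatch Step Function event whose detail.input or detail.output holds the Cumulus Message as a JSON string):

aws sqs send-message \
--queue-url https://sqs.us-east-1.amazonaws.com/xxxxxxxxx/<prefix>-sfEventSqsToDbRecordsInputQueue \
--message-body file://status-event.json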

    If you are not seeing records persist to the database or show up in the Cumulus dashboard, you can investigate the Cloudwatch logs of the SQS consumer Lambda:

    • /aws/lambda/<prefix>-sfEventSqsToDbRecords

    In a workflow

    As described above, ingest notifications will automatically be published to the SNS topics on workflow start and completion/failure, so you should not include a workflow step to publish the initial or final status of your workflows.

    However, if you want to report your ingest status at any point during a workflow execution, you can add a workflow step using the SfSqsReport Lambda. In the following example from cumulus-tf/parse_pdr_workflow.tf, the ParsePdr workflow is configured to use the SfSqsReport Lambda, primarily to update the PDR ingestion status.

    Note: ${sf_sqs_report_task_arn} is an interpolated value referring to a Terraform resource. See the example deployment code for the ParsePdr workflow.

      "PdrStatusReport": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "cumulus_message": {
    "input": "{$}"
    }
    }
    }
    },
    "ResultPath": null,
    "Type": "Task",
    "Resource": "${sf_sqs_report_task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "Next": "WaitForSomeTime"
    },

    Subscribing additional listeners to SNS topics

    Additional listeners to SNS topics can be configured in a .tf file for your Cumulus deployment. Shown below is configuration that subscribes an additional Lambda function (test_lambda) to receive messages from the report_executions SNS topic. To subscribe to the report_granules or report_pdrs SNS topics instead, simply replace report_executions in the code block below with either of those values.

    resource "aws_lambda_function" "test_lambda" {
    function_name = "${var.prefix}-testLambda"
    filename = "./testLambda.zip"
    source_code_hash = filebase64sha256("./testLambda.zip")
    handler = "index.handler"
    role = module.cumulus.lambda_processing_role_arn
    runtime = "nodejs10.x"
    }

    resource "aws_sns_topic_subscription" "test_lambda" {
    topic_arn = module.cumulus.report_executions_sns_topic_arn
    protocol = "lambda"
    endpoint = aws_lambda_function.test_lambda.arn
    }

    resource "aws_lambda_permission" "test_lambda" {
    action = "lambda:InvokeFunction"
    function_name = aws_lambda_function.test_lambda.arn
    principal = "sns.amazonaws.com"
    source_arn = module.cumulus.report_executions_sns_topic_arn
    }

    SNS message format

    Subscribers to the SNS topics can expect to find the published message in the SNS event at Records[0].Sns.Message. The message will be a JSON stringified version of the ingest notification record for an execution or a PDR. For granules, the message will be a JSON stringified object with ingest notification record in the record property and the event type as the event property.

    The ingest notification record of the execution, granule, or PDR should conform to the data model schema for the given record type.
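If you simply want to inspect the raw messages while developing, one option (rather than deploying a Lambda) is to subscribe an SQS queue you control to the topic; note that the queue's access policy must also allow sns.amazonaws.com to send messages to it. A sketch with placeholder ARNs:

aws sns subscribe \
--topic-arn arn:aws:sns:us-east-1:123456789012:<prefix>-report-granules-topic \
--protocol sqs \
--notification-endpoint arn:aws:sqs:us-east-1:123456789012:my-inspection-queue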

    Summary

    Workflows can be configured to send SQS messages at any point using the sf-sqs-report task.

    Additional listeners can be easily configured to trigger when messages are sent to the SNS topics.

    Version: v9.9.0

    Queue PostToCmr

    In this document, we walk through handling CMR errors in workflows by queueing PostToCmr. We assume that the user already has an ingest workflow set up.

    Overview

    The general concept is that the last task of the ingest workflow will be QueueWorkflow, which queues the publish workflow. The publish workflow contains the PostToCmr task and if a CMR error occurs during PostToCmr, the publish workflow will add itself back onto the queue so that it can be executed when CMR is back online. This is achieved by leveraging the QueueWorkflow task again in the publish workflow. The following diagram demonstrates this queueing process.

    Diagram of workflow queueing

    Ingest Workflow

    The last step should be the QueuePublishWorkflow step. It should be configured with a queueUrl and workflow. In this case, the queueUrl is a throttled queue. Any queueUrl can be specified here, which is useful if you would like to use a lower-priority queue. The workflow is the unprefixed workflow name that you would like to queue (e.g. PublishWorkflow).

      "QueuePublishWorkflowStep": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "workflow": "{$.meta.workflow}",
    "queueUrl": "${start_sf_queue_url}",
    "provider": "{$.meta.provider}",
    "collection": "{$.meta.collection}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${queue_workflow_task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "End": true
    },

    Publish Workflow

    Configure the Catch section of your PostToCmr task to proceed to QueueWorkflow if a CMRInternalError is caught. Any other error will cause the workflow to fail.

      "Catch": [
    {
    "ErrorEquals": [
    "CMRInternalError"
    ],
    "Next": "RequeueWorkflow"
    },
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "Next": "WorkflowFailed",
    "ResultPath": "$.exception"
    }
    ],

    Then, configure the QueueWorkflow task similarly to its configuration in the ingest workflow. This time, pass the current publish workflow to the task config. This allows for the publish workflow to be requeued when there is a CMR error.

    {
    "RequeueWorkflow": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "task_config": {
    "buckets": "{$.meta.buckets}",
    "distribution_endpoint": "{$.meta.distribution_endpoint}",
    "workflow": "PublishGranuleQueue",
    "queueUrl": "${start_sf_queue_url}",
    "provider": "{$.meta.provider}",
    "collection": "{$.meta.collection}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${queue_workflow_task_arn}",
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "Next": "WorkflowFailed",
    "ResultPath": "$.exception"
    }
    ],
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "End": true
    }
    }
    Version: v9.9.0

    Run Step Function Tasks in AWS Lambda or Docker

    Overview

    AWS Step Function Tasks can run tasks on AWS Lambda or on AWS Elastic Container Service (ECS) as a Docker container.

    Lambda provides a serverless architecture and is the best option for minimizing cost and server management. ECS provides the fullest extent of AWS EC2 resources, offering the flexibility to execute arbitrary code on any AWS EC2 instance type.

    When to use Lambda

    You should use AWS Lambda whenever all of the following are true:

    • The task runs on one of the supported Lambda Runtimes. At the time of this writing, supported runtimes include versions of Python, Java, Ruby, Node.js, Go, and .NET.
    • The lambda package is less than 50 MB in size, zipped.
    • The task consumes less than each of the following resources:
      • 3008 MB memory allocation
      • 512 MB disk storage (must be written to /tmp)
      • 15 minutes of execution time

    See this page for a complete and up-to-date list of AWS Lambda limits.

    If your task requires more than any of these resources, or requires an unsupported runtime, creating a Docker image that can be run on ECS is the way to go. Cumulus supports running any lambda package (and its configured layers) as a Docker container with cumulus-ecs-task.

    Step Function Activities and cumulus-ecs-task

    Step Function Activities enable a state machine task to "publish" an activity task which can be picked up by any activity worker. Activity workers can run pretty much anywhere, but Cumulus workflows support the cumulus-ecs-task activity worker. The cumulus-ecs-task worker runs as a Docker container on the Cumulus ECS cluster.

    The cumulus-ecs-task container takes an AWS Lambda Amazon Resource Name (ARN) as an argument (see --lambdaArn in the example below). This ARN argument is defined at deployment time. The cumulus-ecs-task worker polls for new Step Function Activity Tasks. When a Step Function executes, the worker (container) picks up the activity task and runs the code contained in the lambda package defined on deployment.

    Example: Replacing AWS Lambda with a Docker container run on ECS

    This example will use an already-defined workflow from the cumulus module that includes the QueueGranules task in its configuration.

    The following example is an excerpt from the Discover Granules workflow containing the step definition for the QueueGranules step:

    Note: ${ingest_granule_workflow_name} and ${queue_granules_task_arn} are interpolated values that refer to Terraform resources. See the example deployment code for the Discover Granules workflow.

      "QueueGranules": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "provider": "{$.meta.provider}",
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "granuleIngestWorkflow": "${ingest_granule_workflow_name}",
    "queueUrl": "{$.meta.queues.startSF}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${queue_granules_task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "End": true
    },

    If it has been discovered that this task can no longer run in AWS Lambda, you can instead run it on the Cumulus ECS cluster by adding the following resources to your Terraform deployment (by either adding a new .tf file or updating an existing one):

    • An aws_sfn_activity resource:
    resource "aws_sfn_activity" "queue_granules" {
    name = "${var.prefix}-QueueGranules"
    }
    • An instance of the cumulus_ecs_service module (found on the Cumulus releases page), configured to provide the QueueGranules task:

    module "queue_granules_service" {
    source = "https://github.com/nasa/cumulus/releases/download/{version}/terraform-aws-cumulus-ecs-service.zip"

    prefix = var.prefix
    name = "QueueGranules"

    cluster_arn = module.cumulus.ecs_cluster_arn
    desired_count = 1
    image = "cumuluss/cumulus-ecs-task:1.7.0"

    cpu = 400
    memory_reservation = 700

    environment = {
    AWS_DEFAULT_REGION = data.aws_region.current.name
    }
    command = [
    "cumulus-ecs-task",
    "--activityArn",
    aws_sfn_activity.queue_granules.id,
    "--lambdaArn",
    module.cumulus.queue_granules_task.task_arn
    ]
    alarms = {
    TaskCountHigh = {
    comparison_operator = "GreaterThanThreshold"
    evaluation_periods = 1
    metric_name = "MemoryUtilization"
    statistic = "SampleCount"
    threshold = 1
    }
    }
    }

    Please note: If you have updated the code for the Lambda specified by --lambdaArn, you will have to manually restart the tasks in your ECS service before invocation of the Step Function activity will use the updated Lambda code (see the CLI sketch after this list).

    • An updated Discover Granules workflow to utilize the new resource (the Resource key in the QueueGranules step has been updated to:

    "Resource": "${aws_sfn_activity.queue_granules.id}")

    If you then run this workflow in place of the DiscoverGranules workflow, the QueueGranules step would run as an ECS task instead of a lambda.
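As noted above, after updating the Lambda referenced by --lambdaArn you must restart the running ECS tasks before the activity picks up the new code. A sketch of doing so from the CLI (the cluster and service names are deployment-specific placeholders):

aws ecs update-service \
--cluster <prefix>-CumulusECSCluster \
--service <queue-granules-ecs-service-name> \
--force-new-deployment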

    Final note

    Step Function Activities and AWS Lambda are not the only ways to run tasks in an AWS Step Function. Learn more about other service integrations, including direct ECS integration via the AWS Service Integrations page.

    Science Investigator-led Processing Systems (SIPS)

    We're just going to create a onetime throw-away rule that will be easy to test with. This rule will kick off the DiscoverAndQueuePdrs workflow, which is the beginning of a Cumulus SIPS workflow:

    Screenshot of a Cumulus rule configuration

    Note: A list of configured workflows exists under "Workflows" in the navigation bar on the Cumulus dashboard. Additionally, one can find a list of executions and their respective status in the "Executions" tab in the navigation bar.

    DiscoverAndQueuePdrs Workflow

    This workflow will discover PDRs and queue them to be processed. Duplicate PDRs will be dealt with according to the configured duplicate handling setting in the collection. The lambdas below are included in the cumulus terraform module for use in your workflows:

    1. DiscoverPdrs - source
    2. QueuePdrs - source

    Screenshot of execution graph for discover and queue PDRs workflow in the AWS Step Functions console

    An example workflow module configuration can be viewed in the Cumulus source for the discover_and_queue_pdrs_workflow.

    Please note: To use this example workflow module as a template for a new workflow in your deployment, the source key for the workflow module would need to point to a release of the cumulus-workflow (terraform-aws-cumulus-workflow.zip) module on our release page, as all of the provided Cumulus workflows are internally self-referential.

    ParsePdr Workflow

    The ParsePdr workflow will parse a PDR, queue the specified granules (duplicates are handled according to the duplicate handling setting) and periodically check the status of those queued granules. This workflow will not succeed until all the granules included in the PDR are successfully ingested. If one of those fails, the ParsePdr workflow will fail. NOTE that ParsePdr may spin up multiple IngestGranule workflows in parallel, depending on the granules included in the PDR.

    The lambdas below are included in the cumulus terraform module for use in your workflows:

    1. ParsePdr - source
    2. QueueGranules - source
    3. CheckStatus - source

    Screenshot of execution graph for SIPS Parse PDR workflow in AWS Step Functions console

    An example workflow module configuration can be viewed in the Cumulus source for the parse_pdr_workflow.

    Please note: To use this example workflow module as a template for a new workflow in your deployment, the source key for the workflow module would need to point to a release of the cumulus-workflow (terraform-aws-cumulus-workflow.zip) module on our release page, as all of the provided Cumulus workflows are internally self-referential.

    IngestGranule Workflow

    The IngestGranule workflow processes and ingests a granule and posts the granule metadata to CMR.

    The lambdas below are included in the cumulus terraform module for use in your workflows:

    1. SyncGranule - source.
    2. CmrStep - source

    Additionally, this workflow requires a processing step that you must provide. The ProcessingStep step in the workflow diagram below is an example of a custom processing step.

    Note: Using the CmrStep is not required and can be left out of the processing trajectory if desired (for example, in testing situations).

    Screenshot of execution graph for SIPS IngestGranule workflow in AWS Step Functions console

    An example workflow module configuration can be viewed in the Cumulus source for the ingest_and_publish_granule_workflow.

    Please note: To use this example workflow module as a template for a new workflow in your deployment, the source key for the workflow module would need to point to a release of the cumulus-workflow (terraform-aws-cumulus-workflow.zip) module on our release page, as all of the provided Cumulus workflows are internally self-referential.

    Summary

    In this cookbook we went over setting up a collection, rule, and provider for a SIPS workflow. Once we had the setup completed, we looked over the Cumulus workflows that participate in parsing PDRs, ingesting and processing granules, and updating CMR.

    Version: v9.9.0

    Throttling queued executions

    In this entry, we will walk through how to create an SQS queue for scheduling executions, which will be used to limit those executions to a maximum concurrency, and see how to configure our Cumulus workflows/rules to use this queue.

    We will also review the architecture of this feature and highlight some implementation notes.

    Limiting the number of executions that can be running from a given queue is useful for controlling the cloud resource usage of workflows that may be lower priority, such as granule reingestion or reprocessing campaigns. It could also be useful for preventing workflows from exceeding known resource limits, such as a maximum number of open connections to a data provider.

    Implementing the queue

    Create and deploy the queue

    Add a new queue

    In a .tf file for your Cumulus deployment, add a new SQS queue:

    resource "aws_sqs_queue" "background_job_queue" {
    name = "${var.prefix}-backgroundJobQueue"
    receive_wait_time_seconds = 20
    visibility_timeout_seconds = 60
    }

    Set maximum executions for the queue

    Define the throttled_queues variable for the cumulus module in your Cumulus deployment to specify the maximum concurrent executions for the queue.

    module "cumulus" {
    # ... other variables

    throttled_queues = [{
    url = aws_sqs_queue.background_job_queue.id,
    execution_limit = 5
    }]
    }

    Setup consumer for the queue

    Add the sqs2sfThrottle Lambda as the consumer for the queue and add a Cloudwatch event rule/target to read from the queue on a scheduled basis.

    Please note: You must use the sqs2sfThrottle Lambda as the consumer for any queue with a queue execution limit or else the execution throttling will not work correctly. Additionally, please allow at least 60 seconds after creation before using the queue while associated infrastructure and triggers are set up and made ready.

    aws_sqs_queue.background_job_queue.id refers to the queue resource defined above.

    resource "aws_cloudwatch_event_rule" "background_job_queue_watcher" {
    schedule_expression = "rate(1 minute)"
    }

    resource "aws_cloudwatch_event_target" "background_job_queue_watcher" {
    rule = aws_cloudwatch_event_rule.background_job_queue_watcher.name
    arn = module.cumulus.sqs2sfThrottle_lambda_function_arn
    input = jsonencode({
    messageLimit = 500
    queueUrl = aws_sqs_queue.background_job_queue.id
    timeLimit = 60
    })
    }

    resource "aws_lambda_permission" "background_job_queue_watcher" {
    action = "lambda:InvokeFunction"
    function_name = module.cumulus.sqs2sfThrottle_lambda_function_arn
    principal = "events.amazonaws.com"
    source_arn = aws_cloudwatch_event_rule.background_job_queue_watcher.arn
    }

    Re-deploy your Cumulus application

    Follow the instructions to re-deploy your Cumulus application. After you have re-deployed, your workflow template will be updated to include information about the queue (the output below is partial output from an expected workflow template):

    {
    "cumulus_meta": {
    "queueExecutionLimits": {
    "<backgroundJobQueue_SQS_URL>": 5
    }
    }
    }

    Integrate your queue with workflows and/or rules

    Integrate queue with queuing steps in workflows

    For any workflows using QueueGranules or QueuePdrs that you want to use your new queue, update the Cumulus configuration of those steps in your workflows.

    As seen in this partial configuration for a QueueGranules step, update the queueUrl to reference the new throttled queue:

    Note: ${ingest_granule_workflow_name} is an interpolated value referring to a Terraform resource. See the example deployment code for the DiscoverGranules workflow.

    {
    "QueueGranules": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "queueUrl": "${aws_sqs_queue.background_job_queue.id}",
    "provider": "{$.meta.provider}",
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "granuleIngestWorkflow": "${ingest_granule_workflow_name}"
    }
    }
    }
    }
    }

    Similarly, for a QueuePdrs step:

    Note: ${parse_pdr_workflow_name} is an interpolated value referring to a Terraform resource. See the example deployment code for the DiscoverPdrs workflow.

    {
    "QueuePdrs": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "queueUrl": "${aws_sqs_queue.background_job_queue.id}",
    "provider": "{$.meta.provider}",
    "collection": "{$.meta.collection}",
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "parsePdrWorkflow": "${parse_pdr_workflow_name}"
    }
    }
    }
    }
    }

    After making these changes, re-deploy your Cumulus application for the execution throttling to take effect on workflow executions queued by these workflows.

    Create/update a rule to use your new queue

    Create or update a rule definition to include a queueUrl property that refers to your new queue:

    {
    "name": "s3_provider_rule",
    "workflow": "DiscoverAndQueuePdrs",
    "provider": "s3_provider",
    "collection": {
    "name": "MOD09GQ",
    "version": "006"
    },
    "rule": {
    "type": "onetime"
    },
    "state": "ENABLED",
    "queueUrl": "<backgroundJobQueue_SQS_URL>" // configure rule to use your queue URL
    }

    After creating/updating the rule, any subsequent invocations of the rule should respect the maximum number of executions when starting workflows from the queue.
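One way to spot-check that the limit is being respected is to count RUNNING executions of the workflow being started from the queue (the state machine ARN below is a placeholder):

aws stepfunctions list-executions \
--state-machine-arn arn:aws:states:us-east-1:123456789012:stateMachine:<prefix>-DiscoverAndQueuePdrs \
--status-filter RUNNING \
--query 'length(executions)'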

    Architecture

    Architecture diagram showing how executions started from a queue are throttled to a maximum concurrent limit

    Execution throttling based on the queue works by manually keeping a count (semaphore) of how many executions are running for the queue at a time. The key operation that prevents the number of executions from exceeding the maximum for the queue is that before starting new executions, the sqs2sfThrottle Lambda attempts to increment the semaphore and responds as follows:

    • If the increment operation is successful, then the count was not at the maximum and an execution is started
    • If the increment operation fails, then the count was already at the maximum so no execution is started

    Final notes

    Limiting the number of concurrent executions for work scheduled via a queue has several consequences worth noting:

    • The number of executions that are running for a given queue will be limited to the maximum for that queue regardless of which workflow(s) are started.
    • If you use the same queue to schedule executions across multiple workflows/rules, then the limit on the total number of executions running concurrently will be applied to all of the executions scheduled across all of those workflows/rules.
    • If you are scheduling the same workflow both via a queue with a maxExecutions value and a queue without a maxExecutions value, only the executions scheduled via the queue with the maxExecutions value will be limited to the maximum.
    Tracking Ancillary Files

    The UMM-G column reflects the RelatedURL's Type derived from the CNM type, whereas the ECHO10 column shows how the CNM type affects the destination element.

    CNM Type   | UMM-G RelatedUrl.Type                                           | ECHO10 Location
    ancillary  | 'VIEW RELATED INFORMATION'                                      | OnlineResource
    data       | 'GET DATA' (HTTPS URL) or 'GET DATA VIA DIRECT ACCESS' (S3 URI) | OnlineAccessURL
    browse     | 'GET RELATED VISUALIZATION'                                     | AssociatedBrowseImage
    linkage    | 'EXTENDED METADATA'                                             | OnlineResource
    metadata   | 'EXTENDED METADATA'                                             | OnlineResource
    qa         | 'EXTENDED METADATA'                                             | OnlineResource

    Common Use Cases

    This section briefly documents some common use cases and the recommended configuration for the file. The examples shown here are for the DiscoverGranules use case, which allows configuration at the Cumulus dashboard level. The other two cases covered in the ancillary metadata documentation require configuration at the provider notification level (either CNM message or PDR) and are not covered here.

    Configuring browse imagery:

    {
    "bucket": "public",
    "regex": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\_[\\d]{1}.jpg$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_1.jpg",
    "type": "browse"
    }

    Configuring a documentation entry:

    {
    "bucket": "protected",
    "regex": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\_README.pdf$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_README.pdf",
    "type": "metadata"
    }

    Configuring other associated files (use types metadata or qa as appropriate):

    {
    "bucket": "protected",
    "regex": "^MOD09GQ\\.A[\\d]{7}\\.[\\S]{6}\\.006\\.[\\d]{13}\\_QA.txt$",
    "sampleFileName": "MOD09GQ.A2017025.h21v00.006.2017034065104_QA.txt",
    "type": "qa"
    }
    Version: v9.9.0

    API Gateway Logging

    Enabling API Gateway logging

    In order to enable distribution API Access and execution logging, configure the TEA deployment by setting log_api_gateway_to_cloudwatch on the thin_egress_app module:

    log_api_gateway_to_cloudwatch = true

    This enables the distribution API to send its logs to the default CloudWatch location: API-Gateway-Execution-Logs_<RESTAPI_ID>/<STAGE>
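To confirm logs are flowing, you can tail that log group with the AWS CLI (v2); the REST API ID and stage below are placeholders:

aws logs tail "API-Gateway-Execution-Logs_<RESTAPI_ID>/<STAGE>" --since 1h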

    Configure Permissions for API Gateway Logging to CloudWatch

    Instructions for enabling account level logging from API Gateway to CloudWatch

    This is a one time operation that must be performed on each AWS account to allow API Gateway to push logs to CloudWatch.

    Create a policy document

    The AmazonAPIGatewayPushToCloudWatchLogs managed policy, with an ARN of arn:aws:iam::aws:policy/service-role/AmazonAPIGatewayPushToCloudWatchLogs, has all the required permissions to enable API Gateway logging to CloudWatch. To grant these permissions to your account, first create an IAM role with apigateway.amazonaws.com as its trusted entity.

    Save this snippet as apigateway-policy.json.

    {
    "Version": "2012-10-17",
    "Statement": [
    {
    "Sid": "",
    "Effect": "Allow",
    "Principal": {
    "Service": "apigateway.amazonaws.com"
    },
    "Action": "sts:AssumeRole"
    }
    ]
    }

    Create an account role to act as ApiGateway and write to CloudWatchLogs

    NASA users in NGAP: be sure to use your account's permission boundary.

    aws iam create-role \
    --role-name ApiGatewayToCloudWatchLogs \
    [--permissions-boundary <permissionBoundaryArn>] \
    --assume-role-policy-document file://apigateway-policy.json

    Note the ARN of the returned role for the last step.
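If you need to look the ARN up again later, it can be retrieved with:

aws iam get-role \
--role-name ApiGatewayToCloudWatchLogs \
--query 'Role.Arn' \
--output text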

    Attach correct permissions to role

    Next attach the AmazonAPIGatewayPushToCloudWatchLogs policy to the IAM role.

    aws iam attach-role-policy \
    --role-name ApiGatewayToCloudWatchLogs \
    --policy-arn "arn:aws:iam::aws:policy/service-role/AmazonAPIGatewayPushToCloudWatchLogs"

    Update Account API Gateway settings with correct permissions

    Finally, set the IAM role ARN on the cloudWatchRoleArn property on your API Gateway Account settings.

    aws apigateway update-account \
    --patch-operations op='replace',path='/cloudwatchRoleArn',value='<ApiGatewayToCloudWatchLogs ARN>'

    Configure API Gateway CloudWatch Logs Delivery

    See Configure Cloudwatch Logs Delivery

    Version: v9.9.0

    Configure Cloudwatch Logs Delivery

    As an optional configuration step, it is possible to deliver CloudWatch logs to a cross-account shared AWS::Logs::Destination. An operator does this by configuring the cumulus module for your deployment as shown below. The value of the log_destination_arn variable is the ARN of a writeable log destination.

    The value can be either an AWS::Logs::Destination or a Kinesis Stream ARN to which your account can write.

    log_destination_arn           = arn:aws:[kinesis|logs]:us-east-1:123456789012:[streamName|destination:logDestinationName]

    Logs Sent

    By default, the following logs will be sent to the destination when one is given.

    • Ingest logs
    • Async Operation logs
    • Thin Egress App API Gateway logs (if configured)

    Additional Logs

    If additional logs are needed, you can configure additional_log_groups_to_elk with the Cloudwatch log groups you want to send to the destination. additional_log_groups_to_elk is a map with the key as a descriptor and the value with the Cloudwatch log group name.

    additional_log_groups_to_elk = {
    "HelloWorldTask" = "/aws/lambda/cumulus-example-HelloWorld"
    "MyCustomTask" = "my-custom-task-log-group"
    }
    Component-based Cumulus Deployment

    Terraform at the same time.

    With remote state, Terraform writes the state data to a remote data store, which can then be shared between all members of a team.

    The recommended approach for handling remote state with Cumulus is to use the S3 backend. This backend stores state in S3 and uses a DynamoDB table for locking.

    See the deployment documentation for a walkthrough of creating resources for your remote state using an S3 backend.

    Version: v9.9.0

    Creating an S3 Bucket

    Buckets can be created on the command line with AWS CLI or via the web interface on the AWS console.

    When creating a protected bucket (a bucket containing data which will be served through the distribution API), make sure to enable S3 server access logging. See S3 Server Access Logging for more details.

    Command line

    Using the AWS CLI s3api create-bucket subcommand:

    $ aws s3api create-bucket \
    --bucket foobar-internal \
    --region us-west-2 \
    --create-bucket-configuration LocationConstraint=us-west-2
    {
    "Location": "/foobar-internal"
    }

    Note: The region and create-bucket-configuration arguments are only necessary if you are creating a bucket outside of the us-east-1 region.

    Please note security settings and other bucket options can be set via the options listed in the s3api documentation.

    Repeat the above step for each bucket to be created.
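For protected buckets, the server access logging mentioned above can also be enabled from the command line; a sketch, assuming a hypothetical foobar-logs target bucket that already grants S3 log delivery write access:

aws s3api put-bucket-logging \
--bucket foobar-protected \
--bucket-logging-status '{"LoggingEnabled": {"TargetBucket": "foobar-logs", "TargetPrefix": "foobar-protected/"}}'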

    Web interface

    See: AWS "Creating a Bucket" documentation

    Version: v9.9.0

    Using the Cumulus Distribution API

    The Cumulus Distribution API is a set of endpoints that can be used to enable AWS Cognito authentication when downloading data from S3.

    Configuring a Cumulus Distribution deployment

    The Cumulus Distribution API is included in the main Cumulus repo. It is available as part of the terraform-aws-cumulus.zip archive in the latest release.

    These steps assume you're using the Cumulus Deployment Template but can also be used for custom deployments.

    To configure a deployment to use Cumulus Distribution:

    1. Remove or comment the "Thin Egress App Settings" in the Cumulus Template Deploy and enable the Cumulus Distribution settings.
    2. Delete or comment the contents of thin_egress_app.tf and the corresponding Thin Egress App outputs in outputs.tf. These are not necessary for a Cumulus Distribution deployment.
    3. Uncomment the Cumulus Distribution outputs in outputs.tf.
    4. Rename cumulus-template-deploy/cumulus-tf/cumulus_distribution.tf.example to cumulus-template-deploy/cumulus-tf/cumulus_distribution.tf.

    Cognito Application and User Credentials

    The major prerequisite for using the Cumulus Distribution API is to set up Cognito. If operating within NGAP, this should already be done for you. If operating outside of NGAP, you must set up Cognito yourself, which is beyond the scope of this documentation.

    Given that Cognito is set up, in order to be able to download granule files via the Cumulus Distribution API, you must obtain Cognito user credentials, because any attempt to download such files (that will be, or have been, published to the CMR via your Cumulus deployment) will result in a prompt for you to supply Cognito user credentials. To obtain your own user credentials, talk to your product owner or scrum master for additional information. They should either know how to create the credentials, know who can create them for the team, or be the liaison to the Cognito team.

    Further, whoever helps to obtain your Cognito user credentials should also be able to supply you with the values for the following new variables that you must add to your cumulus-tf/terraform.tfvars file:

    • csdap_host_url: The URL of the Cognito service to which your Cumulus deployment will make Cognito API calls during a distribution (download) event
    • csdap_client_id: The client ID for the Cumulus application registered within the Cognito service
    • csdap_client_password: The client password for the Cumulus application registered within the Cognito service

    Although you might have to wait a bit for your Cognito user credentials, the remaining instructions do not depend upon having them, so you may continue with these instructions while waiting for your credentials.

    Cumulus Distribution URL

    Your Cumulus Distribution URL is used by Cumulus to generate download URLs as part of the granule metadata generated and published to the CMR. For example, a granule download URL will be of the form <distribution url>/<protected bucket>/<key> (or <distribution url>/path/to/file, if using a custom bucket map, as explained further below).

    By default, the value of your distribution URL is the URL of your private Cumulus Distribution API Gateway (the API Gateway named <prefix>-distribution, once you deploy the Cumulus Distribution module). Therefore, by default, the generated download URLs are private, and thus inaccessible directly, but there are 2 ways to address this issue (both of which are detailed below): (a) use tunneling (typically in development) or (b) put a CloudFront URL in front of your API Gateway (typically in production, and perhaps UAT and/or SIT).

    In either case, you must first know the default URL (i.e., the URL for the private Cumulus Distribution API Gateway). In order to obtain this default URL, you must first deploy your cumulus-tf module with the new Cumulus Distribution module, and once your initial deployment is complete, one of the Terraform outputs will be cumulus_distribution_api_uri, which is the URL for the private API Gateway.

    You may override this default URL by adding a cumulus_distribution_url variable to your cumulus-tf/terraform.tfvars file, and setting it to one of the following values (both of which are explained below):

    1. The default URL, but with a port added to it, in order to allow you to configure tunneling (typically only in development)
    2. A CloudFront URL placed in front of your Cumulus Distribution API Gateway (typically only for Production, but perhaps also for a UAT or SIT environment)

    The following subsections explain these approaches, in turn.

    Using your Cumulus Distribution API Gateway URL as your distribution URL

    Since your Cumulus Distribution API Gateway URL is private, the only way you can use it to confirm that your integration with Cognito is working is by using tunneling (again, generally for development), as described here. Here is an outline of the required steps, with details provided further below:

    1. Create/import a key pair into your AWS EC2 service (if you haven't already done so)
    2. Add a reference to the name of the key pair to your Terraform variables (we'll set the key_name Terraform variable)
    3. Choose an open local port on your machine (we'll use 9000 in the following details)
    4. Add a reference to the value of your cumulus_distribution_api_uri (mentioned earlier), including your chosen port (we'll set the cumulus_distribution_url Terraform variable)
    5. Redeploy Cumulus
    6. Add an entry to your /etc/hosts file
    7. Add a redirect URI to Cognito, via the Cognito API
    8. Install the Session Manager Plugin for the AWS CLI (if you haven't already done so; assuming you have already installed the AWS CLI)
    9. Add a sample file to S3 to test downloading via Cognito

To create a new key pair or import an existing one, you can use the AWS CLI (see aws ec2 import-key-pair) or the AWS Console (see Amazon EC2 key pairs and Linux instances).

    Once your key pair is added to AWS, add the following to your cumulus-tf/terraform.tfvars file:

    key_name = "<name>"
    cumulus_distribution_url = "https://<id>.execute-api.<region>.amazonaws.com:<port>/dev/"

    where:

    • <name> is the name of the key pair you just added to AWS
    • <id> and <region> are the corresponding parts from your cumulus_distribution_api_uri output variable
    • <port> is your open local port of choice (9000 is typically a good choice)

    Once you save your variable changes, redeploy your cumulus-tf module.

    While your deployment runs, add the following entry to your /etc/hosts file, replacing <hostname> with the host name of the cumulus_distribution_url Terraform variable you just added above:

    localhost <hostname>

    Next, you'll need to use the Cognito API to add the value of your cumulus_distribution_url Terraform variable as a Cognito redirect URI. To do so, use your favorite tool (e.g., curl, wget, Postman, etc.) to make a BasicAuth request to the Cognito API, using the following details:

    • method: POST
    • base URL: the value of your csdap_host_url Terraform variable
    • path: /authclient/updateRedirectUri
    • username: the value of your csdap_client_id Terraform variable
    • password: the value of your csdap_client_password Terraform variable
    • headers: Content-Type='application/x-www-form-urlencoded'
    • body: redirect_uri=<cumulus_distribution_url>/login

    where <cumulus_distribution_url> is the value of your cumulus_distribution_url Terraform variable. Note the /login path at the end of the redirect_uri value.
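For illustration, one way to make this request is with curl; the angle-bracketed values are placeholders for your own Terraform variable values:

curl -X POST "<csdap_host_url>/authclient/updateRedirectUri" \
  --user "<csdap_client_id>:<csdap_client_password>" \
  -H "Content-Type: application/x-www-form-urlencoded" \
  --data-urlencode "redirect_uri=<cumulus_distribution_url>/login"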

    For reference, see the Cognito Authentication Service API.

    Next, install the Session Manager Plugin for the AWS CLI. If running on macOS, and you use Homebrew, you can install it simply as follows:

    brew install --cask session-manager-plugin --no-quarantine

    As your final setup step, add a sample file to one of the protected buckets listed in your buckets Terraform variable in your cumulus-tf/terraform.tfvars file. The key for the S3 object doesn't matter, nor does it matter what file you use. All that matters is that the file is an S3 object in one of your protected buckets, because Cognito is triggered when attempting to download from one of those buckets.

    At this point, you should be ready to open a tunnel and attempt to download your sample file via your browser, summarized as follows:

    1. Determine your ec2 instance ID
    2. Connect to the NASA VPN
    3. Start an AWS SSM session
    4. Open an ssh tunnel
    5. Use a browser to navigate to your file

To determine the EC2 instance ID for your Cumulus deployment, run the following command, where <profile> is the name of the appropriate AWS profile to use, and <prefix> is the value of your prefix Terraform variable:

    aws --profile <profile> ec2 describe-instances --filters Name=tag:Deployment,Values=<prefix> Name=instance-state-name,Values=running --query "Reservations[0].Instances[].InstanceId" --output text

    IMPORTANT: Before proceeding with the remaining steps, make sure you're connected to the NASA VPN.

    Use the value output from the command above in place of <id> in the following command, which will start an SSM session:

    aws ssm start-session --target <id> --document-name AWS-StartPortForwardingSession --parameters portNumber=22,localPortNumber=6000

    If successful, you should see output similar to the following:

    Starting session with SessionId: NGAPShApplicationDeveloper-***
    Port 6000 opened for sessionId NGAPShApplicationDeveloper-***.
    Waiting for connections...

    Open another terminal window, and open a tunnel with port forwarding, using your chosen port from above (e.g., 9000):

    ssh -4 -p 6000 -N -L <port>:<api-gateway-host>:443 ec2-user@127.0.0.1

    where:

    • <port> is the open local port you chose earlier (e.g., 9000)
    • <api-gateway-host> is the hostname of your private API Gateway (i.e., the host portion of the URL you used as the value of your cumulus_distribution_url Terraform variable above)

    Finally, use your chosen browser to navigate to <cumulus_distribution_url>/<bucket>/<key>, where <bucket> and <key> reference the sample file you added to S3 above.

    If all goes well, you should be prompted for your Cognito username and password. If you have obtained your Cognito user credentials, enter them, followed by entering a code generated by the authenticator application you registered at the time you completed your Cognito registration process. Once your credentials and auth code are correctly supplied, after a few moments, the download process will begin.

    Once you're finished testing, clean up as follows:

    1. Kill your ssh tunnel (Ctrl-C)
    2. Kill your AWS SSM session (Ctrl-C)
3. If you like, disconnect from the NASA VPN

    While this is a relatively lengthy process, things are much easier when using CloudFront, such as in Production (OPS), SIT, or UAT, as explained next.

    Using a CloudFront URL as your distribution URL

    In Production (OPS), and perhaps in other environments, such as UAT and SIT, you'll need to provide a publicly accessible URL for users to use for downloading (distributing) granule files.

    This is generally done by placing a CloudFront URL in front of your private Cumulus Distribution API Gateway. In order to create such a CloudFront URL, contact the person who helped you obtain your Cognito credentials, and request a CloudFront URL with the following details:

    • The private, backing URL, which is the value of your cumulus_distribution_api_uri Terraform output value
    • A request to add the AWS account's VPC to the whitelist

    Once this request is completed, and you obtain the new CloudFront URL, override your default distribution URL with the CloudFront URL by adding the following to your cumulus-tf/terraform.tfvars file:

cumulus_distribution_url = "<cloudfront_url>"

    In addition, add a Cognito redirect URI, as detailed in the previous section. Note that in this case, the value you'll use for redirect_uri is <cloudfront_url>/login since the value of your cumulus_distribution_url is now your CloudFront URL.

    At this point, it is assumed that you have added the appropriate values for this environment for the variables described at the top (csdap_host_url, csdap_client_id, and csdap_client_password).

    Redeploy Cumulus with your new/updated Terraform variables.

    As your final setup step, add a sample file to one of the protected buckets listed in your buckets Terraform variable in your cumulus-tf/terraform.tfvars file. The key for the S3 object doesn't matter, nor does it matter what file you use. All that matters is that the file is an S3 object in one of your protected buckets, because Cognito is triggered when attempting to download from one of those buckets.

    Finally, use your chosen browser to navigate to <cumulus_distribution_url>/<bucket>/<key>, where <bucket> and <key> reference the sample file you added to S3.

    If all goes well, you should be prompted for your Cognito username and password. If you have obtained your Cognito user credentials, enter them, followed by entering a code generated by the authenticator application you registered at the time you completed your Cognito registration process. Once your credentials and auth code are correctly supplied, after a few moments, the download process will begin.

    S3 Bucket Mapping

    An S3 Bucket map allows users to abstract bucket names. If the bucket names change at any point, only the bucket map would need to be updated instead of every S3 link.

    The Cumulus Distribution API uses a bucket_map.yaml or bucket_map.yaml.tmpl file to determine which buckets to serve. See the examples.

    The default Cumulus module generates a file at s3://${system_bucket}/distribution_bucket_map.json.

The configuration file is a simple JSON mapping of the form:

{
  "daac-public-data-bucket": "/path/to/this/kind/of/data"
}

    Note: Cumulus only supports a one-to-one mapping of bucket -> Cumulus Distribution path for 'distribution' buckets. Also, the bucket map must include mappings for all of the protected and public buckets specified in the buckets variable in cumulus-tf/terraform.tfvars, otherwise Cumulus may not be able to determine the correct distribution URL for ingested files and you may encounter errors.
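As a rough illustration only (the bucket names are placeholders, and the TEA-style layout shown later in this documentation is assumed; consult the linked examples for the authoritative format), a custom bucket map covering one protected and one public bucket might look like:

MAP:
  my-protected: my-protected
  my-public: my-public

PUBLIC_BUCKETS:
  - my-public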

    Switching from the Thin Egress App to Cumulus Distribution

    If you have previously deployed the Thin Egress App (TEA) as your distribution app, you can switch to Cumulus Distribution by following the steps above.

    Note, however, that the cumulus_distribution module will generate a bucket map cache and overwrite any existing bucket map caches created by TEA.

    There will also be downtime while your API gateway is updated.

    - + \ No newline at end of file diff --git a/docs/v9.9.0/deployment/index.html b/docs/v9.9.0/deployment/index.html index dba30472d41..41e7a673911 100644 --- a/docs/v9.9.0/deployment/index.html +++ b/docs/v9.9.0/deployment/index.html @@ -5,7 +5,7 @@ How to Deploy Cumulus | Cumulus Documentation - + @@ -21,7 +21,7 @@ for deployment's EC2 instances and allows you to connect to them via SSH/SSM.

    Consider the sizing of your Cumulus instance when configuring your variables.

    Choose a distribution API

    Cumulus can be configured to use either the Thin Egress App (TEA) or the Cumulus Distribution API. The default selection is the Thin Egress App if you're using the Deployment Template.

    IMPORTANT! If you already have a deployment using the TEA distribution and want to switch to Cumulus Distribution, there will be an API Gateway change. This means that there will be downtime while you update your CloudFront endpoint to use the new API gateway.

    Configure the Thin Egress App

    The Thin Egress App can be used for Cumulus distribution and is the default selection. It allows authentication using Earthdata Login. Follow the steps in the documentation to configure distribution in your cumulus-tf deployment.

    Configure the Cumulus Distribution API (optional)

    If you would prefer to use the Cumulus Distribution API, which supports AWS Cognito authentication, follow these steps to configure distribution in your cumulus-tf deployment.

    Initialize Terraform

Follow the above instructions to initialize Terraform using terraform init [1].

    Deploy

    Run terraform apply to deploy the resources. Type yes when prompted to confirm that you want to create the resources. Assuming the operation is successful, you should see output like this:

    Apply complete! Resources: 292 added, 0 changed, 0 destroyed.

    Outputs:

    archive_api_redirect_uri = https://abc123.execute-api.us-east-1.amazonaws.com/dev/token
    archive_api_uri = https://abc123.execute-api.us-east-1.amazonaws.com/dev/
    distribution_redirect_uri = https://abc123.execute-api.us-east-1.amazonaws.com/DEV/login
    distribution_url = https://abc123.execute-api.us-east-1.amazonaws.com/DEV/

    Note: Be sure to copy the redirect URLs, as you will use them to update your Earthdata application.

    Update Earthdata Application

    You will need to add two redirect URLs to your EarthData login application.

1. Log in to URS.
2. Under My Applications -> Application Administration, use the edit icon for your application.
3. Under Manage -> Redirect URIs, add the Archive API URL returned from the stack deployment
  • e.g. archive_api_redirect_uri = https://<czbbkscuy6>.execute-api.us-east-1.amazonaws.com/dev/token.
4. Also add the Distribution URL
  • e.g. distribution_redirect_uri = https://<kido2r7kji>.execute-api.us-east-1.amazonaws.com/dev/login [2].
5. You may delete the placeholder URL you used to create the application.

If you've lost track of the needed redirect URIs, they can be located on the API Gateway. Once there, select <prefix>-archive and/or <prefix>-thin-egress-app-EgressGateway, then Dashboard, and use the base URL at the top of the page that is accompanied by the text Invoke this API at:. Make sure to append /token for the archive URL and /login for the Thin Egress App URL.


    Deploy Cumulus dashboard

    Dashboard Requirements

    Please note that the requirements are similar to the Cumulus stack deployment requirements. The installation instructions below include a step that will install/use the required node version referenced in the .nvmrc file in the dashboard repository.

    Prepare AWS

    Create S3 bucket for dashboard:

    • Create it, e.g. <prefix>-dashboard. Use the command line or console as you did when preparing AWS configuration.
    • Configure the bucket to host a website:
      • AWS S3 console: Select <prefix>-dashboard bucket then, "Properties" -> "Static Website Hosting", point to index.html
      • CLI: aws s3 website s3://<prefix>-dashboard --index-document index.html
    • The bucket's url will be http://<prefix>-dashboard.s3-website-<region>.amazonaws.com or you can find it on the AWS console via "Properties" -> "Static website hosting" -> "Endpoint"
    • Ensure the bucket's access permissions allow your deployment user access to write to the bucket

    Install dashboard

    To install the dashboard, clone the Cumulus dashboard repository into the root deploy directory and install dependencies with npm install:

  git clone https://github.com/nasa/cumulus-dashboard
  cd cumulus-dashboard
  nvm use
  npm install

    If you do not have the correct version of node installed, replace nvm use with nvm install $(cat .nvmrc) in the above example.

    Dashboard versioning

    By default, the master branch will be used for dashboard deployments. The master branch of the dashboard repo contains the most recent stable release of the dashboard.

    If you want to test unreleased changes to the dashboard, use the develop branch.

    Each release/version of the dashboard will have a tag in the dashboard repo. Release/version numbers will use semantic versioning (major/minor/patch).

    To checkout and install a specific version of the dashboard:

  git fetch --tags
  git checkout <version-number> # e.g. v1.2.0
  nvm use
  npm install

    If you do not have the correct version of node installed, replace nvm use with nvm install $(cat .nvmrc) in the above example.

    Building the dashboard

    Note: These environment variables are available during the build: APIROOT, DAAC_NAME, STAGE, HIDE_PDR. Any of these can be set on the command line to override the values contained in config.js when running the build below.

To configure your dashboard for deployment, set the APIROOT environment variable to your app's API root [3].

    Build the dashboard from the dashboard repository root directory, cumulus-dashboard:

      APIROOT=<your_api_root> npm run build

    Dashboard deployment

    Deploy dashboard to s3 bucket from the cumulus-dashboard directory:

    Using AWS CLI:

      aws s3 sync dist s3://<prefix>-dashboard --acl public-read

    From the S3 Console:

    • Open the <prefix>-dashboard bucket, click 'upload'. Add the contents of the 'dist' subdirectory to the upload. Then select 'Next'. On the permissions window allow the public to view. Select 'Upload'.

You should be able to visit the dashboard website at http://<prefix>-dashboard.s3-website-<region>.amazonaws.com or find the URL via <prefix>-dashboard -> "Properties" -> "Static website hosting" -> "Endpoint", and log in with a user that you configured for access in the Configure and Deploy the Cumulus Stack step.


    Cumulus Instance Sizing

The Cumulus deployment's default sizing for Elasticsearch instances, EC2 instances, and Autoscaling Groups is small and designed for testing and cost savings. The default settings are likely not suitable for production workloads. Sizing is highly individual and dependent on expected load and archive size.

    Please be cognizant of costs as any change in size will affect your AWS bill. AWS provides a pricing calculator for estimating costs.

    Elasticsearch

    The mappings file contains all of the data types that will be indexed into Elasticsearch. Elasticsearch sizing is tied to your archive size, including your collections, granules, and workflow executions that will be stored.

    AWS provides documentation on calculating and configuring for sizing.

In addition to size, you'll want to consider the number of nodes, which determines how the system reacts in the event of a failure.

    Configuration can be done in the data persistence module in elasticsearch_config and the cumulus module in es_index_shards.
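As a hypothetical sketch only (the field names inside elasticsearch_config are assumptions; check the data persistence module's variable definitions for the exact schema your release accepts, and treat the values as placeholders):

# data-persistence-tf/terraform.tfvars -- illustrative values, not recommendations
elasticsearch_config = {
  domain_name    = "es"
  instance_count = 2
  instance_type  = "t2.small.elasticsearch"
  version        = "5.3"
  volume_size    = 10
}

# cumulus-tf/terraform.tfvars
es_index_shards = 2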

    If you make changes to your Elasticsearch configuration you will need to reindex for those changes to take effect.

    EC2 instances and autoscaling groups

EC2 instances are used for long-running operations (e.g. generating a reconciliation report) and long-running workflow tasks. Configuration for your ECS cluster is achieved via Cumulus deployment variables.

    When configuring your ECS cluster consider:

    • The EC2 instance type and EBS volume size needed to accommodate your workloads. Configured as ecs_cluster_instance_type and ecs_cluster_instance_docker_volume_size.
    • The minimum and desired number of instances on hand to accommodate your workloads. Configured as ecs_cluster_min_size and ecs_cluster_desired_size.
    • The maximum number of instances you will need and are willing to pay for to accommodate your heaviest workloads. Configured as ecs_cluster_max_size.
    • Your autoscaling parameters: ecs_cluster_scale_in_adjustment_percent, ecs_cluster_scale_out_adjustment_percent, ecs_cluster_scale_in_threshold_percent, and ecs_cluster_scale_out_threshold_percent.
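For illustration, these variables might appear in your cumulus-tf/terraform.tfvars as follows; the values are placeholders, not sizing recommendations:

# Illustrative values only -- size for your own workloads
ecs_cluster_instance_type                = "t3.medium"
ecs_cluster_instance_docker_volume_size  = 50
ecs_cluster_min_size                     = 1
ecs_cluster_desired_size                 = 1
ecs_cluster_max_size                     = 2
ecs_cluster_scale_in_threshold_percent   = 25
ecs_cluster_scale_in_adjustment_percent  = -5
ecs_cluster_scale_out_threshold_percent  = 75
ecs_cluster_scale_out_adjustment_percent = 10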

    Footnotes


    1. Run terraform init if:

      • This is the first time deploying the module
      • You have added any additional child modules, including Cumulus components
      • You have updated the source for any of the child modules

2. To add another redirect URI to your application: on the Earthdata home page, select "My Applications", scroll down to "Application Administration", and use the edit icon for your application. Then go to Manage -> Redirect URIs.

3. The API root can be found in a number of ways. The easiest is to note it in the output of the app deployment step, but you can also find it from the AWS console -> Amazon API Gateway -> APIs -> <prefix>-archive -> Dashboard, and read the URL at the top after "Invoke this API at".

    - + \ No newline at end of file diff --git a/docs/v9.9.0/deployment/postgres_database_deployment/index.html b/docs/v9.9.0/deployment/postgres_database_deployment/index.html index b6941fb4101..8d014fbe56e 100644 --- a/docs/v9.9.0/deployment/postgres_database_deployment/index.html +++ b/docs/v9.9.0/deployment/postgres_database_deployment/index.html @@ -5,7 +5,7 @@ PostgreSQL Database Deployment | Cumulus Documentation - + @@ -16,7 +16,7 @@ cumulus-rds-tf that will deploy an AWS RDS Aurora Serverless PostgreSQL 10.2 compatible database cluster, and optionally provision a single deployment database with credentialed secrets for use with Cumulus.

    We have provided an example terraform deployment using this module in the Cumulus template-deploy repository on github.

    Use of this example involves:

    • Creating/configuring a Terraform module directory
    • Using Terraform to deploy resources to AWS

    Requirements

    Configuration/installation of this module requires the following:

    • Terraform
    • git
    • A VPC configured for use with Cumulus Core. This should match the subnets you provide when Deploying Cumulus to allow Core's lambdas to properly access the database.
    • At least two subnets across multiple AZs. These should match the subnets you provide as configuration when Deploying Cumulus, and should be within the same VPC.

    Needed Git Repositories

    Assumptions

    OS/Environment

    The instructions in this module require Linux/MacOS. While deployment via Windows is possible, it is unsupported.

    Terraform

    This document assumes knowledge of Terraform. If you are not comfortable working with Terraform, the following links should bring you up to speed:

    For Cumulus specific instructions on installation of Terraform, refer to the main Cumulus Installation Documentation

    Aurora/RDS

    This document also assumes some basic familiarity with PostgreSQL databases, and Amazon Aurora/RDS. If you're unfamiliar consider perusing the AWS docs, and the Aurora Serverless V1 docs.

    Prepare deployment repository

    If you already are working with an existing repository that has a configured rds-cluster-tf deployment for the version of Cumulus you intend to deploy or update, or just need to configure this module for your repository, skip to Prepare AWS configuration.

    Clone the cumulus-template-deploy repo and name appropriately for your organization:

      git clone https://github.com/nasa/cumulus-template-deploy <repository-name>

    We will return to configuring this repo and using it for deployment below.

    Optional: Create a new repository

    Create a new repository on Github so that you can add your workflows and other modules to source control:

      git remote set-url origin https://github.com/<org>/<repository-name>
    git push origin master

    You can then add/commit changes as needed.

    Note: If you are pushing your deployment code to a git repo, make sure to add terraform.tf and terraform.tfvars to .gitignore, as these files will contain sensitive data related to your AWS account.


    Prepare AWS configuration

To deploy this module, make sure that you have completed the following steps from the Cumulus deployment instructions, in a similar fashion for this module:

    --

    Configure and deploy the module

When configuring this module, please keep in mind that unlike the Cumulus deployment, this module should be deployed once to create the database cluster, and thereafter only to make configuration changes or upgrades. This module does not need to be re-deployed for each Core update.

    These steps should be executed in the rds-cluster-tf directory of the template deploy repo that you previously cloned. Run the following to copy the example files:

    cd rds-cluster-tf/
    cp terraform.tf.example terraform.tf
    cp terraform.tfvars.example terraform.tfvars

    In terraform.tf, configure the remote state settings by substituting the appropriate values for:

    • bucket
    • dynamodb_table
    • PREFIX (whatever prefix you've chosen for your deployment)
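For example, a minimal terraform.tf remote state block might look like the following; the bucket, key, and table names are placeholders for your own values:

terraform {
  backend "s3" {
    region         = "us-east-1"
    bucket         = "PREFIX-tf-state"
    key            = "PREFIX/rds-cluster/terraform.tfstate"
    dynamodb_table = "PREFIX-tf-locks"
  }
}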

    Fill in the appropriate values in terraform.tfvars. See the rds-cluster-tf module variable definitions for more detail on all of the configuration options. A few notable configuration options are documented in the next section.

    Configuration Options

    • deletion_protection -- defaults to true. Set it to false if you want to be able to delete your cluster with a terraform destroy without manually updating the cluster.
    • db_admin_username -- cluster database administration username. Defaults to postgres.
    • db_admin_password -- required variable that specifies the admin user password for the cluster. To randomize this on each deployment, consider using a random_string resource as input.
    • region -- defaults to us-east-1.
    • subnets -- requires at least 2 across different AZs. For use with Cumulus, these AZs should match the values you configure for your lambda_subnet_ids.
    • max_capacity -- the max ACUs the cluster is allowed to use. Carefully consider cost/performance concerns when setting this value.
    • min_capacity -- the minimum ACUs the cluster will scale to
    • provision_user_database -- Optional flag to allow module to provision a user database in addition to creating the cluster. Described in the next section.
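As an illustrative terraform.tfvars sketch of these options (values are placeholders, not recommendations, and required variables not discussed here are omitted):

deletion_protection = true
db_admin_username   = "postgres"
db_admin_password   = "change-me"
region              = "us-east-1"
subnets             = ["subnet-0abc1234", "subnet-0def5678"]
max_capacity        = 4
min_capacity        = 2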

    Provision user and user database

    If you wish for the module to provision a PostgreSQL database on your new cluster and provide a secret for access in the module output, in addition to managing the cluster itself, the following configuration keys are required:

• provision_user_database -- must be set to true; this configures the module to deploy a lambda that will create the user database and update the provided configuration on deploy.
• permissions_boundary_arn -- the permissions boundary to use when creating the roles the provisioning lambda will need for access. In most use cases this should be the same one used for the Cumulus Core deployment.
• rds_user_password -- the value to set the user password to
• prefix -- this value will be used to set a unique identifier for the ProvisionDatabase lambda, as well as to name the provisioned user/database.
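A minimal sketch of the corresponding terraform.tfvars additions, with placeholder values:

provision_user_database  = true
permissions_boundary_arn = "arn:aws:iam::123456789012:policy/YourRoleBoundary"
rds_user_password        = "change-me"
prefix                   = "my-cumulus"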

    Once configured, the module will deploy the lambda, and run it on each provision, creating the configured database if it does not exist, updating the user password if that value has been changed, and updating the output user database secret.

    Setting provision_user_database to false after provisioning will not result in removal of the configured database, as the lambda is non-destructive as configured in this module.

    Please Note: This functionality is limited in that it will only provision a single database/user and configure a basic database, and should not be used in scenarios where more complex configuration is required.

    Initialize Terraform

    Run terraform init

    You should see output like:

    * provider.aws: version = "~> 2.32"

    Terraform has been successfully initialized!

    Deploy

    Run terraform apply to deploy the resources.

If re-applying this module, variables (e.g. engine_version, snapshot_identifier) that force a recreation of the database cluster may result in data loss if deletion protection is disabled. Examine the changeset carefully for resources that will be re-created/destroyed before applying.

    Review the changeset, and assuming it looks correct, type yes when prompted to confirm that you want to create all of the resources.

    Assuming the operation is successful, you should see output similar to the following (this example omits the creation of a user database/lambdas/security groups):

    terraform apply

    An execution plan has been generated and is shown below.
    Resource actions are indicated with the following symbols:
    + create

    Terraform will perform the following actions:

    # module.rds_cluster.aws_db_subnet_group.default will be created
    + resource "aws_db_subnet_group" "default" {
    + arn = (known after apply)
    + description = "Managed by Terraform"
    + id = (known after apply)
    + name = (known after apply)
    + name_prefix = "xxxxxxxxx"
    + subnet_ids = [
    + "subnet-xxxxxxxxx",
    + "subnet-xxxxxxxxx",
    ]
    + tags = {
    + "Deployment" = "xxxxxxxxx"
    }
    }

    # module.rds_cluster.aws_rds_cluster.cumulus will be created
    + resource "aws_rds_cluster" "cumulus" {
    + apply_immediately = true
    + arn = (known after apply)
    + availability_zones = (known after apply)
    + backup_retention_period = 1
    + cluster_identifier = "xxxxxxxxx"
    + cluster_identifier_prefix = (known after apply)
    + cluster_members = (known after apply)
    + cluster_resource_id = (known after apply)
    + copy_tags_to_snapshot = false
    + database_name = "xxxxxxxxx"
    + db_cluster_parameter_group_name = (known after apply)
    + db_subnet_group_name = (known after apply)
    + deletion_protection = true
    + enable_http_endpoint = true
    + endpoint = (known after apply)
    + engine = "aurora-postgresql"
    + engine_mode = "serverless"
    + engine_version = "10.12"
    + final_snapshot_identifier = "xxxxxxxxx"
    + hosted_zone_id = (known after apply)
    + id = (known after apply)
    + kms_key_id = (known after apply)
    + master_password = (sensitive value)
    + master_username = "xxxxxxxxx"
    + port = (known after apply)
    + preferred_backup_window = "07:00-09:00"
    + preferred_maintenance_window = (known after apply)
    + reader_endpoint = (known after apply)
    + skip_final_snapshot = false
    + storage_encrypted = (known after apply)
    + tags = {
    + "Deployment" = "xxxxxxxxx"
    }
    + vpc_security_group_ids = (known after apply)

    + scaling_configuration {
    + auto_pause = true
    + max_capacity = 4
    + min_capacity = 2
    + seconds_until_auto_pause = 300
    + timeout_action = "RollbackCapacityChange"
    }
    }

    # module.rds_cluster.aws_secretsmanager_secret.rds_login will be created
    + resource "aws_secretsmanager_secret" "rds_login" {
    + arn = (known after apply)
    + id = (known after apply)
    + name = (known after apply)
    + name_prefix = "xxxxxxxxx"
    + policy = (known after apply)
    + recovery_window_in_days = 30
    + rotation_enabled = (known after apply)
    + rotation_lambda_arn = (known after apply)
    + tags = {
    + "Deployment" = "xxxxxxxxx"
    }

    + rotation_rules {
    + automatically_after_days = (known after apply)
    }
    }

    # module.rds_cluster.aws_secretsmanager_secret_version.rds_login will be created
    + resource "aws_secretsmanager_secret_version" "rds_login" {
    + arn = (known after apply)
    + id = (known after apply)
    + secret_id = (known after apply)
    + secret_string = (sensitive value)
    + version_id = (known after apply)
    + version_stages = (known after apply)
    }

    # module.rds_cluster.aws_security_group.rds_cluster_access will be created
    + resource "aws_security_group" "rds_cluster_access" {
    + arn = (known after apply)
    + description = "Managed by Terraform"
    + egress = (known after apply)
    + id = (known after apply)
    + ingress = (known after apply)
    + name = (known after apply)
    + name_prefix = "cumulus_rds_cluster_access_ingress"
    + owner_id = (known after apply)
    + revoke_rules_on_delete = false
    + tags = {
    + "Deployment" = "xxxxxxxxx"
    }
    + vpc_id = "vpc-xxxxxxxxx"
    }

    # module.rds_cluster.aws_security_group_rule.rds_security_group_allow_PostgreSQL will be created
    + resource "aws_security_group_rule" "rds_security_group_allow_postgres" {
    + from_port = 5432
    + id = (known after apply)
    + protocol = "tcp"
    + security_group_id = (known after apply)
    + self = true
    + source_security_group_id = (known after apply)
    + to_port = 5432
    + type = "ingress"
    }

    Plan: 6 to add, 0 to change, 0 to destroy.

    Do you want to perform these actions?
    Terraform will perform the actions described above.
    Only 'yes' will be accepted to approve.

    Enter a value: yes

    module.rds_cluster.aws_db_subnet_group.default: Creating...
    module.rds_cluster.aws_security_group.rds_cluster_access: Creating...
    module.rds_cluster.aws_secretsmanager_secret.rds_login: Creating...

    Then, after the resources are created:

    Apply complete! Resources: X added, 0 changed, 0 destroyed.
    Releasing state lock. This may take a few moments...

    Outputs:

    admin_db_login_secret_arn = arn:aws:secretsmanager:us-east-1:xxxxxxxxx:secret:xxxxxxxxxx20210407182709367700000002-dpmdR
    admin_db_login_secret_version = xxxxxxxxx
    rds_endpoint = xxxxxxxxx.us-east-1.rds.amazonaws.com
    security_group_id = xxxxxxxxx
    user_credentials_secret_arn = arn:aws:secretsmanager:us-east-1:xxxxx:secret:xxxxxxxxxx20210407182709367700000002-dpmpXA

    Note the output values for admin_db_login_secret_arn (and optionally user_credentials_secret_arn) as these provide the AWS Secrets Manager secret required to access the database as the administrative user and, optionally, the user database credentials Cumulus requires as well.

The content of each of these secrets is in the form:

{
  "database": "postgres",
  "dbClusterIdentifier": "clusterName",
  "engine": "postgres",
  "host": "xxx",
  "password": "defaultPassword",
  "port": 5432,
  "username": "xxx"
}
    • database -- the PostgreSQL database used by the configured user
    • dbClusterIdentifier -- the value set by the cluster_identifier variable in the terraform module
    • engine -- the Aurora/RDS database engine
    • host -- the RDS service host for the database in the form (dbClusterIdentifier)-(AWS ID string).(region).rds.amazonaws.com
    • password -- the database password
    • username -- the account username
    • port -- The database connection port, should always be 5432
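If you want to inspect these credentials, one option is the AWS CLI; the secret ID below is a placeholder for the ARN noted from your Terraform output:

aws secretsmanager get-secret-value \
  --secret-id <user_credentials_secret_arn> \
  --query SecretString \
  --output text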

    Next Steps

    The database cluster has been created/updated! From here you can continue to add additional user accounts, databases and other database configuration.

    - + \ No newline at end of file diff --git a/docs/v9.9.0/deployment/share-s3-access-logs/index.html b/docs/v9.9.0/deployment/share-s3-access-logs/index.html index 8d2835b1b29..ab9eae816f6 100644 --- a/docs/v9.9.0/deployment/share-s3-access-logs/index.html +++ b/docs/v9.9.0/deployment/share-s3-access-logs/index.html @@ -5,14 +5,14 @@ Share S3 Access Logs | Cumulus Documentation - +
    Version: v9.9.0

    Share S3 Access Logs

It is possible through Cumulus to share S3 access logs across multiple S3 buckets using the S3 replicator package.

    S3 Replicator

    The S3 Replicator is a node package that contains a simple lambda function, associated permissions, and the Terraform instructions to replicate create-object events from one S3 bucket to another.

    First ensure that you have enabled S3 Server Access Logging.

    Next configure your config.tfvars as described in the s3-replicator/README.md to correspond to your deployment. The source_bucket and source_prefix are determined by how you enabled the S3 Server Access Logging.

    In order to deploy the s3-replicator with cumulus you will need to add the module to your terraform main.tf definition. e.g.

    module "s3-replicator" {
    source = "<path to s3-replicator.zip>"
    prefix = var.prefix
    vpc_id = var.vpc_id
    subnet_ids = var.subnet_ids
    permissions_boundary = var.permissions_boundary_arn
    source_bucket = var.s3_replicator_config.source_bucket
    source_prefix = var.s3_replicator_config.source_prefix
    target_bucket = var.s3_replicator_config.target_bucket
    target_prefix = var.s3_replicator_config.target_prefix
    }

    The terraform source package can be found on the Cumulus github release page under the asset tab terraform-aws-cumulus-s3-replicator.zip.

    ESDIS Metrics

    In the NGAP environment, the ESDIS Metrics team has set up an ELK stack to process logs from Cumulus instances. To use this system, you must deliver any S3 Server Access logs that Cumulus creates.

    Configure the S3 replicator as described above using the target_bucket and target_prefix provided by the metrics team.

    The metrics team has taken care of setting up Logstash to ingest the files that get delivered to their bucket into their Elasticsearch instance.

    - + \ No newline at end of file diff --git a/docs/v9.9.0/deployment/terraform-best-practices/index.html b/docs/v9.9.0/deployment/terraform-best-practices/index.html index 20bca7e1362..54b1496b3e9 100644 --- a/docs/v9.9.0/deployment/terraform-best-practices/index.html +++ b/docs/v9.9.0/deployment/terraform-best-practices/index.html @@ -5,7 +5,7 @@ Terraform Best Practices | Cumulus Documentation - + @@ -88,7 +88,7 @@ AWS CLI command, replacing PREFIX with your deployment prefix name:

    aws resourcegroupstaggingapi get-resources \
    --query "ResourceTagMappingList[].ResourceARN" \
    --tag-filters Key=Deployment,Values=PREFIX

    Ideally, the output should be an empty list, but if it is not, then you may need to manually delete the listed resources.

Configuring the Cumulus deployment: link
Restoring a previous version: link

    - + \ No newline at end of file diff --git a/docs/v9.9.0/deployment/thin_egress_app/index.html b/docs/v9.9.0/deployment/thin_egress_app/index.html index 00941126510..b28921855c8 100644 --- a/docs/v9.9.0/deployment/thin_egress_app/index.html +++ b/docs/v9.9.0/deployment/thin_egress_app/index.html @@ -5,7 +5,7 @@ Using the Thin Egress App for Cumulus distribution | Cumulus Documentation - + @@ -13,7 +13,7 @@
    Version: v9.9.0

    Using the Thin Egress App for Cumulus distribution

    The Thin Egress App (TEA) is an app running in Lambda that allows retrieving data from S3 using temporary links and provides URS integration.

    Configuring a TEA deployment

    TEA is deployed using Terraform modules. Refer to these instructions for guidance on how to integrate new components with your deployment.

The cumulus-template-deploy repository cumulus-tf/main.tf contains a thin_egress_app module for distribution.

The TEA module provides these instructions showing how to add it to your deployment; the following are instructions to configure the thin_egress_app module in your Cumulus deployment.

    Create a secret for signing Thin Egress App JWTs

    The Thin Egress App uses JWTs internally to authenticate requests and requires a secret stored in AWS Secrets Manager containing SSH keys that are used to sign the JWTs.

    See the Thin Egress App documentation on how to create this secret with the correct values. It will be used later to set the thin_egress_jwt_secret_name variable when deploying the Cumulus module.

    bucket_map.yaml

    The Thin Egress App uses a bucket_map.yaml file to determine which buckets to serve. Documentation of the file format is available here.

    The default Cumulus module generates a file at s3://${system_bucket}/distribution_bucket_map.json.

The configuration file is a simple JSON mapping of the form:

{
  "daac-public-data-bucket": "/path/to/this/kind/of/data"
}

    Please note: Cumulus only supports a one-to-one mapping of bucket->TEA path for 'distribution' buckets.

    Optionally configure a custom bucket map

    A simple config would look something like this:

bucket_map.yaml
MAP:
  my-protected: my-protected
  my-public: my-public

PUBLIC_BUCKETS:
  - my-public

    Please note: your custom bucket map must include mappings for all of the protected and public buckets specified in the buckets variable in cumulus-tf/terraform.tfvars, otherwise Cumulus may not be able to determine the correct distribution URL for ingested files and you may encounter errors.

    Optionally configure shared variables

    The cumulus module deploys certain components that interact with TEA. As a result, the cumulus module requires that if you are specifying a value for the stage_name variable to the TEA module, you must use the same value for the tea_api_gateway_stage variable to the cumulus module.

    One way to keep these variable values in sync across the modules is to use Terraform local values to define values to use for the variables for both modules. This approach is shown in the Cumulus core example deployment code.
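A rough sketch of that approach (the module sources and the stage value are placeholders, not the exact example-deployment code):

locals {
  tea_stage_name = "DEV"
}

module "thin_egress_app" {
  source     = "<TEA module source>"
  # ... other TEA configuration ...
  stage_name = local.tea_stage_name
}

module "cumulus" {
  source                = "<Cumulus module source>"
  # ... other Cumulus configuration ...
  tea_api_gateway_stage = local.tea_stage_name
}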

    - + \ No newline at end of file diff --git a/docs/v9.9.0/deployment/upgrade-readme/index.html b/docs/v9.9.0/deployment/upgrade-readme/index.html index 35f4880c609..055fa278a49 100644 --- a/docs/v9.9.0/deployment/upgrade-readme/index.html +++ b/docs/v9.9.0/deployment/upgrade-readme/index.html @@ -5,7 +5,7 @@ Upgrading Cumulus | Cumulus Documentation - + @@ -15,7 +15,7 @@ deployment functions correctly. Please refer to some recommended smoke tests given above, and consider additional tests appropriate for your particular deployment and environment.

    Update Cumulus Dashboard

    If there are breaking (or otherwise significant) changes to the Cumulus API, you should also upgrade your Cumulus Dashboard deployment to use the version of the Cumulus API matching the version of Cumulus to which you are migrating.

    - + \ No newline at end of file diff --git a/docs/v9.9.0/development/forked-pr/index.html b/docs/v9.9.0/development/forked-pr/index.html index a4f6a92b5a8..9901f434222 100644 --- a/docs/v9.9.0/development/forked-pr/index.html +++ b/docs/v9.9.0/development/forked-pr/index.html @@ -5,13 +5,13 @@ Issuing PR From Forked Repos | Cumulus Documentation - +
    Version: v9.9.0

    Issuing PR From Forked Repos

    Fork the Repo

    • Fork the Cumulus repo
    • Create a new branch from the branch you'd like to contribute to
• If an issue doesn't already exist, submit one (see above)

    Create a Pull Request

    Reviewing PRs from Forked Repos

    Upon submission of a pull request, the Cumulus development team will review the code.

    Once the code passes an initial review, the team will run the CI tests against the proposed update.

    The request will then either be merged, declined, or an adjustment to the code will be requested via the issue opened with the original PR request.

PRs from forked repos cannot be merged directly to master. Cumulus reviewers must follow the steps below before completing the review process:

    1. Create a new branch:

        git checkout -b from-<name-of-the-branch> master
    2. Push the new branch to GitHub

    3. Change the destination of the forked PR to the new branch that was just pushed

      Screenshot of Github interface showing how to change the base branch of a pull request

    4. After code review and approval, merge the forked PR to the new branch.

    5. Create a PR for the new branch to master.

6. If the CI tests pass, merge the new branch to master and close the issue. If the CI tests do not pass, request an amended PR from the original author or resolve failures as appropriate.

    - + \ No newline at end of file diff --git a/docs/v9.9.0/development/integration-tests/index.html b/docs/v9.9.0/development/integration-tests/index.html index a65badf8af4..cb2768f9c8c 100644 --- a/docs/v9.9.0/development/integration-tests/index.html +++ b/docs/v9.9.0/development/integration-tests/index.html @@ -5,7 +5,7 @@ Integration Tests | Cumulus Documentation - + @@ -19,7 +19,7 @@ in the commit message.

    If you create a new stack and want to be able to run integration tests against it in CI, you will need to add it to bamboo/select-stack.js.

    - + \ No newline at end of file diff --git a/docs/v9.9.0/development/quality-and-coverage/index.html b/docs/v9.9.0/development/quality-and-coverage/index.html index 1e1b1dc1bc1..38ff650978a 100644 --- a/docs/v9.9.0/development/quality-and-coverage/index.html +++ b/docs/v9.9.0/development/quality-and-coverage/index.html @@ -5,7 +5,7 @@ Code Coverage and Quality | Cumulus Documentation - + @@ -23,7 +23,7 @@ here.

    To run linting on the markdown files, run npm run lint-md.

    Audit

    This project uses audit-ci to run a security audit on the package dependency tree. This must pass prior to merge. The configured rules for audit-ci can be found here.

    To execute an audit, run npm run audit.

    - + \ No newline at end of file diff --git a/docs/v9.9.0/development/release/index.html b/docs/v9.9.0/development/release/index.html index 594dc39e17a..2b3f4444f66 100644 --- a/docs/v9.9.0/development/release/index.html +++ b/docs/v9.9.0/development/release/index.html @@ -5,7 +5,7 @@ Versioning and Releases | Cumulus Documentation - + @@ -13,7 +13,7 @@
    Version: v9.9.0

    Versioning and Releases

    Versioning

We use a global versioning approach, meaning version numbers in cumulus are consistent across all packages and tasks, and semantic versioning to track major, minor, and patch versions (e.g. 1.0.0). We use Lerna to manage our versioning. Any change will force Lerna to increment the version of all packages.

Read more about semantic versioning here.

    Pre-release testing

    Note: This is only necessary when preparing a release for a new major version of Cumulus (e.g. preparing to go from 6.x.x to 7.0.0)

    Before releasing a new major version of Cumulus, we should test the deployment upgrade path from the latest release of Cumulus to the upcoming release.

    It is preferable to use the cumulus-template-deploy repo for testing the deployment, since that repo is the officially recommended deployment configuration for end users.

    You should create an entirely new deployment for this testing to replicate the end user upgrade path. Using an existing test or CI deployment would not be useful because that deployment may already have been deployed with the latest changes and not match the upgrade path for end users.

    Pre-release testing steps:

    1. Checkout the cumulus-template-deploy repo

    2. Update the deployment code to use the latest release artifacts if it wasn't done already. For example, assuming that the latest release was 5.0.1, update the deployment files as follows:

      # in data-persistence-tf/main.tf
      source = "https://github.com/nasa/cumulus/releases/download/v5.0.1/terraform-aws-cumulus.zip//tf-modules/data-persistence"

      # in cumulus-tf/main.tf
      source = "https://github.com/nasa/cumulus/releases/download/v5.0.1/terraform-aws-cumulus.zip//tf-modules/cumulus"
    3. For both the data-persistence-tf and cumulus-tf modules:

      1. Add the necessary backend configuration (terraform.tf) and variables (terraform.tfvars)
        • You should use an entirely new deployment for this testing, so make sure to use values for key in terraform.tf and prefix in terraform.tfvars that don't collide with existing deployments
      2. Run terraform init
      3. Run terraform apply
    4. Checkout the master branch of the cumulus repo

    5. Run a full bootstrap of the code: npm run bootstrap

    6. Build the pre-release artifacts: ./bamboo/create-release-artifacts.sh

    7. For both the data-persistence-tf and cumulus-tf modules:

      1. Update the deployment to use the built release artifacts:

        # in data-persistence-tf/main.tf
        source = "[path]/cumulus/terraform-aws-cumulus.zip//tf-modules/data-persistence"

        # in cumulus-tf/main.tf
        source = "/Users/mboyd/development/cumulus/terraform-aws-cumulus.zip//tf-modules/cumulus"
      2. Review the CHANGELOG.md for any pre-deployment migration steps. If there are, go through the steps and confirm that they are successful

      3. Run terraform init

      4. Run terraform apply

    8. Review the CHANGELOG.md for any post-deployment migration steps and confirm that they are successful

    9. Delete your test deployment by running terraform destroy in cumulus-tf and data-persistence-tf

    Updating Cumulus version and publishing to NPM

    1. Create a branch for the new release

    From Master

    Create a branch titled release-MAJOR.MINOR.x for the release (use a literal x for the patch version).

        git checkout -b release-MAJOR.MINOR.x

    e.g.:
    git checkout -b release-9.1.x

    If creating a new major version release from master, say 5.0.0, then the branch would be named release-5.0.x. If creating a new minor version release from master, say 1.14.0 then the branch would be named release-1.14.x.

    Having a release branch for each major/minor version allows us to easily backport patches to that version.

    Push the release-MAJOR.MINOR.x branch to GitHub if it was created locally. (Commits should be even with master at this point.)

    If creating a patch release, you can check out the existing base branch.

    Then create the release branch (e.g. release-1.14.0) from the minor version base branch. For example, from the release-1.14.x branch:

    git checkout -b release-1.14.0

    Backporting

    When creating a backport, a minor version base branch should already exist on GitHub. Check out the existing minor version base branch then create a release branch from it. For example:

    # check out existing minor version base branch
    git checkout release-1.14.x
    # pull to ensure you have the latest changes
    git pull origin release-1.14.x
    # create new release branch for backport
    git checkout -b release-1.14.1
    # cherry pick the commits (or single squashed commit of changes) relevant to the backport
    git cherry-pick [replace-with-commit-SHA]
    # push up the changes to the release branch
    git push

    2. Update the Cumulus version number

    When changes are ready to be released, the Cumulus version number must be updated.

    Lerna handles the process of deciding which version number should be used as long as the developer specifies whether the change is a major, minor, or patch change.

    To update Cumulus's version number run:

    npm run update

    Screenshot of terminal showing interactive prompt from Lerna for selecting the new release version

    Lerna will handle updating the packages and all of the dependent package version numbers. If a dependency has not been changed with the update, however, lerna will not update the version of the dependency.

Note: Lerna will struggle to correctly update the versions on any non-standard/alpha versions (e.g. 1.17.0-alpha0). Please be sure to check any packages that are new or have been manually published since the previous release, and any packages that list them as a dependency, to ensure the listed versions are correct. It's useful to use the search feature of your code editor or grep to see if there are any references to outdated package versions.
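For example, assuming the previous release was 9.1.0 (a placeholder for your actual previous version), a quick search for stale version strings might look like:

grep -rn --include=package.json '"9.1.0"' packages tasks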

    3. Check Cumulus Dashboard PRs for Version Bump

    There may be unreleased changes in the Cumulus Dashboard project that rely on this unreleased Cumulus Core version.

If there exists a PR in the cumulus-dashboard repo with a name containing "Version Bump for Next Cumulus API Release":

    • There will be a placeholder change-me value that should be replaced with the Cumulus Core to-be-released-version.
    • Mark that PR as ready to be reviewed.

    4. Update CHANGELOG.md

    Update the CHANGELOG.md. Put a header under the Unreleased section with the new version number and the date.

    Add a link reference for the github "compare" view at the bottom of the CHANGELOG.md, following the existing pattern. This link reference should create a link in the CHANGELOG's release header to changes in the corresponding release.
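For illustration, assuming a hypothetical v9.2.0 release, the new header (placed under the Unreleased section) and the link reference (placed at the bottom of the file) might look like the following; match the exact style of the existing entries in CHANGELOG.md:

## [v9.2.0] 2021-08-01

[v9.2.0]: https://github.com/nasa/cumulus/compare/v9.1.0...v9.2.0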

    5. Update DATA_MODEL_CHANGELOG.md

    Similar to #4, make sure the DATA_MODEL_CHANGELOG is updated if there are data model changes in the release, and the link reference at the end of the document is updated as appropriate.

    6. Update CONTRIBUTORS.md

    ./bin/update-contributors.sh
    git add CONTRIBUTORS.md

    Commit and push these changes, if any.

    7. Update Cumulus package API documentation

    Update auto-generated API documentation for any Cumulus packages that have it:

    npm run docs-build-packages

    Commit and push these changes, if any.

    8. Cut new version of Cumulus Documentation

    If this is a backport, do not create a new version of the documentation. For various reasons, we do not merge backports back to master, other than changelog notes. Documentation changes for backports will not be published to our documentation website.

    cd website
    npm run version ${release_version}
    git add .

    Where ${release_version} corresponds to the version tag v1.2.3, for example.

    Commit and push these changes.

    9. Create a pull request against the minor version branch

    1. Push the release branch (e.g. release-1.2.3) to GitHub.

    2. Create a PR against the minor version base branch (e.g. release-1.2.x).

    3. Configure Bamboo to run automated tests against this PR by finding the branch plan for the release branch (release-1.2.3) and setting only these variables:

      • GIT_PR: true
      • SKIP_AUDIT: true

      IMPORTANT: Do NOT set the PUBLISH_FLAG variable to true for this branch plan. The actual publishing of the release will be handled by a separate, manually triggered branch plan.

      Screenshot of Bamboo CI interface showing the configuration of the GIT_PR branch variable to have a value of &quot;true&quot;

    4. Verify that the Bamboo build for the PR succeeds and then merge to the minor version base branch (release-1.2.x).

      • It is safe to do a squash merge in this instance, but not required
    5. You may delete your release branch (release-1.2.3) after merging to the base branch.

    10. Create a git tag for the release

    Check out the minor version base branch now that your changes are merged in and do a git pull.

    Ensure you are on the latest commit.

    Create and push a new git tag:

  git tag -a vMAJOR.MINOR.PATCH -m "Release MAJOR.MINOR.PATCH"
  git push origin vMAJOR.MINOR.PATCH

    e.g.:
    git tag -a v9.1.0 -m "Release 9.1.0"
    git push origin v9.1.0

    11. Publishing the release

    Publishing of new releases is handled by a custom Bamboo branch plan and is manually triggered.

    The reasons for using a separate branch plan to handle releases instead of the branch plan for the minor version (e.g. release-1.2.x) are:

    • The Bamboo build for the minor version release branch is triggered automatically on any commits to that branch, whereas we want to manually control when the release is published.
    • We want to verify that integration tests have passed on the Bamboo build for the minor version release branch before we manually trigger the release, so that we can be sure that our code is safe to release.

    If this is a new minor version branch, then you will need to create a new Bamboo branch plan for publishing the release following the instructions below:

    Creating a Bamboo branch plan for the release

    • In the Cumulus Core project (https://ci.earthdata.nasa.gov/browse/CUM-CBA), click Actions -> Configure Plan in the top right.

    • Next to Plan branch click the rightmost button that displays Create Plan Branch upon hover.

    • Click Create plan branch manually.

• Add the values in that list. Choose a display name that makes it very clear this is a deployment branch plan. Release (minor version branch name) seems to work well (e.g. Release (1.2.x)).

      • Make sure you enter the correct branch name (e.g. release-1.2.x).
    • Important Deselect Enable Branch - if you do not do this, it will immediately fire off a build.

    • Immediately, on the Branch Details page, enable Change trigger and set the Trigger type to manual; this will prevent commits to the branch from triggering the build plan. You should have been redirected to the Branch Details tab after creating the plan. If not, navigate to the branch from the list where you clicked Create Plan Branch in the previous step.

    • Go to the Variables tab. Ensure that you are on your branch plan and not the master plan: you should not see a large list of configured variables, but instead a dropdown allowing you to select variables to override, and the tab title will be Branch Variables. Then set the branch variables as follows:

      • DEPLOYMENT: cumulus-from-npm-tf (except in special cases such as incompatible backport branches)
        • If this variable is not set, it will default to the deployment name for the last committer on the branch
      • USE_CACHED_BOOTSTRAP: false
      • USE_TERRAFORM_ZIPS: true (IMPORTANT: MUST be set in order to run integration tests against the .zip files published during the build so that we are actually testing our released files)
      • GIT_PR: true
      • SKIP_AUDIT: true
      • PUBLISH_FLAG: true
    • Enable the branch from the Branch Details page.

    • Run the branch using the Run button in the top right.

    Bamboo will build and run lint, audit and unit tests against that tagged release, publish the new packages to NPM, and then run the integration tests using those newly released packages.

    12. Create a new Cumulus release on GitHub

    The CI release scripts will automatically create a GitHub release based on the release version tag, as well as upload artifacts to the GitHub release for the Terraform modules provided by Cumulus. The Terraform release artifacts include:

    • A multi-module Terraform .zip artifact containing filtered copies of the tf-modules, packages, and tasks directories for use as Terraform module sources.
    • An S3 replicator module
    • A workflow module
    • A distribution API module
    • An ECS service module

    Verify that the appropriate .zip files are present on GitHub after the release process is complete.

    13. Merge base branch back to master

    Finally, you need to bring the version update changes back to master.

    If this is the latest version, you can simply create a PR to merge the minor version base branch back to master.

    Do not merge master back into the release branch since we want the release branch to just have the code from the release. Instead, create a new branch off of the release branch and merge that to master. You can freely merge master into this branch and delete it when it is merged to master.

    If this is a backport, you will need to create a PR that ports the changelog updates back to master. It is important in this changelog note to call it out as a backport. For example, fixes in backport version 1.14.5 may not be available in 1.15.0 because the fix was introduced in 1.15.3.

    Troubleshooting

    Delete and regenerate the tag

    To delete a published tag so that you can re-tag, run the following:

        git tag -d vMAJOR.MINOR.PATCH
        git push -d origin vMAJOR.MINOR.PATCH

    e.g.:
    git tag -d v9.1.0
    git push -d origin v9.1.0
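    Once the old tag is deleted, recreate and push the tag using the same commands as in step 10, e.g.:

        git tag -a v9.1.0 -m "Release 9.1.0"
        git push origin v9.1.0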
    Version: v9.9.0

    Cumulus Documentation: How To's

    Cumulus Docs Installation

    Run a Local Server

    Environment variables DOCSEARCH_API_KEY and DOCSEARCH_INDEX_NAME must be set for search to work. At the moment, search is only truly functional on prod because that is the only website we have registered to be indexed with DocSearch (see below on search).
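    For example, the variables can be exported in your shell before starting the server (the values below are placeholders for the credentials registered with DocSearch):

        export DOCSEARCH_API_KEY=<your-docsearch-api-key>
        export DOCSEARCH_INDEX_NAME=<your-docsearch-index-name>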

    git clone git@github.com:nasa/cumulus
    cd cumulus
    npm run docs-install
    npm run docs-serve

    Note: docs-build will build the documents into website/build.

    Cumulus Documentation

    Our project documentation is hosted on GitHub Pages. The resources published to this website are housed in the docs/ directory at the top of the Cumulus repository. Those resources primarily consist of markdown files and images.

    We use the open-source static website generator Docusaurus to build HTML files from our markdown documentation, add some organization and navigation, and provide some other niceties in the final website (search, easy templating, etc.).

    Add a New Page and Sidebars

    Adding a new page should be as simple as writing some documentation in markdown, placing it under the correct directory in the docs/ folder and adding some configuration values wrapped by --- at the top of the file. There are many files that already have this header which can be used as reference.

    ---
    id: doc-unique-id # unique id for this document. This must be unique across ALL documentation under docs/
    title: Title Of Doc # Whatever title you feel like adding. This will show up as the index to this page on the sidebar.
    hide_title: false
    ---

    Note: To have the new page show up in a sidebar, the designated id must be added to a sidebar in the website/sidebars.js file. Docusaurus has an in-depth explanation of sidebars here.

    Versioning Docs

    We lean heavily on Docusaurus for versioning. Their suggestions and walkthrough can be found here. It is worth noting that we would like the Documentation versions to match up directly with release versions. Cumulus versioning is explained in the Versioning Docs.

    Search

    Search on our documentation site is taken care of by DocSearch. We have been provided with an apiKey and an indexName by DocSearch that we include in our website/siteConfig.js file. The rest, indexing and actual searching, we leave to DocSearch. Our builds expect environment variables for both of these values to exist - DOCSEARCH_API_KEY and DOCSEARCH_INDEX_NAME.

    Add a new task

    The tasks list in docs/tasks.md is generated from the list of task packages in the tasks folder. Do not edit the docs/tasks.md file directly.

    Read more about adding a new task.

    Editing the tasks.md header or template

    Look at the bin/build-tasks-doc.js and bin/tasks-header.md files to edit the output of the tasks build script.

    Editing diagrams

    For some diagrams included in the documentation, the raw source is included in the docs/assets/raw directory to allow for easy updating in the future:

    • assets/interfaces.svg -> assets/raw/interfaces.drawio (generated using draw.io)

    Deployment

    The master branch is automatically built and deployed to the gh-pages branch. The gh-pages branch is served by GitHub Pages. Do not make edits to the gh-pages branch.

    Version: v9.9.0

    External Contributions

    Contributions to Cumulus may be made in the form of PRs to the repositories directly or through externally developed tasks and components. Cumulus is designed as an ecosystem that leverages Terraform deployments and AWS Step Functions to easily integrate external components.

    This list may not be exhaustive and represents components that are open source, owned externally, and that have been tested with the Cumulus system. For more information and contributing guidelines, visit the respective GitHub repositories.

    Distribution

    The ASF Thin Egress App is used by Cumulus for distribution. TEA can be deployed with Cumulus or as part of other applications to distribute data.

    Operational Cloud Recovery Archive (ORCA)

    ORCA can be deployed with Cumulus to provide a customizable baseline for creating and managing operational backups.

    Workflow Tasks

    CNM

    PO.DAAC provides two workflow tasks to be used with the Cloud Notification Mechanism (CNM) Schema: CNM to Granule and CNM Response.

    See the CNM workflow data cookbook for an example of how these can be used in a Cumulus ingest workflow.

    DMR++ Generation

    GHRC has provided a DMR++ Generation workflow task. This task is meant to be used in conjunction with Cumulus' Hyrax Metadata Updates workflow task.

    - + \ No newline at end of file diff --git a/docs/v9.9.0/faqs/index.html b/docs/v9.9.0/faqs/index.html index cb9a913f765..1121e82a8e9 100644 --- a/docs/v9.9.0/faqs/index.html +++ b/docs/v9.9.0/faqs/index.html @@ -5,13 +5,13 @@ Frequently Asked Questions | Cumulus Documentation - +
    Version: v9.9.0

    Frequently Asked Questions

    Below are answers to some commonly asked questions that may assist you when working with Cumulus.

    General

    How do I deploy a new instance in Cumulus?

    Answer: For steps on the Cumulus deployment process go to How to Deploy Cumulus.

    What prerequisites are needed to setup Cumulus?

    Answer: You will need access to the AWS console and an Earthdata login before you can deploy Cumulus.

    What is the preferred web browser for the Cumulus environment?

    Answer: Our preferred web browser is the latest version of Google Chrome.

    How do I quickly troubleshoot an issue in Cumulus?

    Answer: To troubleshoot and fix issues in Cumulus reference our recommended solutions in Troubleshooting Cumulus.

    Where can I get support help?

    Answer: The following options are available for assistance:

    • Cumulus: Outside NASA users should file a GitHub issue and inside NASA users should file a JIRA issue.
    • AWS: You can create a case in the AWS Support Center, accessible via your AWS Console.

    Integrators & Developers

    What is a Cumulus integrator?

    Answer: Those who are working within Cumulus and AWS for deployments and to manage workflows. They may perform the following functions:

    • Configure and deploy Cumulus to the AWS environment
    • Configure Cumulus workflows
    • Write custom workflow tasks

    What are the steps if I run into an issue during deployment?

    Answer: If you encounter an issue with your deployment go to the Troubleshooting Deployment guide.

    Is Cumulus customizable and flexible?

    Answer: Yes. Cumulus has a modular architecture that allows you to decide which components you want/need to deploy. These components are maintained as Terraform modules.

    What are Terraform modules?

    Answer: They are modules that are composed to create a Cumulus deployment, which gives integrators the flexibility to choose the components of Cumulus that they want/need. To view Cumulus maintained modules or steps on how to create a module go to Terraform modules.

    Where do I find Terraform module variables?

    Answer: Go here for a list of Cumulus maintained variables.

    What is a Cumulus workflow?

    Answer: A workflow is a provider-configured set of steps that describe the process to ingest data. Workflows are defined using AWS Step Functions. For more details, we suggest visiting here.

    How do I set up a Cumulus workflow?

    Answer: You will need to create a provider, have an associated collection (add a new one), and generate a new rule first. Then you can set up a Cumulus workflow by following these steps here.

    What are the common use cases that a Cumulus integrator encounters?

    Answer: The following are some examples of possible use cases you may see:


    Operators

    What is a Cumulus operator?

    Answer: Those who ingest, archive, and troubleshoot datasets (called collections in Cumulus). Your daily activities might include, but are not limited to, the following:

    • Ingesting datasets
    • Maintaining historical data ingest
    • Starting and stopping data handlers
    • Managing collections
    • Managing provider definitions
    • Creating, enabling, and disabling rules
    • Investigating errors for granules and deleting or re-ingesting granules
    • Investigating errors in executions and isolating failed workflow step(s)

    What are the common use cases that a Cumulus operator encounters?

    Answer: The following are some examples of possible use cases you may see:

    Can you re-run a workflow execution in AWS?

    Answer: Yes. For steps on how to re-run a workflow execution go to Re-running workflow executions in the Cumulus Operator Docs.

    Version: v9.9.0

    Ancillary Metadata Export

    This feature utilizes the type key on a files object in a Cumulus granule. It uses the key to provide a mechanism where granule discovery, processing and other tasks can set and use this value to facilitate metadata export to CMR.

    Tasks setting type

    Discover Granules

    Uses the Collection type key to set the value for files on discovered granules in its output.

    Parse PDR

    Uses a task-specific mapping to map PDR 'FILE_TYPE' to a CNM type to set type on granules from the PDR.

    CNMToCMALambdaFunction

    Natively supports types that are included in incoming messages to a CNM Workflow.

    Tasks using type

    Move Granules

    Uses the granule file type key to update UMM/ECHO 10 CMR files passed in as candidates to the task. This task adds the external facing URLs to the CMR metadata file based on the type. See the file tracking data cookbook for a detailed mapping. If a non-CNM type is specified, the task assumes it is a 'data' file.

    Cumulus Backup and Restore

    DynamoDB

    Backup and Restore with AWS

    You can enable point-in-time recovery (PITR) as well as create an on-demand backup for your Amazon DynamoDB tables.

    PITR provides continuous backups of your DynamoDB table data. PITR can be enabled through your Terraform deployment, the AWS console, or the AWS API. When enabled, DynamoDB maintains continuous backups of your table up to the last 35 days. You can recover a copy of that table to a previous state at any point in time from the moment you enable PITR, up to a maximum of the 35 preceding days. PITR provides continuous backups until you explicitly disable it.

    On-demand backups allow you to create backups of DynamoDB table data and its settings. You can initiate an on-demand backup at any time with a single click from the AWS Management Console or a single API call. You can restore the backups to a new DynamoDB table in the same AWS Region at any time.

    PITR gives your DynamoDB tables continuous protection from accidental writes and deletes. With PITR, you do not have to worry about creating, maintaining, or scheduling backups. You enable PITR on your table and your backup is available for restore at any point in time from the moment you enable it, up to a maximum of the 35 preceding days. For example, imagine a test script writing accidentally to a production DynamoDB table. You could recover your table to any point in time within the last 35 days.

    On-demand backups help with long-term archival requirements for regulatory compliance. On-demand backups give you full-control of managing the lifecycle of your backups, from creating as many backups as you need to retaining these for as long as you need.

    Enabling PITR during deployment

    By default, the Cumulus data-persistence module enables PITR on the default tables listed in the module's variable defaults for enable_point_in_time_tables. At the time of writing, that list includes:

    • AsyncOperationsTable
    • CollectionsTable
    • ExecutionsTable
    • FilesTable
    • GranulesTable
    • PdrsTable
    • ProvidersTable
    • RulesTable

    If you wish to change this list, simply update your deployment's data_persistence module (here in the template-deploy repository) to pass the correct list of tables.

    Restoring with PITR

    Restoring a full deployment

    If your deployment has been deleted, all of your tables with PITR enabled will have had backups created automatically. You can locate these backups in the AWS console on the DynamoDB Backups page or through the CLI by running:

    aws dynamodb list-backups --backup-type SYSTEM

    You can restore your tables to your AWS account using the following command:

    aws dynamodb restore-table-from-backup --target-table-name <prefix>-CollectionsTable --backup-arn <backup-arn>

    Where prefix matches the prefix from your data-persistence deployment. backup-arn can be found in the AWS console or by listing the backups using the command above.
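    As an alternative to the console, a query like the following can narrow the listing down to a single table's backup ARN (a sketch; the table name and prefix are placeholders):

        aws dynamodb list-backups \
          --table-name <prefix>-CollectionsTable \
          --backup-type SYSTEM \
          --query 'BackupSummaries[].BackupArn'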

    This will restore your tables to AWS. They will need to be linked to your Terraform deployment. After terraform init and before terraform apply, run the following command for each table:

    terraform import module.data_persistence.aws_dynamodb_table.collections_table <prefix>-CollectionsTable

    replacing collections_table with the table identifier in the DynamoDB Terraform table definitions.
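    Repeating the import for each restored table might look like the following sketch. The Terraform resource identifiers other than collections_table are assumptions for illustration; confirm them against the table definitions in the data-persistence module:

        terraform import module.data_persistence.aws_dynamodb_table.collections_table <prefix>-CollectionsTable
        terraform import module.data_persistence.aws_dynamodb_table.granules_table <prefix>-GranulesTable
        terraform import module.data_persistence.aws_dynamodb_table.providers_table <prefix>-ProvidersTable
        terraform import module.data_persistence.aws_dynamodb_table.rules_table <prefix>-RulesTable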

    Terraform will now manage these tables as part of the Terraform state. Run terraform apply to generate the rest of the data-persistence deployment and then follow the instructions to deploy the cumulus deployment as normal.

    At this point the data will be in DynamoDB, but not in Elasticsearch, so nothing will be returned on the Operator dashboard or through Operator API calls. To get the data into Elasticsearch, run an index-from-database operation via the Operator API. The status of this operation can be viewed on the dashboard. When Elasticsearch is switched to the recovery index the data will be visible on the dashboard and available via the Operator API.

    Restoring an individual table

    A table can be restored to a previous state using PITR. This is easily achievable via the AWS Console by visiting the Backups tab for the table.

    A table can only be recovered to a new table name. Following the restoration of the table, the new table must be imported into Terraform.

    First, remove the old table from the Terraform state:

    terraform state rm module.data_persistence.aws_dynamodb_table.collections_table

    replacing collections_table with the table identifier in the DynamoDB Terraform table definitions.

    Then import the new table into the Terraform state:

    terraform import module.data_persistence.aws_dynamodb_table.collections_table <new-table-name>

    replacing collections_table with the table identifier in the DynamoDB Terraform table definitions.

    Your data-persistence and cumulus deployments should be redeployed so that your instance of Cumulus uses this new table. After the deployment, your Elasticsearch instance will be out of sync with your new table if there is any change in data. To resync your Elasticsearch with your database run an index-from-database operation via the Operator API. The status of this operation can be viewed on the dashboard. When Elasticsearch is switched to the new index the DynamoDB tables and Elasticsearch instance will be in sync and the correct data will be reflected on the dashboard.

    Backup and Restore with cumulus-api CLI

    The cumulus-api CLI also includes backup and restore commands. The CLI backup command downloads the content of any of your DynamoDB tables to .json files. You can also use these .json files to restore the records to another DynamoDB table.

    Backup with the CLI

    To back up a table with the CLI, install the @cumulus/api package using npm, making sure to install the same version as your Cumulus deployment:

    npm install -g @cumulus/api@version

    Then run:

    cumulus-api backup --table <table-name>

    The backup will be stored at backups/<table-name>.json.

    Restore with the CLI

    To restore data from a JSON file, run the following command:

    cumulus-api restore backups/<table-name>.json --table <table-name>

    The restore can go to the in-use table and will update Elasticsearch. If a record already exists in the table, it will not be duplicated; it will be updated with the record from the restore file.
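    Putting the two commands together, a backup-and-restore round trip for a single table might look like this (the table name is illustrative):

        cumulus-api backup --table <prefix>-GranulesTable
        cumulus-api restore backups/<prefix>-GranulesTable.json --table <prefix>-GranulesTable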

    Data Backup and Restore

    Cumulus provides no core functionality to backup data stored in S3. Data disaster recovery is being developed in a separate effort here.

    Version: v9.9.0

    Cumulus Metadata in DynamoDB

    @cumulus/api uses a number of methods to preserve the metadata generated in a Cumulus instance.

    All configuration and system-generated metadata is stored in DynamoDB tables, except for logs. System logs are stored in the AWS CloudWatch service.

    Amazon DynamoDB stores three geographically distributed replicas of each table to enable high availability and data durability. Amazon DynamoDB runs exclusively on solid-state drives (SSDs). SSDs help AWS achieve the design goals of predictable low-latency response times for storing and accessing data at any scale.

    DynamoDB Auto Scaling

    Cumulus deployed tables from the data-persistence module are set to on-demand mode.

    Version: v9.9.0

    Cumulus Dead Letter Archive

    This documentation explains the Cumulus dead letter archive and associated functionality.

    DB Records DLQ Archive

    The Cumulus system contains a number of dead letter queues. Perhaps the most important system lambda function supported by a DLQ is the sfEventSqsToDbRecords lambda function which parses Cumulus messages from workflow executions to generate and write database records to the Cumulus database.

    As of Cumulus v9+, the dead letter queue for this lambda (named sfEventSqsToDbRecordsDeadLetterQueue) has been updated with a consumer lambda that will automatically write any incoming records to the S3 system bucket, under the path <stackName>/dead-letter-archive/sqs/. This will allow integrators and operators engaged in debugging missing records to inspect any Cumulus messages which failed to process and did not result in the successful creation of database records.

    Dead Letter Archive recovery

    In addition to the above, as of Cumulus v9+, the Cumulus API also contains a new endpoint at /deadLetterArchive/recoverCumulusMessages.

    Sending a POST request to this endpoint will trigger a Cumulus AsyncOperation that will attempt to reprocess (and if successful delete) all Cumulus messages in the dead letter archive, using the same underlying logic as the existing sfEventSqsToDbRecords.
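    A minimal request might look like the following, using the same placeholder hostname and bearer-token pattern as the other API examples in these docs:

        curl --request POST https://example.com/deadLetterArchive/recoverCumulusMessages --header 'Authorization: Bearer ReplaceWithToken'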

    This endpoint may prove particularly useful when recovering from an extended or unexpected database outage, where messages failed to process due to external outage and there is no essential malformation of each Cumulus message.

    Version: v9.9.0

    Dead Letter Queues

    startSF SQS queue

    The workflow-trigger for the startSF queue has a Redrive Policy set up that directs any failed attempts to pull from the workflow start queue to an SQS Dead Letter Queue.

    This queue can then be monitored for failures to initiate a workflow. Please note that workflow failures will not show up in this queue, only repeated failure to trigger a workflow.

    Named Lambda Dead Letter Queues

    Cumulus provides configured Dead Letter Queues (DLQ) for non-workflow Lambdas (such as ScheduleSF) to capture Lambda failures for further processing.

    These DLQs are set up with the following configuration:

    receive_wait_time_seconds  = 20
    message_retention_seconds  = 1209600
    visibility_timeout_seconds = 60

    Default Lambda Configuration

    The following built-in Cumulus Lambdas are set up with DLQs to allow handling of process failures:

    • dbIndexer (Updates Elasticsearch based on DynamoDB events)
    • JobsLambda (writes logs outputs to Elasticsearch)
    • ScheduleSF (the SF Scheduler Lambda that places messages on the queue that is used to start workflows, see Workflow Triggers)
    • publishReports (Lambda that publishes messages to the SNS topics for execution, granule and PDR reporting)
    • reportGranules, reportExecutions, reportPdrs (Lambdas responsible for updating records based on messages in the queues published by publishReports)

    Troubleshooting/Utilizing messages in a Dead Letter Queue

    Ideally an automated process should be configured to poll the queue and process messages off a dead letter queue.

    For aid in manually troubleshooting, you can utilize the SQS Management console to view messages available in the queues set up for a particular stack. The dead letter queues will have a Message Body containing the Lambda payload, as well as Message Attributes that reference both the error returned and a RequestID which can be cross-referenced to the associated Lambda's CloudWatch logs for more information:

    Screenshot of the AWS SQS console showing how to view SQS message attributes

    Version: v9.9.0

    Cumulus Distribution Metrics

    It is possible to configure Cumulus and the Cumulus Dashboard to display information about the successes and failures of requests for data. This requires the Cumulus instance to deliver Cloudwatch Logs and S3 Server Access logs to an ELK stack.

    ESDIS Metrics in NGAP

    Work with the ESDIS metrics team to set up permissions and access to forward Cloudwatch Logs to a shared AWS:Logs:Destination, as well as to transfer your S3 Server Access logs to a metrics team bucket.

    The metrics team has taken care of setting up logstash to ingest the files that get delivered to their bucket into their Elasticsearch instance.

    Once Cumulus has been configured to deliver Cloudwatch logs to the ESDIS Metrics team, you can use the Elasticsearch indexes to create the necessary target patterns on the dashboard. These are often <daac>-cloudwatch-cumulus-<env>-* and <daac>-distribution-<env>-*, but they will depend on your specific Elasticsearch setup.

    Cumulus / ESDIS Metrics distribution system

    Architecture diagram showing how logs are replicated from a Cumulus instance to the ESDIS Metrics account and accessed by the Cumulus dashboard

    Version: v9.9.0

    Execution Payload Retention

    In addition to CloudWatch logs and AWS StepFunction API records, Cumulus automatically stores the initial and 'final' (the last update to the execution record) payload values as part of the Execution record in DynamoDB and Elasticsearch.

    This allows access via the API (or optionally direct DB/Elasticsearch querying) for debugging/reporting purposes. The data is stored in the "originalPayload" and "finalPayload" fields.

    Payload record cleanup

    To reduce storage requirements, a CloudWatch rule ({stack-name}-dailyExecutionPayloadCleanupRule) triggering a daily run of the provided cleanExecutions lambda has been added. This lambda will remove all 'completed' and 'non-completed' payload records in the database that are older than the specified configuration.

    Configuration

    The following configuration flags have been made available in the cumulus module. They may be overridden in your deployment's instance of the cumulus module by adding the following configuration options:

    daily_execution_payload_cleanup_schedule_expression (string)

    This configuration option sets the execution times for this Lambda to run, using a Cloudwatch cron expression.

    Default value is "cron(0 4 * * ? *)".

    complete_execution_payload_timeout_disable (bool)

    This configuration option, when set to true, will disable all cleanup of completed execution payloads.

    Default value is false.

    complete_execution_payload_timeout (number)

    This flag defines the cleanup threshold for executions with a 'completed' status in days. Records with updatedAt values older than this with payload information will have that information removed.

    Default value is 10.

    non_complete_execution_payload_timeout_disable (bool)

    This configuration option, when set to true, will disable all cleanup of "non-complete" (any status other than completed) execution payloads.

    Default value is false.

    non_complete_execution_payload_timeout (number)

    This flag defines the cleanup threshold for executions with a status other than 'complete' in days. Records with updateTime values older than this with payload information will have that information removed.

    Default value is 30 days.

    • complete_execution_payload_disable/non_complete_execution_payload_disable

    These flags (true/false) determine if the cleanup script's logic for 'complete' and 'non-complete' executions will run. Default value is false for both.

    Version: v9.9.0

    Writing logs for ESDIS Metrics

    Note: This feature is only available for Cumulus deployments in NGAP environments.

    Prerequisite: You must configure your Cumulus deployment to deliver your logs to the correct shared logs destination for ESDIS metrics.

    Log messages delivered to the ESDIS metrics logs destination conforming to an expected format will be automatically ingested and parsed to enable helpful searching/filtering of your logs via the ESDIS metrics Kibana dashboard.

    Expected log format

    The ESDIS metrics pipeline expects a log message to be a JSON string representation of an object (dict in Python or map in Java). An example log message might look like:

    {
      "level": "info",
      "executions": "arn:aws:states:us-east-1:000000000000:execution:MySfn:abcd1234",
      "granules": "[\"granule-1\",\"granule-2\"]",
      "message": "hello world",
      "sender": "greetingFunction",
      "stackName": "myCumulus",
      "timestamp": "2018-10-19T19:12:47.501Z"
    }

    A log message can contain the following properties:

    • executions: The AWS Step Function execution name in which this task is executing, if any
    • granules: A JSON string of the array of granule IDs being processed by this code, if any
    • level: A string identifier for the type of message being logged. Possible values:
      • debug
      • error
      • fatal
      • info
      • warn
      • trace
    • message: String containing your actual log message
    • parentArn: The parent AWS Step Function execution ARN that triggered the current execution, if any
    • sender: The name of the resource generating the log message (e.g. a library name, a Lambda function name, an ECS activity name)
    • stackName: The unique prefix for your Cumulus deployment
    • timestamp: An ISO-8601 formatted timestamp
    • version: The version of the resource generating the log message, if any

    None of these properties are explicitly required for ESDIS metrics to parse your log correctly. However, a log without a message has no informational content. And having level, sender, and timestamp properties is very useful for filtering your logs. Including a stackName in your logs is helpful as it allows you to distinguish between logs generated by different deployments.

    Using Cumulus Message Adapter libraries

    If you are writing a custom task that is integrated with the Cumulus Message Adapter, then some of the language-specific client libraries can be used to write logs compatible with ESDIS metrics.

    The usage of each library differs slightly, but in general a logger is initialized with a Cumulus workflow message to determine the contextual information for the task (e.g. granules, executions). Then, after the logger is initialized, writing logs only requires specifying a message, but the logged output will include the contextual information as well.

    Writing logs using custom code

    Any code that produces logs matching the expected log format can be processed by ESDIS metrics.

    Node.js

    Cumulus core provides a @cumulus/logger library that writes logs in the expected format for ESDIS metrics.

    Version: v9.9.0

    How to replay SQS messages archived in S3

    Context

    Cumulus archives all incoming SQS messages to S3 and removes messages once they have been processed. Unprocessed messages are archived at the path: ${stackName}/archived-incoming-messages/${queueName}/${messageId}

    Replay SQS messages endpoint

    The Cumulus API has added a new endpoint, /replays/sqs. This endpoint will allow you to start a replay operation to requeue all archived SQS messages by queueName and returns an AsyncOperationId for operation status tracking.

    Start replaying archived SQS messages

    In order to start a replay, you must perform a POST request to the replays/sqs endpoint.

    The required and optional fields that should be part of the body of this request are documented below.

    Field     | Type   | Description
    queueName | string | Any valid SQS queue name (not ARN)
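    For example, a request to replay everything archived for a given queue might look like this (hostname, token, and queue name are placeholders):

        curl --request POST https://example.com/replays/sqs \
          --header 'Authorization: Bearer ReplaceWithToken' \
          --header 'Content-Type: application/json' \
          --data '{"queueName": "<queue-name>"}'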

    Status tracking

    A successful response from the /replays/sqs endpoint will contain an asyncOperationId field. Use this ID with the /asyncOperations endpoint to track the status.

    Version: v9.9.0

    How to replay Kinesis messages after an outage

    After a period of outage, it may be necessary for a Cumulus operator to reprocess or 'replay' messages that arrived on an AWS Kinesis Data Stream but did not trigger an ingest. This document serves as an outline on how to start a replay operation, and how to perform status tracking. Cumulus supports replay of all Kinesis messages on a stream (subject to the normal RetentionPeriod constraints), or all messages within a given time slice delimited by start and end timestamps.

    As Kinesis has no comparable field to e.g. the SQS ReceiveCount on its records, Cumulus cannot tell which messages within a given time slice have never been processed, and cannot guarantee only missed messages will be processed. Users will have to rely on duplicate handling or some other method of identifying messages that should not be processed within the time slice.

    NOTE: This operation flow effectively changes only the trigger mechanism for Kinesis ingest notifications. The existence of valid Kinesis-type rules and all other normal requirements for the triggering of ingest via Kinesis still apply.

    Replays endpoint

    Cumulus has added a new endpoint to its API, /replays. This endpoint will allow you to start replay operations and returns an AsyncOperationId for operation status tracking.

    Start a replay

    In order to start a replay, you must perform a POST request to the replays endpoint.

    The required and optional fields that should be part of the body of this request are documented below.

    NOTE: As the endTimestamp relies on a comparison with the Kinesis server-side ApproximateArrivalTimestamp, and given that there is no documented level of accuracy for the approximation, it is recommended that the endTimestamp include some amount of buffer to allow for slight discrepancies. If tolerable, the same is recommended for the startTimestamp, although it is used differently and is less vulnerable to discrepancies, since a server-side arrival timestamp should never be earlier than the client-side request timestamp.

    Field                          | Type   | Required         | Description
    type                           | string | required         | Currently only accepts kinesis.
    kinesisStream                  | string | for type kinesis | Any valid kinesis stream name (not ARN)
    kinesisStreamCreationTimestamp | *      | optional         | Any input valid for a JS Date constructor. For reasons to use this field see AWS documentation on StreamCreationTimestamp.
    endTimestamp                   | *      | optional         | Any input valid for a JS Date constructor. Messages newer than this timestamp will be skipped.
    startTimestamp                 | *      | optional         | Any input valid for a JS Date constructor. Messages will be fetched from the Kinesis stream starting at this timestamp. Ignored if it is further in the past than the stream's retention period.
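    As an illustration, a replay of a bounded time slice could be requested like this (hostname, token, stream name, and timestamps are placeholders; any input valid for a JS Date constructor may be used for the timestamps):

        curl --request POST https://example.com/replays \
          --header 'Authorization: Bearer ReplaceWithToken' \
          --header 'Content-Type: application/json' \
          --data '{"type": "kinesis", "kinesisStream": "<stream-name>", "startTimestamp": "2018-10-01T00:00:00.000Z", "endTimestamp": "2018-10-02T00:00:00.000Z"}'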

    Status tracking

    A successful response from the /replays endpoint will contain an asyncOperationId field. Use this ID with the /asyncOperations endpoint to track the status.
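    For example (the ID shown is a placeholder):

        curl https://example.com/asyncOperations/<asyncOperationId> --header 'Authorization: Bearer ReplaceWithToken'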

    Reconciliation Reports

    Screenshot of the Dashboard Reconciliation Reports Overview page

    Viewing an inventory report will show a detailed list of collections, granules and files. Screenshot of an Inventory Report page

    Viewing a granule not found report will show a list of granules missing data. Screenshot of a Granule Not Found Report page

    API

    The API also allows users to create and view reports. For more extensive API documentation, see the Cumulus API docs.

    Creating a Report via API

    Create a new inventory report with the following:

    curl --request POST https://example.com/reconciliationReports --header 'Authorization: Bearer ReplaceWithToken'

    Example response:

    {
      "message": "Report is being generated",
      "status": 202
    }

    Retrieving a Report via API

    Once a report has been generated, you can retrieve the full report.

    curl https://example.com/reconciliationReports/inventoryReport-20190305T153430508 --header 'Authorization: Bearer ReplaceWithTheToken'

    Example response:

    {
      "reportStartTime": "2019-03-05T15:34:30.508Z",
      "reportEndTime": "2019-03-05T15:34:37.243Z",
      "status": "SUCCESS",
      "error": null,
      "filesInCumulus": {
        "okCount": 40,
        "onlyInS3": [
          "s3://cumulus-test-sandbox-protected/MOD09GQ.A2016358.h13v04.006.2016360104606.cmr.xml",
          "s3://cumulus-test-sandbox-private/BROWSE.MYD13Q1.A2017297.h19v10.006.2017313221201.hdf"
        ],
        "onlyInDynamoDb": [
          {
            "uri": "s3://cumulus-test-sandbox-protected/MOD09GQ.A2016358.h13v04.006.2016360104606.hdf",
            "granuleId": "MOD09GQ.A2016358.h13v04.006.2016360104606"
          }
        ]
      },
      "collectionsInCumulusCmr": {
        "okCount": 1,
        "onlyInCumulus": [
          "L2_HR_PIXC___000"
        ],
        "onlyInCmr": [
          "MCD43A1___006",
          "MOD14A1___006"
        ]
      },
      "granulesInCumulusCmr": {
        "okCount": 3,
        "onlyInCumulus": [
          {
            "granuleId": "MOD09GQ.A3518809.ln_rVr.006.7962927138074",
            "collectionId": "MOD09GQ___006"
          },
          {
            "granuleId": "MOD09GQ.A8768252.HC4ddD.006.2077696236118",
            "collectionId": "MOD09GQ___006"
          }
        ],
        "onlyInCmr": [
          {
            "GranuleUR": "MOD09GQ.A0002421.oD4zvB.006.4281362831355",
            "ShortName": "MOD09GQ",
            "Version": "006"
          }
        ]
      },
      "filesInCumulusCmr": {
        "okCount": 11,
        "onlyInCumulus": [
          {
            "fileName": "MOD09GQ.A8722843.GTk5A3.006.4026909316904.jpeg",
            "uri": "s3://cumulus-test-sandbox-public/MOD09GQ___006/MOD/MOD09GQ.A8722843.GTk5A3.006.4026909316904.jpeg",
            "granuleId": "MOD09GQ.A8722843.GTk5A3.006.4026909316904"
          }
        ],
        "onlyInCmr": [
          {
            "URL": "https://cumulus-test-sandbox-public.s3.amazonaws.com/MOD09GQ___006/MOD/MOD09GQ.A8722843.GTk5A3.006.4026909316904_ndvi.jpg",
            "Type": "GET DATA",
            "GranuleUR": "MOD09GQ.A8722843.GTk5A3.006.4026909316904"
          }
        ]
      }
    }
    Version: v9.9.0

    Getting Started

    Overview | Quick Tutorials | Helpful Tips

    Overview

    This serves as a guide for new Cumulus users to deploy and learn how to use Cumulus. Here you will learn what you need in order to complete any prerequisites, what Cumulus is and how it works, and how to successfully navigate and deploy a Cumulus environment.

    What is Cumulus

    Cumulus is an open source set of components for creating cloud-based data ingest, archive, distribution and management designed for NASA's future Earth Science data streams.

    Who uses Cumulus

    Data integrators/developers and operators across projects, not limited to NASA, use Cumulus for their daily work.

    Cumulus Roles

    Integrator/Developer

    Cumulus integrators/developers are those who work within Cumulus and AWS for deployments and to manage workflows.

    Operator

    Cumulus operators are those who work within Cumulus to ingest/archive data and manage collections.

    Role Guides

    As a developer, integrator, or operator, you will need to set up your environments to work in Cumulus. The following docs can get you started in your role specific activities.

    What is a Cumulus Data Type

    In Cumulus, we have the following types of data that you can create and manage:

    • Collections
    • Granules
    • Providers
    • Rules
    • Workflows
    • Executions
    • Reports

    For details on how to create or manage data types go to Data Management Types.


    Quick Tutorials

    Deployment & Configuration

    Cumulus is deployed to an AWS account, so you must have access to deploy resources to an AWS account to get started.

    1. Deploy Cumulus and Cumulus Dashboard to AWS

    Follow the deployment instructions to deploy Cumulus to your AWS account.

    2. Configure and Run the HelloWorld Workflow

    If you have deployed using the cumulus-template-deploy repository, you have a HelloWorld workflow deployed to your Cumulus backend.

    You can see your deployed workflows on the Workflows page of your Cumulus dashboard.

    Configure a collection and provider using the setup guidance on the Cumulus dashboard.

    Then create a rule to trigger your HelloWorld workflow. You can select a rule type of one time.

    Navigate to the Executions page of the dashboard to check the status of your workflow execution.

    3. Configure a Custom Workflow

    See Developing a custom workflow documentation for adding a new workflow to your deployment.

    There are plenty of workflow examples using Cumulus tasks here. The Data Cookbooks provide a more in-depth look at some of these more advanced workflows and their configurations.

    There is a list of Cumulus tasks already included in your deployment here.

    After configuring your workflow and redeploying, you can configure and run your workflow using the same steps as in step 2.


    Helpful Tips

    Here are some useful tips to keep in mind when deploying or working in Cumulus.

    Integrator/Developer

    • Versioning and Releases: This documentation gives information on our global versioning approach. We suggest upgrading to the supported version for Cumulus, Cumulus dashboard, and Thin Egress App (TEA).
    • Cumulus Developer Documentation: We suggest that you read through and reference this resource for development best practices in Cumulus.
    • Cumulus Deployment: We will guide you on how to manually deploy a new instance of Cumulus. In this reference, you will learn how to install Terraform, create an AWS S3 bucket, configure a compatible database, and create a Lambda layer.
    • Terraform Best Practices: This will help guide you through your Terraform configuration and Cumulus deployment. For an introduction about Terraform go here.
    • Integrator Common Use Cases: Scenarios to help integrators along in the Cumulus environment.

    Operator

    Troubleshooting

    Troubleshooting: Some suggestions to help you troubleshoot and solve issues you may encounter.

    Resources

    Version: v9.9.0

    Glossary

    AWS Glossary

    For terms/items from Amazon/AWS not mentioned in this glossary, please refer to the AWS Glossary.

    Cumulus Glossary of Terms

    API Gateway

    Refers to AWS's API Gateway. Used by the Cumulus API.

    ARN

    Refers to an AWS "Amazon Resource Name".

    For more info, see the AWS documentation.

    AWS

    See: aws.amazon.com

    AWS Lambda/Lambda Function

    AWS's 'serverless' option. Allows the running of code without provisioning a service or managing server/ECS instances/etc.

    For more information, see the AWS Lambda documentation.

    AWS Access Keys

    Access credentials that give you access to AWS to act as an IAM user programmatically or from the command line.

    For more information, see the AWS IAM Documentation.

    Bucket

    An Amazon S3 cloud storage resource.

    For more information, see the AWS Bucket Documentation.

    CloudFormation

    An AWS service that allows you to define and manage cloud resources as a preconfigured block.

    For more information, see the AWS CloudFormation User Guide.

    Cloudformation Template

    A template that defines an AWS CloudFormation stack.

    For more information, see the AWS intro page.

    Cloudwatch

    AWS service that allows logging and metrics collections on various cloud resources you have in AWS.

    For more information, see the AWS User Guide.

    Cloud Notification Mechanism (CNM)

    An interface mechanism to support cloud-based ingest messaging. For more information, see PO.DAAC's CNM Schema.

    Common Metadata Repository (CMR)

    "A high-performance, high-quality, continuously evolving metadata system that catalogs Earth Science data and associated service metadata records". For more information, see NASA's CMR page.

    Collection (Cumulus)

    Cumulus Collections are logical sets of data objects of the same data type and version.

    For more information, see cookbook reference page.

    Cumulus Message Adapter (CMA)

    A library designed to help task developers integrate step function tasks into a Cumulus workflow by adapting task input/output into the Cumulus Message format.

    For more information, see CMA workflow reference page.

    Distributed Active Archive Center (DAAC)

    Refers to a specific organization that's part of NASA's distributed system of archive centers. For more information see EOSDIS's DAAC page

    Dead Letter Queue (DLQ)

    This refers to Amazon SQS Dead-Letter Queues - these SQS queues are specifically configured to capture failed messages from other services/SQS queues/etc to allow for processing of failed messages.

    For more on DLQs, see the Amazon Documentation and the Cumulus DLQ feature page.

    Developer

    Those who set up deployment and workflow management for Cumulus. Sometimes referred to as an integrator. See integrator.

    ECS

    Amazon's Elastic Container Service. Used in Cumulus by workflow steps that require more flexibility than Lambda can provide.

    For more information, see AWS's developer guide.

    ECS Activity

    An ECS instance run via a Step Function.

    Execution (Cumulus)

    A Cumulus execution refers to a single execution of a (Cumulus) Workflow.

    GIBS

    Global Imagery Browse Services

    Granule

    A granule is the smallest aggregation of data that can be independently managed (described, inventoried, and retrieved). Granules are always associated with a collection, which is a grouping of granules. A granule is a grouping of data files.

    IAM

    AWS Identity and Access Management.

    For more information, see AWS IAMs.

    Integrator/Developer

    Those who work within Cumulus and AWS for deployments and to manage workflows.

    Kinesis

    Amazon's platform for streaming data on AWS.

    See AWS Kinesis for more information.

    Lambda

    AWS's cloud service that lets you run code without provisioning or managing servers.

    For more information, see AWS's lambda page.

    Module (Terraform)

    Refers to a terraform module.

    Node

    See node.js.

    Npm

    Node package manager.

    For more information, see npmjs.com.

    Operator

    Those who work within Cumulus to ingest/archive data and manage collections.

    PDR

    "Polling Delivery Mechanism" used in "DAAC Ingest" workflows.

    For more information, see nasa.gov.

    Packages (NPM)

    NPM-hosted node.js packages. Cumulus packages can be found on NPM's site here.

    Provider

    Data source that generates and/or distributes data for Cumulus workflows to act upon.

    For more information, see the Cumulus documentation.

    Rule

    Rules are configurable scheduled events that trigger workflows based on various criteria.

    For more information, see the Cumulus Rules documentation.

    S3

    Amazon's Simple Storage Service provides data object storage in the cloud. Used in Cumulus to store configuration, data and more.

    For more information, see AWS's s3 page.

    SIPS

    Science Investigator-led Processing Systems. In the context of DAAC ingest, this refers to data producers/providers.

    For more information, see nasa.gov.

    SNS

    Amazon's Simple Notification Service provides a messaging service that allows publication of and subscription to events. Used in Cumulus to trigger workflow events, track event failures, and others.

    For more information, see AWS's SNS page.

    SQS

    Amazon's Simple Queue Service.

    For more information, see AWS's SQS page.

    Stack

    A collection of AWS resources you can manage as a single unit.

    In the context of Cumulus, this refers to a deployment of the cumulus and data-persistence modules that is managed by Terraform.

    Step Function

    AWS's web service that allows you to compose complex workflows as a state machine comprised of tasks (Lambdas, activities hosted on EC2/ECS, some AWS service APIs, etc). See AWS's Step Function Documentation for more information. In the context of Cumulus these are the underlying AWS service used to create Workflows.

    Terraform

    Terraform is the tool that you will use for deployment and configuration of your Cumulus environment.

    Workflows

    Workflows are comprised of one or more AWS Lambda Functions and ECS Activities to discover, ingest, process, manage and archive data.

    Version: v9.9.0

    Introduction

    This Cumulus project seeks to address the existing need for a “native” cloud-based data ingest, archive, distribution, and management system that can be used for all future Earth Observing System Data and Information System (EOSDIS) data streams via the development and implementation of Cumulus. The term “native” implies that the system will leverage all components of a cloud infrastructure provided by the vendor for efficiency (in terms of both processing time and cost). Additionally, Cumulus will operate on future data streams involving satellite missions, aircraft missions, and field campaigns.

    This documentation includes guidelines, examples, and source code docs. It is accessible at https://nasa.github.io/cumulus.


    Get To Know Cumulus

    • Getting Started - here - If you are new to Cumulus we suggest that you begin with this section to help you understand and work in the environment.
    • General Cumulus Documentation - here <- you're here

    Cumulus Reference Docs

    • Cumulus API Documentation - here
    • Cumulus Developer Documentation - here - READMEs throughout the main repository.
    • Data Cookbooks - here

    Auxiliary Guides

    • Integrator Guide - here
    • Operator Docs - here

    Contributing

    Please refer to: https://github.com/nasa/cumulus/blob/master/CONTRIBUTING.md for information. We thank you in advance.

    Version: v9.9.0

    About Integrator Guide

    Purpose

    The Integrator Guide supplements the Cumulus documentation and Data Cookbooks. This content is for Cumulus integrators who are either new to the project or need a step-by-step resource to help them along.

    What Is A Cumulus Integrator

    Cumulus integrators are those who work within Cumulus and AWS for deployments and to manage workflows. They may perform the following functions:

    • Configure and deploy Cumulus to the AWS environment
    • Configure Cumulus workflows
    • Write custom workflow tasks
    - + \ No newline at end of file diff --git a/docs/v9.9.0/integrator-guide/int-common-use-cases/index.html b/docs/v9.9.0/integrator-guide/int-common-use-cases/index.html index f06deda2494..db22ceab6cb 100644 --- a/docs/v9.9.0/integrator-guide/int-common-use-cases/index.html +++ b/docs/v9.9.0/integrator-guide/int-common-use-cases/index.html @@ -5,13 +5,13 @@ Integrator Common Use Cases | Cumulus Documentation - + - + \ No newline at end of file diff --git a/docs/v9.9.0/integrator-guide/workflow-add-new-lambda/index.html b/docs/v9.9.0/integrator-guide/workflow-add-new-lambda/index.html index d528d329a47..95f009a1cc2 100644 --- a/docs/v9.9.0/integrator-guide/workflow-add-new-lambda/index.html +++ b/docs/v9.9.0/integrator-guide/workflow-add-new-lambda/index.html @@ -5,13 +5,13 @@ Workflow - Add New Lambda | Cumulus Documentation - +
    Version: v9.9.0

    Workflow - Add New Lambda

    You can develop a workflow task in AWS Lambda or Elastic Container Service (ECS). AWS ECS requires Docker. For a list of tasks to use go to our Cumulus Tasks page.

The following steps will help you write a new Lambda that integrates with a Cumulus workflow. They will also aid your understanding of the Cumulus Message Adapter (CMA) process.

    Steps

    1. Define New Lambda in Terraform

    2. Add Task in JSON Object

      For details on how to set up a workflow via CMA go to the CMA Tasks: Message Flow.

      You will need to assign input and output for the new task and follow the CMA contract here. This contract defines how libraries should call the cumulus-message-adapter to integrate a task into an existing Cumulus Workflow.

    3. Verify New Task

      Check the updated workflow in AWS and in Cumulus.

    - + \ No newline at end of file diff --git a/docs/v9.9.0/integrator-guide/workflow-ts-failed-step/index.html b/docs/v9.9.0/integrator-guide/workflow-ts-failed-step/index.html index 58c47cda120..5bed81044d6 100644 --- a/docs/v9.9.0/integrator-guide/workflow-ts-failed-step/index.html +++ b/docs/v9.9.0/integrator-guide/workflow-ts-failed-step/index.html @@ -5,13 +5,13 @@ Workflow - Troubleshoot Failed Step(s) | Cumulus Documentation - +
    Version: v9.9.0

    Workflow - Troubleshoot Failed Step(s)

    Steps

    1. Locate Step
    • Go to Cumulus dashboard
    • Find the granule
    • Go to Executions to determine the failed step
2. Investigate in CloudWatch
• Go to CloudWatch
• Locate lambda
• Search CloudWatch logs
3. Recreate Error

      In your sandbox environment, try to recreate the error.

4. Resolution

    - + \ No newline at end of file diff --git a/docs/v9.9.0/interfaces/index.html b/docs/v9.9.0/interfaces/index.html index 153cd6b8a1b..969213ef2b3 100644 --- a/docs/v9.9.0/interfaces/index.html +++ b/docs/v9.9.0/interfaces/index.html @@ -5,13 +5,13 @@ Interfaces | Cumulus Documentation - +
    Version: v9.9.0

    Interfaces

    Cumulus has multiple interfaces that allow interaction with discrete components of the system, such as starting workflows via SNS/Kinesis/SQS, manually queueing workflow start messages, submitting SNS notifications for completed workflows, and the many operations allowed by the Cumulus API.

    The diagram below illustrates the workflow process in detail and the various interfaces that allow starting of workflows, reporting of workflow information, and database create operations that occur when a workflow reporting message is processed. For interfaces with expected input or output schemas, details are provided below.

Note: This diagram is current as of v1.18.0.

    Architecture diagram showing the interfaces for triggering and reporting of Cumulus workflow executions

    Workflow triggers and queuing

    Kinesis stream

    As a Kinesis stream is consumed by the messageConsumer Lambda to queue workflow executions, the incoming event is validated against this consumer schema by the ajv package.

    SQS queue for executions

    The messages put into the SQS queue for executions should conform to the Cumulus message format.
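As a hedged sketch only: the queue URL, state machine ARN, and execution name below are placeholders, and only a minimal set of top-level Cumulus message keys is shown; consult the Cumulus message format documentation for the authoritative structure. Queuing a workflow start message from the command line could look roughly like this:

# Hedged sketch: queue a workflow start message on the SQS queue for executions.
# <start_sf_queue_url>, the state machine ARN, and the execution name are placeholders,
# and the message body is only a minimal outline of the Cumulus message format.
aws sqs send-message \
  --queue-url <start_sf_queue_url> \
  --message-body '{
    "cumulus_meta": {
      "state_machine": "<state-machine-arn>",
      "execution_name": "<unique-execution-name>"
    },
    "meta": {},
    "payload": {}
  }'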

    Workflow executions

    See the documentation on Cumulus workflows.

    Workflow reporting

    SNS reporting topics

    For granule and PDR reporting, the topics will only receive data if the Cumulus workflow execution message meets the following criteria:

    • Granules - workflow message contains granule data in payload.granules
    • PDRs - workflow message contains PDR data in payload.pdr

    The messages published to the SNS reporting topics for executions and PDRs and the record property in the messages published to the granules SNS topic should conform to the model schema for each data type.

    Further detail on workflow reporting and how to interact with these interfaces can be found in the workflow notifications data cookbook.

    Cumulus API

    See the Cumulus API documentation.

    - + \ No newline at end of file diff --git a/docs/v9.9.0/operator-docs/about-operator-docs/index.html b/docs/v9.9.0/operator-docs/about-operator-docs/index.html index 2db806a7f07..9df95d9c9f8 100644 --- a/docs/v9.9.0/operator-docs/about-operator-docs/index.html +++ b/docs/v9.9.0/operator-docs/about-operator-docs/index.html @@ -5,13 +5,13 @@ About Operator Docs | Cumulus Documentation - +
    Version: v9.9.0

    About Operator Docs

    Purpose

    Operator Docs are an augmentation to Cumulus documentation and Data Cookbooks. These documents will walk step-by-step through common Cumulus activities (that aren't necessarily as use-case directed as what you'd see in Data Cookbooks).

    What Is A Cumulus Operator

    Cumulus operators are those who work within Cumulus to ingest/archive data and manage collections. They may perform the following functions via the operator dashboard or API:

    • Configure providers and collections
    • Configure rules and monitor workflow executions
    • Monitor granule ingestion
    • Monitor system metrics
    - + \ No newline at end of file diff --git a/docs/v9.9.0/operator-docs/bulk-operations/index.html b/docs/v9.9.0/operator-docs/bulk-operations/index.html index 3c26fd1bda3..cf2753d54fb 100644 --- a/docs/v9.9.0/operator-docs/bulk-operations/index.html +++ b/docs/v9.9.0/operator-docs/bulk-operations/index.html @@ -5,14 +5,14 @@ Bulk Operations | Cumulus Documentation - +
    Version: v9.9.0

    Bulk Operations

    Cumulus implements bulk operations through the use of AsyncOperations, which are long-running processes executed on an AWS ECS cluster.

    Submitting a bulk API request

    Bulk operations are generally submitted via the endpoint for the relevant data type, e.g. granules. For a list of supported API requests, refer to the Cumulus API documentation. Bulk operations are denoted with the keyword 'bulk'.
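As a hedged sketch of such a request (the base URL, token, index, query, and workflow name are placeholders, and the payload mirrors the query, index, and workflowName fields populated in the dashboard steps below; confirm the exact request shape against the Cumulus API documentation):

# Hedged sketch: submit a bulk granule operation against the granules endpoint.
# Note the query property nested inside the outer query property, as described below.
curl --request POST https://example.com/granules/bulk \
  --header 'Authorization: Bearer ReplaceWithTheToken' \
  --header 'Content-Type: application/json' \
  --data '{
    "index": "<granules-index>",
    "query": { "query": { "match": { "collectionId": "<collection-id>" } } },
    "workflowName": "<workflow-name>"
  }'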

    Starting bulk operations from the Cumulus dashboard

    Using a Kibana query

Note: You must have configured your dashboard build with a KIBANAROOT environment variable in order for the Kibana link to render in the bulk granules modal.

    1. From the Granules dashboard page, click on the "Run Bulk Granules" button, then select what type of action you would like to perform

      • Note: the rest of the process is the same regardless of what type of bulk action you perform
    2. From the bulk granules modal, click the "Open Kibana" link:

      Screenshot of Cumulus dashboard showing modal window for triggering bulk granule operations

    3. Once you have accessed Kibana, navigate to the "Discover" page. If this is your first time using Kibana, you may see a message like this at the top of the page:

      In order to visualize and explore data in Kibana, you'll need to create an index pattern to retrieve data from Elasticsearch.

      In that case, see the docs for creating an index pattern for Kibana

Screenshot of Kibana user interface showing the "Discover" page for running queries

    4. Enter a query that returns the granule records that you want to use for bulk operations:

      Screenshot of Kibana user interface showing an example Kibana query and results

    5. Once the Kibana query is returning the results you want, click the "Inspect" link near the top of the page. A slide out tab with request details will appear on the right side of the page:

      Screenshot of Kibana user interface showing details of an example request

    6. In the slide out tab that appears on the right side of the page, click the "Request" link near the top and scroll down until you see the query property:

      Screenshot of Kibana user interface showing the Elasticsearch data request made for a given Kibana query

    7. Highlight and copy the query contents from Kibana. Go back to the Cumulus dashboard and paste the query contents from Kibana inside of the query property in the bulk granules request payload. It is expected that you should have a property of query nested inside of the existing query property:

      Screenshot of Cumulus dashboard showing modal window for triggering bulk granule operations with query information populated

    8. Add values for the index and workflowName to the bulk granules request payload. The value for index will vary based on your Elasticsearch setup, but it is good to target an index specifically for granule data if possible:

      Screenshot of Cumulus dashboard showing modal window for triggering bulk granule operations with query, index, and workflow information populated

    9. Click the "Run Bulk Operations" button. You should see a confirmation message, including an ID for the async operation that was started to handle your bulk action. You can track the status of this async operation on the Operations dashboard page, which can be visited by clicking the "Go To Operations" button:

      Screenshot of Cumulus dashboard showing confirmation message with async operation ID for bulk granules request

    Creating an index pattern for Kibana

    1. Define the index pattern for the indices that your Kibana queries should use. A wildcard character, *, will match across multiple indices. Once you are satisfied with your index pattern, click the "Next step" button:

      Screenshot of Kibana user interface for defining an index pattern

    2. Choose whether to use a Time Filter for your data, which is not required. Then click the "Create index pattern" button:

      Screenshot of Kibana user interface for configuring the settings of an index pattern

    Status Tracking

    All bulk operations return an AsyncOperationId which can be submitted to the /asyncOperations endpoint.

    The /asyncOperations endpoint allows listing of AsyncOperation records as well as record retrieval for individual records, which will contain the status. The Cumulus API documentation shows sample requests for these actions.
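For example, a hedged sketch of retrieving an individual record (the base URL, token, and operation ID are placeholders):

# Hedged sketch: fetch the status of a single AsyncOperation record by its ID.
curl --request GET https://example.com/asyncOperations/<async-operation-id> \
  --header 'Authorization: Bearer ReplaceWithTheToken'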

    The Cumulus Dashboard also includes an Operations monitoring page, where operations and their status are visible:

    Screenshot of Cumulus Dashboard Operations Page showing 5 operations and their status, ID, description, type and creation timestamp

    - + \ No newline at end of file diff --git a/docs/v9.9.0/operator-docs/cmr-operations/index.html b/docs/v9.9.0/operator-docs/cmr-operations/index.html index 1baa1b3eed9..35bb33f2856 100644 --- a/docs/v9.9.0/operator-docs/cmr-operations/index.html +++ b/docs/v9.9.0/operator-docs/cmr-operations/index.html @@ -5,7 +5,7 @@ CMR Operations | Cumulus Documentation - + @@ -16,7 +16,7 @@ UpdateCmrAccessConstraints will update CMR metadata file contents on S3, and PostToCmr will push the updates to CMR. The rest of this section will assume you have created this workflow under the name UpdateCmrAccessConstraints.

Once created and deployed, the workflow is available in the Cumulus dashboard's Execute workflow selector. However, note that this request requires additional configuration: supply an access constraint integer value and an optional description to the UpdateCmrAccessConstraints workflow by clicking the Add Custom Workflow Meta option in the Execute popup, as shown below:

Screenshot showing granule execute popup with 'updateCmrAccessConstraints' selected and configuration values shown in a collapsible JSON field

    An example invocation of the API to perform this action is:

    $ curl --request PUT https://example.com/granules/MOD11A1.A2017137.h19v16.006.2017138085750 \
    --header 'Authorization: Bearer ReplaceWithTheToken' \
    --header 'Content-Type: application/json' \
    --data '{
    "action": "applyWorkflow",
    "workflow": "updateCmrAccessConstraints",
    "meta": {
"accessConstraints": {
"value": 5,
"description": "sample access constraint"
    }
    }
    }'

    Supported CMR metadata formats for the above operation are Echo10XML and UMMG-JSON, which will populate the RestrictionFlag and RestrictionComment fields in Echo10XML, or the AccessConstraints values in UMMG-JSON.

    Additional Operations

    At this time Cumulus does not, out of the box, support additional operations on CMR metadata. However, given the examples shown above, we recommend working with your integrators to develop additional workflows that perform any required operations.

    Bulk CMR operations

    In order to perform the above operations in bulk, Cumulus supports the use of ApplyWorkflow in an AsyncOperation. These are accessed via the Bulk Operation button on the dashboard, or the /granules/bulk endpoint on the Cumulus API.

More information on bulk operations is available in the bulk operations operator doc.
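As a hedged sketch of the /granules/bulk request mentioned above (the granule IDs, workflow name, and token are placeholders, and whether this Cumulus version accepts a meta block in the bulk payload is an assumption; verify against the Cumulus API documentation for your release):

# Hedged sketch: run the UpdateCmrAccessConstraints workflow over a list of granules
# via the /granules/bulk endpoint. IDs and meta values are placeholders, and the
# meta block is an assumption about the bulk payload for this version.
curl --request POST https://example.com/granules/bulk \
  --header 'Authorization: Bearer ReplaceWithTheToken' \
  --header 'Content-Type: application/json' \
  --data '{
    "workflowName": "UpdateCmrAccessConstraints",
    "ids": ["MOD11A1.A2017137.h19v16.006.2017138085750"],
    "meta": {
      "accessConstraints": {
        "value": 5,
        "description": "sample access constraint"
      }
    }
  }'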

    - + \ No newline at end of file diff --git a/docs/v9.9.0/operator-docs/create-rule-in-cumulus/index.html b/docs/v9.9.0/operator-docs/create-rule-in-cumulus/index.html index 2181a1ef761..d5b70537183 100644 --- a/docs/v9.9.0/operator-docs/create-rule-in-cumulus/index.html +++ b/docs/v9.9.0/operator-docs/create-rule-in-cumulus/index.html @@ -5,13 +5,13 @@ Create Rule In Cumulus | Cumulus Documentation - +
    Version: v9.9.0

    Create Rule In Cumulus

    Once the above files are in place and the entries created in CMR and Cumulus, we are ready to begin ingesting data. Depending on the type of ingestion (FTP/Kinesis, etc) the values below will change, but for the most part they are all similar. Rules tell Cumulus how to associate providers and collections, and when/how to start processing a workflow.

    Steps

    1. Go To Rules Page
    • Go to the Cumulus dashboard, click on Rules in the navigation.
    • Click Add Rule.

    Screenshot of Rules page

2. Complete Form
    • Fill out the template form.

    Screenshot of a Rules template for adding a new rule

    For more details regarding the field definitions and required information go to Data Cookbooks.

    Note: If the state field is left blank, it defaults to false.
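For reference, a hedged sketch of the same information expressed as a Cumulus API request rather than the dashboard form (all names, ARNs, and the token are placeholders, and the field definitions should be confirmed against the Data Cookbooks linked above):

# Hedged sketch: create a Kinesis-triggered rule via the Cumulus API.
curl --request POST https://example.com/rules \
  --header 'Authorization: Bearer ReplaceWithTheToken' \
  --header 'Content-Type: application/json' \
  --data '{
    "name": "my_ingest_rule",
    "workflow": "<workflow-name>",
    "provider": "<provider-id>",
    "collection": { "name": "<collection-name>", "version": "<collection-version>" },
    "rule": { "type": "kinesis", "value": "<kinesis-stream-arn>" },
    "state": "ENABLED"
  }'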

    Examples

    • A rule form with completed required fields:

    Screenshot of a completed rule form

    • A successfully added Rule:

    Screenshot of created rule

    - + \ No newline at end of file diff --git a/docs/v9.9.0/operator-docs/discovery-filtering/index.html b/docs/v9.9.0/operator-docs/discovery-filtering/index.html index 98e315cd880..49dd255cc65 100644 --- a/docs/v9.9.0/operator-docs/discovery-filtering/index.html +++ b/docs/v9.9.0/operator-docs/discovery-filtering/index.html @@ -5,7 +5,7 @@ Discovery Filtering | Cumulus Documentation - + @@ -24,7 +24,7 @@ directly list the provider_path. If the path contains regular expression components, this may fail.

    It is recommended that operators diagnose any failures by checking error logs and ensuring that permissions on the remote file system allow reading of the default directory and any subdirectories that match the filter.

    Supported protocols

Currently, support for this feature is limited to the following protocols:

    • ftp
    • sftp
    - + \ No newline at end of file diff --git a/docs/v9.9.0/operator-docs/granule-workflows/index.html b/docs/v9.9.0/operator-docs/granule-workflows/index.html index 27af7153077..acbd9f3eb28 100644 --- a/docs/v9.9.0/operator-docs/granule-workflows/index.html +++ b/docs/v9.9.0/operator-docs/granule-workflows/index.html @@ -5,13 +5,13 @@ Granule Workflows | Cumulus Documentation - +
    Version: v9.9.0

    Granule Workflows

    Failed Granule

    Delete and Ingest

    1. Delete Granule

    Note: Granules published to CMR will need to be removed from CMR via the dashboard prior to deletion

2. Ingest Granule via Ingest Rule
• Re-triggering a one-time, Kinesis, SQS, or SNS rule, or a scheduled rule, will re-discover and re-ingest the deleted granule.

    Reingest

    1. Select Failed Granule
    • In the Cumulus dashboard, go to the Collections page.
• Use the search field to find the granule.
2. Re-ingest Granule
    • Go to the Collections page.
    • Click on Reingest and a modal will pop up for your confirmation.

    Screenshot of the Reingest modal workflow

    Delete and Ingest

    1. Bulk Delete Granules
    • Go to the Granules page.
    • Use the Bulk Delete button to bulk delete selected granules or select via a Kibana query

    Note: You can optionally force deletion from CMR

2. Ingest Granules via Ingest Rule
• Re-triggering one-time, Kinesis, SQS, or SNS rules or scheduled rules will re-discover and re-ingest the deleted granules.

    Multiple Failed Granules

    1. Select Failed Granules
    • In the Cumulus dashboard, go to the Collections page.
    • Click on Failed Granules.
    • Select multiple granules.

    Screenshot of selected multiple granules

2. Bulk Re-ingest Granules
    • Click on Reingest and a modal will pop up for your confirmation.

    Screenshot of Bulk Reingest modal workflow

    - + \ No newline at end of file diff --git a/docs/v9.9.0/operator-docs/kinesis-stream-for-ingest/index.html b/docs/v9.9.0/operator-docs/kinesis-stream-for-ingest/index.html index 8f2b9517152..76e234bcc1d 100644 --- a/docs/v9.9.0/operator-docs/kinesis-stream-for-ingest/index.html +++ b/docs/v9.9.0/operator-docs/kinesis-stream-for-ingest/index.html @@ -5,13 +5,13 @@ Setup Kinesis Stream & CNM Message | Cumulus Documentation - +
    Version: v9.9.0

    Setup Kinesis Stream & CNM Message

Note: Keep in mind that you should only have to set this up once per ingest stream. Kinesis pricing is based on the shard value and not on the amount of Kinesis usage.

    1. Create a Kinesis Stream

      • In your AWS console, go to the Kinesis service and click Create Data Stream.
      • Assign a name to the stream.
      • Apply a shard value of 1.
      • Click on Create Kinesis Stream.
• A status page with stream details will display. Once the status is active, the stream is ready to use. Be sure to record the streamName and StreamARN for later use.

      Screenshot of AWS console page for creating a Kinesis stream

    2. Create a Rule

    3. Send a message

• Send a message that matches your schema using Python or the command line (see the sketch after this list).
      • The streamName and Collection must match the kinesisArn+collection defined in the rule that you have created in Step 2.
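A hedged sketch of sending such a message from the command line (AWS CLI v2 syntax; the stream name, collection, provider, and product details are placeholders, and the exact CNM fields must follow the schema configured for your rule):

# Hedged sketch: publish a CNM-style notification to the Kinesis stream created in step 1.
# With AWS CLI v1, omit --cli-binary-format.
aws kinesis put-record \
  --stream-name <streamName> \
  --partition-key 1 \
  --cli-binary-format raw-in-base64-out \
  --data '{
    "collection": "<collection-name>",
    "identifier": "<unique-identifier>",
    "provider": "<provider-id>",
    "version": "<cnm-version>",
    "product": {
      "name": "<granule-name>",
      "dataVersion": "<collection-version>",
      "files": [
        {
          "type": "data",
          "uri": "s3://<bucket>/<path>/<granule-file>",
          "name": "<granule-file>",
          "size": 12345
        }
      ]
    }
  }'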
    - + \ No newline at end of file diff --git a/docs/v9.9.0/operator-docs/locating-access-logs/index.html b/docs/v9.9.0/operator-docs/locating-access-logs/index.html index 11d9b72cc66..534aec1c042 100644 --- a/docs/v9.9.0/operator-docs/locating-access-logs/index.html +++ b/docs/v9.9.0/operator-docs/locating-access-logs/index.html @@ -5,13 +5,13 @@ Locating S3 Access Logs | Cumulus Documentation - +
    Version: v9.9.0

    Locating S3 Access Logs

    When enabling S3 Access Logs for EMS Reporting you configured a TargetBucket and TargetPrefix. Inside the TargetBucket at the TargetPrefix is where you will find the raw S3 access logs.

    In a standard deployment, this will be your stack's <internal bucket name> and a key prefix of <stack>/ems-distribution/s3-server-access-logs/
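For example, a hedged sketch of listing those logs with the AWS CLI (the bucket and stack names are placeholders for your deployment's values):

# Hedged sketch: list raw S3 server access logs at the configured TargetBucket/TargetPrefix.
aws s3 ls s3://<internal-bucket-name>/<stack>/ems-distribution/s3-server-access-logs/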

    - + \ No newline at end of file diff --git a/docs/v9.9.0/operator-docs/naming-executions/index.html b/docs/v9.9.0/operator-docs/naming-executions/index.html index 070b8d3ed95..c33cb3da7c9 100644 --- a/docs/v9.9.0/operator-docs/naming-executions/index.html +++ b/docs/v9.9.0/operator-docs/naming-executions/index.html @@ -5,7 +5,7 @@ Naming Executions | Cumulus Documentation - + @@ -21,7 +21,7 @@ QueuePdrs step.

    In the following excerpt, the QueueGranules config.executionNamePrefix property is set using the value configured in the workflow's meta.executionNamePrefix.

    Please note: This meta.executionNamePrefix property should not be confused with the optional rule executionNamePrefix property from the previous section. Setting executionNamePrefix as a root property of the rule will set a prefix for the names of any workflows triggered by the rule. Setting meta.executionNamePrefix on the rule will set meta.executionNamePrefix in the workflow messages generated for this rule, allowing workflow steps like QueueGranules to read from the message meta.executionNamePrefix for their config. Then, workflows scheduled by QueueGranules would use the configured execution name prefix.

    Setting executionNamePrefix config for QueueGranules using rule.meta

    If you wanted to use a prefix of "my-prefix", you would create a rule with a meta property similar to the following Rule snippet:

    {
    ...other rule keys here...
    "meta":
    {
    "executionNamePrefix": "my-prefix"
    }
    }

    The value of meta.executionNamePrefix from the rule will be set as meta.executionNamePrefix in the workflow message.

    Then, the workflow could contain a "QueueGranules" step with the following state, which uses meta.executionNamePrefix from the message as the value for the executionNamePrefix config to the "QueueGranules" step:

    {
    "QueueGranules": {
    "Parameters": {
    "cma": {
    "event.$": "$",
    "ReplaceConfig": {
    "FullMessage": true
    },
    "task_config": {
    "queueUrl": "${start_sf_queue_url}",
    "provider": "{$.meta.provider}",
    "internalBucket": "{$.meta.buckets.internal.name}",
    "stackName": "{$.meta.stack}",
    "granuleIngestWorkflow": "${ingest_granule_workflow_name}",
    "executionNamePrefix": "{$.meta.executionNamePrefix}"
    }
    }
    },
    "Type": "Task",
    "Resource": "${queue_granules_task_arn}",
    "Retry": [
    {
    "ErrorEquals": [
    "Lambda.ServiceException",
    "Lambda.AWSLambdaException",
    "Lambda.SdkClientException"
    ],
    "IntervalSeconds": 2,
    "MaxAttempts": 6,
    "BackoffRate": 2
    }
    ],
    "Catch": [
    {
    "ErrorEquals": [
    "States.ALL"
    ],
    "ResultPath": "$.exception",
    "Next": "WorkflowFailed"
    }
    ],
    "End": true
    },
    }
    - + \ No newline at end of file diff --git a/docs/v9.9.0/operator-docs/ops-common-use-cases/index.html b/docs/v9.9.0/operator-docs/ops-common-use-cases/index.html index 4cc66f3fc5d..fc433a3389f 100644 --- a/docs/v9.9.0/operator-docs/ops-common-use-cases/index.html +++ b/docs/v9.9.0/operator-docs/ops-common-use-cases/index.html @@ -5,13 +5,13 @@ Operator Common Use Cases | Cumulus Documentation - + - + \ No newline at end of file diff --git a/docs/v9.9.0/operator-docs/trigger-workflow/index.html b/docs/v9.9.0/operator-docs/trigger-workflow/index.html index e45ce9b2af6..dcb10716883 100644 --- a/docs/v9.9.0/operator-docs/trigger-workflow/index.html +++ b/docs/v9.9.0/operator-docs/trigger-workflow/index.html @@ -5,13 +5,13 @@ Trigger a Workflow Execution | Cumulus Documentation - +
    Version: v9.9.0

    Trigger a Workflow Execution

    To trigger a workflow, you need to create a rule. To trigger an ingest workflow, one that requires discovering and ingesting data, you will also need to configure the collection and provider and associate those to a rule.

    Trigger a HelloWorld Workflow

    To trigger a HelloWorld workflow that does not need to discover or archive data, you just need to create a rule.

    You can leave the provider and collection blank and do not need any additional metadata. If you create a onetime rule, the workflow execution will start momentarily and you can view its status on the Executions page.

    Trigger an Ingest Workflow

    To ingest data, you will need a provider and collection configured to tell your workflow where to discover data and where to archive the data respectively.

    Follow the instructions to create a provider and create a collection and configure their fields for your data ingest.

    In the rule's additional metadata you can specify a provider_path from which to get the data from the provider.

    Example: Ingest data from S3

    Setup

    Assume there are 2 files to be ingested in an S3 bucket called discovery-bucket, located in the test-data folder:

    • GRANULE.A2017025.jpg
    • GRANULE.A2017025.hdf

    Archive buckets should already be created and mapped to public / private / protected in the Cumulus deployment.

    For example:

    buckets = {
    private = {
    name = "discovery-bucket"
    type = "private"
    },
    protected = {
    name = "archive-protected"
    type = "protected"
    }
    public = {
    name = "archive-public"
    type = "public"
    }
    }

    Create a provider

    Create a new provider. Set protocol to S3 and Host to discovery-bucket.

    Screenshot of adding a sample S3 provider
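Equivalently, a hedged sketch of creating the same provider via the Cumulus API (the provider id and token are placeholders, and the field names assume the standard provider schema; verify against the Cumulus API documentation):

# Hedged sketch: create an S3 provider pointing at the discovery bucket.
curl --request POST https://example.com/providers \
  --header 'Authorization: Bearer ReplaceWithTheToken' \
  --header 'Content-Type: application/json' \
  --data '{
    "id": "s3_provider",
    "protocol": "s3",
    "host": "discovery-bucket"
  }'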

    Create a collection

    Create a new collection. Configure the collection to extract the granule id from the filenames and configure where to store the granule files.

The configuration below will store hdf files in the protected bucket and jpg files in the public bucket. The bucket types (protected, public, private) correspond to the bucket mapping configured in the Cumulus deployment shown above.

    {
    "name": "test-collection",
    "version": "001",
    "granuleId": "^GRANULE\\.A[\\d]{7}$",
    "granuleIdExtraction": "(GRANULE\\..*)(\\.hdf|\\.jpg)",
    "reportToEms": false,
    "sampleFileName": "GRANULE.A2017025.hdf",
    "files": [
    {
    "bucket": "protected",
    "regex": "^GRANULE\\.A[\\d]{7}\\.hdf$",
    "sampleFileName": "GRANULE.A2017025.hdf"
    },
    {
    "bucket": "public",
    "regex": "^GRANULE\\.A[\\d]{7}\\.jpg$",
    "sampleFileName": "GRANULE.A2017025.jpg"
    }
    ]
    }

    Create a rule

    Create a rule to trigger the workflow to discover your granule data and ingest your granule.

    Select the previously created provider and collection. See the Cumulus Discover Granules workflow for a workflow example of using Cumulus tasks to discover and queue data for ingest.

    In the rule meta, set the provider_path to test-data, so the test-data folder will be used to discover new granules.

    Screenshot of adding a Discover Granules rule
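A hedged sketch of the resulting rule as an API payload (the rule name, workflow name, and token are placeholders; the provider id assumes the hypothetical s3_provider created above, while the collection name/version and provider_path are taken from this example):

# Hedged sketch: a onetime rule that discovers data under the test-data folder
# of the provider created above and triggers the discovery workflow.
curl --request POST https://example.com/rules \
  --header 'Authorization: Bearer ReplaceWithTheToken' \
  --header 'Content-Type: application/json' \
  --data '{
    "name": "test_collection_discover_rule",
    "workflow": "<discover-granules-workflow-name>",
    "provider": "s3_provider",
    "collection": { "name": "test-collection", "version": "001" },
    "rule": { "type": "onetime" },
    "state": "ENABLED",
    "meta": { "provider_path": "test-data" }
  }'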

    A onetime rule will run your workflow on-demand and you can view it on the dashboard Executions page. The Cumulus Discover Granules workflow will trigger an ingest workflow and your ingested granules will be visible on the dashboard Granules page.

    - + \ No newline at end of file diff --git a/docs/v9.9.0/tasks/index.html b/docs/v9.9.0/tasks/index.html index 14fd5b9ce17..76fa39251d8 100644 --- a/docs/v9.9.0/tasks/index.html +++ b/docs/v9.9.0/tasks/index.html @@ -5,13 +5,13 @@ Cumulus Tasks | Cumulus Documentation - +
    Version: v9.9.0

    Cumulus Tasks

    A list of reusable Cumulus tasks. Add your own.

    Tasks

    @cumulus/add-missing-file-checksums

    Add checksums to files in S3 which don't have one


    @cumulus/discover-granules

    Discover Granules in FTP/HTTP/HTTPS/SFTP/S3 endpoints


    @cumulus/discover-pdrs

    Discover PDRs in FTP and HTTP endpoints


    @cumulus/files-to-granules

    Converts array-of-files input into a granules object by extracting granuleId from filename


    @cumulus/hello-world

    Example task


    @cumulus/hyrax-metadata-updates

    Update granule metadata with hooks to OPeNDAP URL


    @cumulus/lzards-backup

    Run LZARDS backup


    @cumulus/move-granules

    Move granule files from staging to final location


    @cumulus/parse-pdr

    Download and Parse a given PDR


    @cumulus/pdr-status-check

    Checks execution status of granules in a PDR


    @cumulus/post-to-cmr

    Post a given granule to CMR


    @cumulus/queue-granules

    Add discovered granules to the queue


    @cumulus/queue-pdrs

    Add discovered PDRs to a queue


    @cumulus/queue-workflow

    Add workflow to the queue


    @cumulus/sf-sqs-report

    Sends an incoming Cumulus message to SQS


    @cumulus/sync-granule

    Download a given granule


    @cumulus/test-processing

    Fake processing task used for integration tests


    @cumulus/update-cmr-access-constraints

    Updates CMR metadata to set access constraints


    Update CMR metadata files with correct online access urls and etags and transfer etag info to granules' CMR files

    - + \ No newline at end of file diff --git a/docs/v9.9.0/team/index.html b/docs/v9.9.0/team/index.html index 7b2f27eeb30..8cdacda7119 100644 --- a/docs/v9.9.0/team/index.html +++ b/docs/v9.9.0/team/index.html @@ -5,13 +5,13 @@ Cumulus Team | Cumulus Documentation - + - + \ No newline at end of file diff --git a/docs/v9.9.0/troubleshooting/index.html b/docs/v9.9.0/troubleshooting/index.html index d98e0d50fcf..c4d5eb68add 100644 --- a/docs/v9.9.0/troubleshooting/index.html +++ b/docs/v9.9.0/troubleshooting/index.html @@ -5,14 +5,14 @@ How to Troubleshoot and Fix Issues | Cumulus Documentation - +
    Version: v9.9.0

    How to Troubleshoot and Fix Issues

    While Cumulus is a complex system, there is a focus on maintaining the integrity and availability of the system and data. Should you encounter errors or issues while using this system, this section will help troubleshoot and solve those issues.

    Backup and Restore

    Cumulus has backup and restore functionality built-in to protect Cumulus data and allow recovery of a Cumulus stack. This is currently limited to Cumulus data and not full S3 archive data. Backup and restore is not enabled by default and must be enabled and configured to take advantage of this feature.

    For more information, read the Backup and Restore documentation.

    Elasticsearch reindexing

    If you run into issues with your Elasticsearch index, a reindex operation is available via the Cumulus API. See the Reindexing Guide.

    Information on how to reindex Elasticsearch is in the Cumulus API documentation.

    Troubleshooting Workflows

    Workflows are state machines comprised of tasks and services and each component logs to CloudWatch. The CloudWatch logs for all steps in the execution are displayed in the Cumulus dashboard or you can find them by going to CloudWatch and navigating to the logs for that particular task.

    Workflow Errors

    Visual representations of executed workflows can be found in the Cumulus dashboard or the AWS Step Functions console for that particular execution.

    If a workflow errors, the error will be handled according to the error handling configuration. The task that fails will have the exception field populated in the output, giving information about the error. Further information can be found in the CloudWatch logs for the task.

    Graph of AWS Step Function execution showing a failing workflow

    Workflow Did Not Start

    Generally, first check your rule configuration. If that is satisfactory, the answer will likely be in the CloudWatch logs for the schedule SF or SF starter lambda functions. See the workflow triggers page for more information on how workflows start.

    For Kinesis and SNS rules specifically, if an error occurs during the message consumer process, the fallback consumer lambda will be called and if the message continues to error, a message will be placed on the dead letter queue. Check the dead letter queue for a failure message. Errors can be traced back to the CloudWatch logs for the message consumer and the fallback consumer. Additionally, check that the name and version match those configured in your rule, as rules are filtered by the notification's collection name and version before scheduling executions.

More information on Kinesis error handling is here.
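As a hedged sketch of the dead letter queue check described above (the queue URL is a placeholder; copy the actual URL from the SQS console for your deployment):

# Hedged sketch: peek at messages on the dead letter queue without removing them.
aws sqs receive-message \
  --queue-url <dead-letter-queue-url> \
  --max-number-of-messages 10 \
  --visibility-timeout 0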

    Operator API Errors

    All operator API calls are funneled through the ApiEndpoints lambda. Each API call is logged to the ApiEndpoints CloudWatch log for your deployment.
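For example, a hedged sketch of tailing that log with the AWS CLI v2 (the log group name assumes a Lambda named <prefix>-ApiEndpoints; verify the exact group name in the CloudWatch console):

# Hedged sketch: tail recent API activity for the ApiEndpoints lambda.
aws logs tail "/aws/lambda/<prefix>-ApiEndpoints" --since 1h --follow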

    Lambda Errors

    KMS Exception: AccessDeniedException

    KMS Exception: AccessDeniedExceptionKMS Message: The ciphertext refers to a customer master key that does not exist, does not exist in this region, or you are not allowed to access.

The above error was thrown by a Cumulus Lambda function invocation. The KMS key is the encryption key used to encrypt lambda environment variables. The root cause of this error is unknown, but is speculated to be caused by deleting and recreating, with the same name, the IAM role the lambda uses.

    This error can be resolved by switching the lambda's execution role to a different one and then back through the Lambda management console. Unfortunately, this approach doesn't scale well.

    The other resolution (that scales but takes some time) that was found is as follows:

    1. Comment out all lambda definitions (and dependent resources) in your Terraform configuration.
    2. terraform apply to delete the lambdas.
    3. Un-comment the definitions.
    4. terraform apply to recreate the lambdas.

If this problem occurs with Core lambdas and you are using the terraform-aws-cumulus.zip file source distributed in our release, we recommend the non-scaling approach: the number of Lambdas we distribute is in the low teens, and they are likely to be easier and faster to reconfigure one-by-one than by editing our configs.

    Error: Unable to import module 'index': Error

    This error is shown in the CloudWatch logs for a Lambda function.

    One possible cause is that the Lambda definition in the .tf file defining the lambda is not pointing to the correct packaged lambda source file. In order to resolve this issue, update the lambda definition to point directly to the packaged (e.g. .zip) lambda source file.

    resource "aws_lambda_function" "discover_granules_task" {
    function_name = "${var.prefix}-DiscoverGranules"
    filename = "${path.module}/../../tasks/discover-granules/dist/lambda.zip"
    handler = "index.handler"
    }

    If you are seeing this error when using the Lambda as a step in a Cumulus workflow, then inspect the output for this Lambda step in the AWS Step Function console. If you see the error Cannot find module 'node_modules/@cumulus/cumulus-message-adapter-js', then you need to ensure the lambda's packaged dependencies include cumulus-message-adapter-js.
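A hedged sketch of one way to address that, assuming a Node.js task packaged with npm (the task path and packaging commands are illustrative; your build setup may differ):

# Hedged sketch: add the CMA client library to the task's dependencies and rebuild
# the deployment package so node_modules ships inside the zip.
cd tasks/<my-task>
npm install --save @cumulus/cumulus-message-adapter-js
npm run package   # or, for example: zip -r dist/lambda.zip index.js node_modules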

    - + \ No newline at end of file diff --git a/docs/v9.9.0/troubleshooting/reindex-elasticsearch/index.html b/docs/v9.9.0/troubleshooting/reindex-elasticsearch/index.html index ce84f7f593f..abe4ce4ea03 100644 --- a/docs/v9.9.0/troubleshooting/reindex-elasticsearch/index.html +++ b/docs/v9.9.0/troubleshooting/reindex-elasticsearch/index.html @@ -5,7 +5,7 @@ Reindexing Elasticsearch Guide | Cumulus Documentation - + @@ -14,7 +14,7 @@ current index, or the mappings for an index have been updated (they do not update automatically). Any reindexing that will be required when upgrading Cumulus will be in the Migration Steps section of the changelog.

    Switch to a new index and Reindex

    There are two operations needed: reindex and change-index to switch over to the new index. A Change Index/Reindex can be done in either order, but both have their trade-offs.

If you decide to point Cumulus to a new (empty) index first (with a change index operation), and then Reindex the data to the new index, data ingested while reindexing will automatically be sent to the new index. As reindexing operations can take a while, not all the data will show up on the Cumulus Dashboard right away. The advantage is that you do not have to turn off any ingest operations. This way is recommended.

    If you decide to Reindex data to a new index first, and then point Cumulus to that new index, it is not guaranteed that data that is sent to the old index while reindexing will show up in the new index. If you prefer this way, it is recommended to turn off any ingest operations. This order will keep your dashboard data from seeing any interruption.

    Change Index

    This will point Cumulus to the index in Elasticsearch that will be used when retrieving data. Performing a change index operation to an index that does not exist yet will create the index for you. The change index operation can be found here.

    Reindex from the old index to the new index

The reindex operation will take the data from one index and copy it into another index. The reindex operation can be found here.

    Reindex status

    Reindexing is a long-running operation. The reindex-status endpoint can be used to monitor the progress of the operation.
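As a hedged sketch of the three operations via the Cumulus API (the exact paths under /elasticsearch and the body field names are assumptions based on the operation names in this section; confirm them against the Cumulus API documentation, and treat the base URL, token, and index names as placeholders):

# Hedged sketch: point Cumulus at a new index, start a reindex, and poll status.

# Change index (creates the new index if it does not exist yet)
curl --request POST https://example.com/elasticsearch/change-index \
  --header 'Authorization: Bearer ReplaceWithTheToken' \
  --header 'Content-Type: application/json' \
  --data '{ "currentIndex": "cumulus-2020-11-3", "newIndex": "cumulus-2021-3-4" }'

# Reindex from the old index to the new index
curl --request POST https://example.com/elasticsearch/reindex \
  --header 'Authorization: Bearer ReplaceWithTheToken' \
  --header 'Content-Type: application/json' \
  --data '{ "sourceIndex": "cumulus-2020-11-3", "destIndex": "cumulus-2021-3-4" }'

# Monitor progress
curl --request GET https://example.com/elasticsearch/reindex-status \
  --header 'Authorization: Bearer ReplaceWithTheToken'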

    Index from database

    If you want to just grab the data straight from the database you can perform an Index from Database Operation. After the data is indexed from the database, a Change Index operation will need to be performed to ensure Cumulus is pointing to the right index. It is strongly recommended to turn off workflow rules when performing this operation so any data ingested to the database is not lost.

    Validate reindex

    To validate the reindex, use the reindex-status endpoint. The doc count can be used to verify that the reindex was successful. In the below example the reindex from cumulus-2020-11-3 to cumulus-2021-3-4 was not fully successful as they show different doc counts.

    "indices": {
    "cumulus-2020-11-3": {
    "primaries": {
    "docs": {
    "count": 21096512,
    "deleted": 176895
    }
    },
    "total": {
    "docs": {
    "count": 21096512,
    "deleted": 176895
    }
    }
    },
    "cumulus-2021-3-4": {
    "primaries": {
    "docs": {
    "count": 715949,
    "deleted": 140191
    }
    },
    "total": {
    "docs": {
    "count": 715949,
    "deleted": 140191
    }
    }
    }
    }

    To further drill down into what is missing, log in to the Kibana instance (found in the Elasticsearch section of the AWS console) and run the following command replacing <index> with your index name.

    GET <index>/_search
    {
    "aggs": {
    "count_by_type": {
    "terms": {
    "field": "_type"
    }
    }
    },
    "size": 0
    }

    which will produce a result like

    "aggregations": {
    "count_by_type": {
    "doc_count_error_upper_bound": 0,
    "sum_other_doc_count": 0,
    "buckets": [
    {
    "key": "logs",
    "doc_count": 483955
    },
    {
    "key": "execution",
    "doc_count": 4966
    },
    {
    "key": "deletedgranule",
    "doc_count": 4715
    },
    {
    "key": "pdr",
    "doc_count": 1822
    },
    {
    "key": "granule",
    "doc_count": 740
    },
    {
    "key": "asyncOperation",
    "doc_count": 616
    },
    {
    "key": "provider",
    "doc_count": 108
    },
    {
    "key": "collection",
    "doc_count": 87
    },
    {
    "key": "reconciliationReport",
    "doc_count": 48
    },
    {
    "key": "rule",
    "doc_count": 7
    }
    ]
    }
    }

    Resuming a reindex

    If a reindex operation did not fully complete it can be resumed using the following command run from the Kibana instance.

    POST _reindex?wait_for_completion=false
    {
    "conflicts": "proceed",
    "source": {
    "index": "cumulus-2020-11-3"
    },
    "dest": {
    "index": "cumulus-2021-3-4",
    "op_type": "create"
    }
    }

    The Cumulus API reindex-status endpoint can be used to monitor completion of this operation.

    - + \ No newline at end of file diff --git a/docs/v9.9.0/troubleshooting/rerunning-workflow-executions/index.html b/docs/v9.9.0/troubleshooting/rerunning-workflow-executions/index.html index cdac9c63737..f307058c20d 100644 --- a/docs/v9.9.0/troubleshooting/rerunning-workflow-executions/index.html +++ b/docs/v9.9.0/troubleshooting/rerunning-workflow-executions/index.html @@ -5,13 +5,13 @@ Re-running workflow executions | Cumulus Documentation - +
    Version: v9.9.0

    Re-running workflow executions

    To re-run a Cumulus workflow execution from the AWS console:

    1. Visit the page for an individual workflow execution

    2. Click the "New execution" button at the top right of the screen

Screenshot of the AWS console for a Step Function execution highlighting the "New execution" button at the top right of the screen

    3. In the "New execution" modal that appears, replace the cumulus_meta.execution_name value in the default input with the value of the new execution ID as seen in the screenshot below

      Screenshot of the AWS console showing the modal window for entering input when running a new Step Function execution

    4. Click the "Start execution" button

    - + \ No newline at end of file diff --git a/docs/v9.9.0/troubleshooting/troubleshooting-deployment/index.html b/docs/v9.9.0/troubleshooting/troubleshooting-deployment/index.html index 323ecc6d496..3a7142cbd94 100644 --- a/docs/v9.9.0/troubleshooting/troubleshooting-deployment/index.html +++ b/docs/v9.9.0/troubleshooting/troubleshooting-deployment/index.html @@ -5,7 +5,7 @@ Troubleshooting Deployment | Cumulus Documentation - + @@ -16,7 +16,7 @@ data-persistence modules, but your config is only creating one Elasticsearch instance. To fix the issue, update the elasticsearch_config variable for your data-persistence module to increase the number of instances:

    {
    domain_name = "es"
    instance_count = 2
    instance_type = "t2.small.elasticsearch"
    version = "5.3"
    volume_size = 10
    }

    Install dashboard

    Dashboard configuration

    Issues:

• Problem clearing the cache: EACCES: permission denied, rmdir '/tmp/gulp-cache/default'. This probably means the files at that location, and/or the folder, are owned by someone else (or some other factor prevents you from writing there).

It's possible to work around this by editing the file cumulus-dashboard/node_modules/gulp-cache/index.js and altering the value of the line var fileCache = new Cache({cacheDirName: 'gulp-cache'}); to something like var fileCache = new Cache({cacheDirName: '<prefix>-cache'});. Now gulp-cache will be able to write to /tmp/<prefix>-cache/default, and the error should resolve.

    Dashboard deployment

    Issues:

    • If the dashboard sends you to an Earthdata Login page that has an error reading "Invalid request, please verify the client status or redirect_uri before resubmitting", this means you've either forgotten to update one or more of your EARTHDATA_CLIENT_ID, EARTHDATA_CLIENT_PASSWORD environment variables (from your app/.env file) and re-deploy Cumulus, or you haven't placed the correct values in them, or you've forgotten to add both the "redirect" and "token" URL to the Earthdata Application.
    • There is odd caching behavior associated with the dashboard and Earthdata Login at this point in time that can cause the above error to reappear on the Earthdata Login page loaded by the dashboard even after fixing the cause of the error. If you experience this, attempt to access the dashboard in a new browser window, and it should work.
    - + \ No newline at end of file diff --git a/docs/v9.9.0/upgrade-notes/cumulus_distribution_migration/index.html b/docs/v9.9.0/upgrade-notes/cumulus_distribution_migration/index.html index dce0bd878b9..5368e9a2d59 100644 --- a/docs/v9.9.0/upgrade-notes/cumulus_distribution_migration/index.html +++ b/docs/v9.9.0/upgrade-notes/cumulus_distribution_migration/index.html @@ -5,14 +5,14 @@ Migrate from TEA deployment to Cumulus Distribution | Cumulus Documentation - +
    Version: v9.9.0

    Migrate from TEA deployment to Cumulus Distribution

    Background

    The Cumulus Distribution API is configured to use the AWS Cognito OAuth client. This API can be used instead of the Thin Egress App, which is the default distribution API if using the Deployment Template.

    Configuring a Cumulus Distribution deployment

    See these instructions for deploying the Cumulus Distribution API.

    Important note if migrating from TEA to Cumulus Distribution

    If you already have a deployment using the TEA distribution and want to switch to Cumulus Distribution, there will be an API Gateway change. This means that there will be downtime while you update your CloudFront endpoint to use the new API gateway.

    - + \ No newline at end of file diff --git a/docs/v9.9.0/upgrade-notes/migrate_tea_standalone/index.html b/docs/v9.9.0/upgrade-notes/migrate_tea_standalone/index.html index 33912ac3491..9c9bf2c6cac 100644 --- a/docs/v9.9.0/upgrade-notes/migrate_tea_standalone/index.html +++ b/docs/v9.9.0/upgrade-notes/migrate_tea_standalone/index.html @@ -5,13 +5,13 @@ Migrate TEA deployment to standalone module | Cumulus Documentation - +
    Version: v9.9.0

    Migrate TEA deployment to standalone module

    Background

    This document is only relevant for upgrades of Cumulus from versions < 3.x.x to versions > 3.x.x

Previous versions of Cumulus included deployment of the Thin Egress App (TEA) by default in the distribution module. As a result, Cumulus users who wanted to deploy a new version of TEA had to wait on a new release of Cumulus that incorporated that release.

    In order to give Cumulus users the flexibility to deploy newer versions of TEA whenever they want, deployment of TEA has been removed from the distribution module and Cumulus users must now add the TEA module to their deployment. Guidance on integrating the TEA module to your deployment is provided, or you can refer to Cumulus core example deployment code for the thin_egress_app module.

    By default, when upgrading Cumulus and moving from TEA deployed via the distribution module to deployed as a separate module, your API gateway for TEA would be destroyed and re-created, which could cause outages for any Cloudfront endpoints pointing at that API gateway.

    These instructions outline how to modify your state to preserve your existing Thin Egress App (TEA) API gateway when upgrading Cumulus and moving deployment of TEA to a standalone module. If you do not care about preserving your API gateway for TEA when upgrading your Cumulus deployment, you can skip these instructions.

    Prerequisites

    Notes about state management

    These instructions will involve manipulating your Terraform state via terraform state mv commands. These operations are extremely dangerous, since a mistake in editing your Terraform state can leave your stack in a corrupted state where deployment may be impossible or may result in unanticipated resource deletion.

    Since bucket versioning preserves a separate version of your state file each time it is written, and the Terraform state modification commands overwrite the state file, we can mitigate the risk of these operations by downloading the most recent state file before starting the upgrade process. Then, if anything goes wrong during the upgrade, we can restore that previous state version. Guidance on how to perform both operations is provided below.

    Download your most recent state version

    Run this command to download the most recent cumulus deployment state file, replacing BUCKET and KEY with the correct values from cumulus-tf/terraform.tf:

     aws s3 cp s3://BUCKET/KEY /path/to/terraform.tfstate

    Restore a previous state version

    Upload the state file that was previously downloaded to the bucket/key for your state file, replacing BUCKET and KEY with the correct values from cumulus-tf/terraform.tf:

     aws s3 cp /path/to/terraform.tfstate s3://BUCKET/KEY

    Then run terraform plan, which will give an error because we manually overwrote the state file and it is now out of sync with the lock table Terraform uses to track your state file:

    Error: Error loading state: state data in S3 does not have the expected content.

    This may be caused by unusually long delays in S3 processing a previous state
    update. Please wait for a minute or two and try again. If this problem
    persists, and neither S3 nor DynamoDB are experiencing an outage, you may need
    to manually verify the remote state and update the Digest value stored in the
    DynamoDB table to the following value: <some-digest-value>

    To resolve this error, run this command and replace DYNAMO_LOCK_TABLE, BUCKET and KEY with the correct values from cumulus-tf/terraform.tf, and use the digest value from the previous error output:

     aws dynamodb put-item \
    --table-name DYNAMO_LOCK_TABLE \
    --item '{
    "LockID": {"S": "BUCKET/KEY-md5"},
    "Digest": {"S": "some-digest-value"}
    }'

    Now, if you re-run terraform plan, it should work as expected.

    Migration instructions

    Please note: These instructions assume that you are deploying the thin_egress_app module as shown in the Cumulus core example deployment code

    1. Ensure that you have downloaded the latest version of your state file for your cumulus deployment

    2. Find the URL for your <prefix>-thin-egress-app-EgressGateway API gateway. Confirm that you can access it in the browser and that it is functional.

    3. Run terraform plan. You should see output like (edited for readability):

      # module.thin_egress_app.aws_cloudformation_stack.thin_egress_app will be created
      + resource "aws_cloudformation_stack" "thin_egress_app" {

      # module.thin_egress_app.aws_s3_bucket.lambda_source will be created
      + resource "aws_s3_bucket" "lambda_source" {

      # module.thin_egress_app.aws_s3_bucket_object.cloudformation_template will be created
      + resource "aws_s3_bucket_object" "cloudformation_template" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive will be created
      + resource "aws_s3_bucket_object" "lambda_code_dependency_archive" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_source will be created
      + resource "aws_s3_bucket_object" "lambda_source" {

      # module.thin_egress_app.aws_security_group.egress_lambda[0] will be created
      + resource "aws_security_group" "egress_lambda" {

      ...

      # module.cumulus.module.distribution.module.thin_egress_app.aws_cloudformation_stack.thin_egress_app will be destroyed
      - resource "aws_cloudformation_stack" "thin_egress_app" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket.lambda_source will be destroyed
      - resource "aws_s3_bucket" "lambda_source" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.cloudformation_template will be destroyed
      - resource "aws_s3_bucket_object" "cloudformation_template" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive will be destroyed
      - resource "aws_s3_bucket_object" "lambda_code_dependency_archive" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_source will be destroyed
      - resource "aws_s3_bucket_object" "lambda_source" {

      # module.cumulus.module.distribution.module.thin_egress_app.aws_security_group.egress_lambda[0] will be destroyed
      - resource "aws_security_group" "egress_lambda" {
    4. Run the state modification commands. The commands must be run in exactly this order:

       # Move security group
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_security_group.egress_lambda module.thin_egress_app.aws_security_group.egress_lambda

      # Move TEA storage bucket
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket.lambda_source module.thin_egress_app.aws_s3_bucket.lambda_source

      # Move TEA lambda source code
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_source module.thin_egress_app.aws_s3_bucket_object.lambda_source

      # Move TEA lambda dependency code
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive

      # Move TEA Cloudformation template
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_s3_bucket_object.cloudformation_template module.thin_egress_app.aws_s3_bucket_object.cloudformation_template

      # Move URS creds secret version
      terraform state mv module.cumulus.module.distribution.aws_secretsmanager_secret_version.thin_egress_urs_creds aws_secretsmanager_secret_version.thin_egress_urs_creds

      # Move URS creds secret
      terraform state mv module.cumulus.module.distribution.aws_secretsmanager_secret.thin_egress_urs_creds aws_secretsmanager_secret.thin_egress_urs_creds

      # Move TEA Cloudformation stack
      terraform state mv module.cumulus.module.distribution.module.thin_egress_app.aws_cloudformation_stack.thin_egress_app module.thin_egress_app.aws_cloudformation_stack.thin_egress_app

      Depending on how you were supplying a bucket map to TEA, there may be an additional step. If you were specifying the bucket_map_key variable to the cumulus module to use a custom bucket map, then you can ignore this step and just ensure that the bucket_map_file variable to the TEA module uses that same S3 key. Otherwise, if you were letting Cumulus generate a bucket map for you, then you need to take this step to migrate that bucket map:

      # Move bucket map
      terraform state mv module.cumulus.module.distribution.aws_s3_bucket_object.bucket_map_yaml[0] aws_s3_bucket_object.bucket_map_yaml
    5. Run terraform plan again. You may still see a few additions/modifications pending like below, but you should not see any deletion of Thin Egress App resources pending:

      # module.thin_egress_app.aws_cloudformation_stack.thin_egress_app will be updated in-place
      ~ resource "aws_cloudformation_stack" "thin_egress_app" {

      # module.thin_egress_app.aws_s3_bucket_object.cloudformation_template will be updated in-place
      ~ resource "aws_s3_bucket_object" "cloudformation_template" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_code_dependency_archive will be updated in-place
      ~ resource "aws_s3_bucket_object" "lambda_code_dependency_archive" {

      # module.thin_egress_app.aws_s3_bucket_object.lambda_source will be updated in-place
      ~ resource "aws_s3_bucket_object" "lambda_source" {

      If you still see deletion of module.cumulus.module.distribution.module.thin_egress_app.aws_cloudformation_stack.thin_egress_app pending, then something went wrong and you should restore the previously downloaded state file version and start over from step 1. Otherwise, proceed to step 6.

    6. Once you have confirmed that everything looks as expected, run terraform apply.

    7. Visit the same API gateway from step 1 and confirm that it still works.

    Your TEA deployment has now been migrated to a standalone module, which gives you the ability to upgrade the deployed version of TEA independently of Cumulus releases.

    - + \ No newline at end of file diff --git a/docs/v9.9.0/upgrade-notes/upgrade-rds/index.html b/docs/v9.9.0/upgrade-notes/upgrade-rds/index.html index 3768d48de46..14eced708d8 100644 --- a/docs/v9.9.0/upgrade-notes/upgrade-rds/index.html +++ b/docs/v9.9.0/upgrade-notes/upgrade-rds/index.html @@ -5,7 +5,7 @@ Upgrade to RDS release | Cumulus Documentation - + @@ -21,7 +21,7 @@ | cutoffSeconds | number | Number of seconds prior to this execution to 'cutoff' reconciliation queries. This allows in-progress/other in-flight operations time to complete and propagate to Elasticsearch/Dynamo/postgres. | 3600 | | dbConcurrency | number | Sets max number of parallel collections reports the script will run at a time. | 20 | | dbMaxPool | number | Sets the maximum number of connections the database pool has available. Modifying this may result in unexpected failures. | 20 |

    - + \ No newline at end of file diff --git a/docs/v9.9.0/upgrade-notes/upgrade_tf_version_0.13.6/index.html b/docs/v9.9.0/upgrade-notes/upgrade_tf_version_0.13.6/index.html index 8c225ea56c2..e00bd619e3d 100644 --- a/docs/v9.9.0/upgrade-notes/upgrade_tf_version_0.13.6/index.html +++ b/docs/v9.9.0/upgrade-notes/upgrade_tf_version_0.13.6/index.html @@ -5,13 +5,13 @@ Upgrade to TF version 0.13.6 | Cumulus Documentation - +
    Version: v9.9.0

    Upgrade to TF version 0.13.6

    Background

Cumulus pins its support to a specific version of Terraform (see the deployment documentation). The reason for only supporting one specific Terraform version at a time is to avoid deployment errors that can be caused by deploying to the same target with different Terraform versions.

    Cumulus is upgrading its supported version of Terraform from 0.12.12 to 0.13.6. This document contains instructions on how to perform the upgrade for your deployments.

    Prerequisites

    • Follow the Terraform guidance for what to do before upgrading, notably ensuring that you have no pending changes to your Cumulus deployments before proceeding.
      • You should do a terraform plan to see if you have any pending changes for your deployment (for both the data-persistence-tf and cumulus-tf modules), and if so, run a terraform apply before doing the upgrade to Terraform 0.13.6
    • Review the Terraform v0.13 release notes to prepare for any breaking changes that may affect your custom deployment code. Cumulus' deployment code has already been updated for compatibility with version 0.13.
    • Install Terraform version 0.13.6. We recommend using Terraform Version Manager tfenv to manage your installed versions of Terraform, but this is not required.

    Upgrade your deployment code

    Terraform 0.13 does not support some of the syntax from previous Terraform versions, so you need to upgrade your deployment code for compatibility.

    Terraform provides a 0.13upgrade command as part of version 0.13 to handle automatically upgrading your code. Make sure to check out the documentation on batch usage of 0.13upgrade, which will allow you to upgrade all of your Terraform code with one command.

    Run the 0.13upgrade command until you have no more necessary updates to your deployment code.
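
    For example, assuming each module lives in its own directory, the command can be run per directory with the -yes flag to skip interactive confirmation (the directory names below are illustrative):

    terraform 0.13upgrade -yes data-persistence-tf
    terraform 0.13upgrade -yes cumulus-tf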

    Upgrade your deployment

    1. Ensure that you are running Terraform 0.13.6 by running terraform --version. If you are using tfenv, you can switch versions by running tfenv use 0.13.6.
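
      For example, with tfenv installed:

      tfenv install 0.13.6
      tfenv use 0.13.6
      terraform --version   # should report Terraform v0.13.6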

    2. For the data-persistence-tf and cumulus-tf directories, take the following steps:

      1. Run terraform init --reconfigure. The --reconfigure flag is required, otherwise you might see an error like:

        Error: Failed to decode current backend config

        The backend configuration created by the most recent run of "terraform init"
        could not be decoded: unsupported attribute "lock_table". The configuration
        may have been initialized by an earlier version that used an incompatible
        configuration structure. Run "terraform init -reconfigure" to force
        re-initialization of the backend.
      2. Run terraform apply to perform a deployment.

        WARNING: Even if Terraform says that no resource changes are pending, running the apply using Terraform version 0.13.6 will modify your backend state from version 0.12.12 to version 0.13.6 without requiring approval. Updating the backend state is a necessary part of the version 0.13.6 upgrade, but it is not completely transparent.

    - + \ No newline at end of file diff --git a/docs/v9.9.0/workflow_tasks/discover_granules/index.html b/docs/v9.9.0/workflow_tasks/discover_granules/index.html index ec685babf43..093578f1d59 100644 --- a/docs/v9.9.0/workflow_tasks/discover_granules/index.html +++ b/docs/v9.9.0/workflow_tasks/discover_granules/index.html @@ -5,7 +5,7 @@ Discover Granules | Cumulus Documentation - + @@ -21,7 +21,7 @@ included in a granule's file list. That is, no such filtering based on filename occurs as described above.

    When set on the task configuration, the value applies to all collections during discovery. Otherwise, this property may be set on individual collections.

    Concurrency

    A number property that determines the level of concurrency with which granule duplicate checks are performed when duplicateGranuleHandling is skip or error.

    Limiting concurrency helps to avoid throttling by the AWS Lambda API and helps to avoid encountering account Lambda concurrency limitations.

    We do not recommend increasing this value unless you are seeing Lambda.Timeout errors when discover-granules discovers a large number of granules with skip or error duplicate handling. However, as increasing the concurrency may lead to Lambda API or Lambda concurrency throttling errors, you may wish to consider converting the discover-granules task to an ECS activity, which does not face similar runtime constraints.

    The default value is 3.
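
    As a purely illustrative sketch, the value can be lowered through the task's CMA task_config parameters in the workflow definition. The surrounding keys and template values here are assumptions, not a complete discover-granules configuration:

    {
      "DiscoverGranules": {
        "Parameters": {
          "cma": {
            "event.$": "$",
            "task_config": {
              "provider": "{$.meta.provider}",
              "collection": "{$.meta.collection}",
              "concurrency": 1
            }
          }
        }
      }
    }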

    Task Outputs

    This task outputs an assembled array of Cumulus granule objects as the payload for the next task, and returns only the expected payload for the next task.

    - + \ No newline at end of file diff --git a/docs/v9.9.0/workflow_tasks/files_to_granules/index.html b/docs/v9.9.0/workflow_tasks/files_to_granules/index.html index 9a1486de194..5b7823db863 100644 --- a/docs/v9.9.0/workflow_tasks/files_to_granules/index.html +++ b/docs/v9.9.0/workflow_tasks/files_to_granules/index.html @@ -5,13 +5,13 @@ Files To Granules | Cumulus Documentation - +
    Version: v9.9.0

    Files To Granules

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions and more can be found on the auto-generated Cumulus Tasks page.

    Summary

    This task utilizes the incoming config.inputGranules and the task input list of s3 URIs along with the rest of the configuration objects to take the list of incoming files and sort them into a list of granule objects.

    Please note: files passed in without metadata previously defined for config.inputGranules will be added with the following keys:

    • name
    • bucket
    • filename
    • fileStagingDir

    It is primarily intended to support compatibility with the standard output of a processing task, and convert that output into a granule object accepted as input by the majority of other Cumulus tasks.
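
    As a purely illustrative sketch (bucket names and paths are made up), a staged S3 URI in the task input such as:

    s3://my-staging-bucket/file-staging/my-prefix/MyCollection___001/example-granule.hdf

    would, absent prior metadata in config.inputGranules, produce a file entry along these lines in the output granule:

    {
      "name": "example-granule.hdf",
      "bucket": "my-staging-bucket",
      "filename": "s3://my-staging-bucket/file-staging/my-prefix/MyCollection___001/example-granule.hdf",
      "fileStagingDir": "file-staging/my-prefix/MyCollection___001"
    }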

    Task Inputs

    Input

    This task expects an incoming input that contains an array of 'staged' S3 URIs to move to their final archive location.

    For the specifics, see the Cumulus Tasks page entry for the schema.

    Configuration

    This task does expect values to be set in the workflow_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Below are expanded descriptions of selected config keys:

    inputGranules

    An array of Cumulus granule objects.

    This object will be used to define metadata values for the move granules task, and is the basis for the updated object that will be added to the output.

    Task Outputs

    This task outputs an assembled array of Cumulus granule objects as the payload for the next task, and returns only the expected payload for the next task.

    - + \ No newline at end of file diff --git a/docs/v9.9.0/workflow_tasks/move_granules/index.html b/docs/v9.9.0/workflow_tasks/move_granules/index.html index 0189b2525af..363b397e21d 100644 --- a/docs/v9.9.0/workflow_tasks/move_granules/index.html +++ b/docs/v9.9.0/workflow_tasks/move_granules/index.html @@ -5,13 +5,13 @@ Move Granules | Cumulus Documentation - +
    Version: v9.9.0

    Move Granules

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions and more can be found on the auto-generated Cumulus Tasks page.

    Summary

    This task utilizes the incoming event.input array of Cumulus granule objects to do the following:

    • Move granules from their 'staging' location to the final location (as configured in the Sync Granules task)

    • Update the event.input object with the new file locations.

    • If the granule has an ECHO10/UMM CMR file (.cmr.xml or .cmr.json) included in the event.input:

      • Update that file's access locations

      • Add it to the appropriate access URL category for the CMR filetype as defined by granule CNM filetype.

      • Set the CMR file to 'metadata' in the output granules object and add it to the granule files if it's not already present.

        Please note: Granules without a valid CNM type set in the granule file type field in event.input will be treated as "data" in the updated CMR metadata file.

    • The task then outputs an updated list of granule objects.

    Task Inputs

    Input

    This task expects an incoming input that contains a list of 'staged' S3 URIs to move to their final archive location. If CMR metadata is to be updated for a granule, it must also be included in the input.

    For the specifics, see the Cumulus Tasks page entry for the schema.

    Configuration

    This task does expect values to be set in the workflow_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Input

    This task expects event.input to provide an array of Cumulus granule objects. The files listed for each granule represent the files to be acted upon as described in summary.

    Task Outputs

    This task outputs an assembled array of Cumulus granule objects with post-move file locations as the payload for the next task, and returns only the expected payload for the next task. If a CMR file has been specified for a granule object, the CMR resources related to the granule files will be updated according to the updated granule file metadata.

    Examples

    See the SIPS workflow cookbook for an example of this task in a workflow

    - + \ No newline at end of file diff --git a/docs/v9.9.0/workflow_tasks/parse_pdr/index.html b/docs/v9.9.0/workflow_tasks/parse_pdr/index.html index 57f11ac1204..09e50a2c7b3 100644 --- a/docs/v9.9.0/workflow_tasks/parse_pdr/index.html +++ b/docs/v9.9.0/workflow_tasks/parse_pdr/index.html @@ -5,13 +5,13 @@ Parse PDR | Cumulus Documentation - +
    Version: v9.9.0

    Parse PDR

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions and more can be found on the auto-generated Cumulus Tasks page.

    Summary

    The purpose of this task is to do the following with the incoming PDR object:

    • Stage it to an internal S3 bucket

    • Parse the PDR

    • Archive the PDR and remove the staged file if successful

    • Outputs a payload object containing metadata about the parsed PDR (e.g. total size of all files, file counts, etc.) and a granules object

    The constructed granules object is created using PDR metadata to determine values like data type and version, and collection definitions to determine a file storage location based on the extracted data type and version number.

    Granule file types are converted from the PDR spec types to CNM types according to the following translation table:

      HDF: 'data',
    HDF-EOS: 'data',
    SCIENCE: 'data',
    BROWSE: 'browse',
    METADATA: 'metadata',
    BROWSE_METADATA: 'metadata',
    QA_METADATA: 'metadata',
    PRODHIST: 'qa',
    QA: 'metadata',
    TGZ: 'data',
    LINKAGE: 'data'

    Files missing file types will have none assigned; files with invalid types will result in a PDR parse failure.

    Task Inputs

    Input

    This task expects an incoming input that contains name and path information about the PDR to be parsed. For the specifics, see the Cumulus Tasks page entry for the schema.

    Configuration

    This task does expect values to be set in the workflow_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Below are expanded descriptions of selected config keys:

    Provider

    A Cumulus provider object. Used to define connection information for retrieving the PDR.

    Bucket

    Defines the bucket where the 'pdrs' folder for parsed PDRs will be stored.

    Collection

    A Cumulus collection object. Used to define granule file groupings and granule metadata for discovered files.

    Task Outputs

    This task outputs a single payload output object containing metadata about the parsed PDR (e.g. filesCount, totalSize, etc.), a pdr object with information for later steps, and the generated array of granule objects.

    Examples

    See the SIPS workflow cookbook for an example of this task in a workflow

    - + \ No newline at end of file diff --git a/docs/v9.9.0/workflow_tasks/queue_granules/index.html b/docs/v9.9.0/workflow_tasks/queue_granules/index.html index 98a794d56c2..f7a1b81cd7e 100644 --- a/docs/v9.9.0/workflow_tasks/queue_granules/index.html +++ b/docs/v9.9.0/workflow_tasks/queue_granules/index.html @@ -5,14 +5,14 @@ Queue Granules | Cumulus Documentation - +
    Version: v9.9.0

    Queue Granules

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions, and more can be found on the auto-generated Cumulus Tasks page.

    Summary

    The purpose of this task is to schedule ingest of granules that were discovered on a remote host, whether via the DiscoverGranules task or the ParsePDR task.

    The task utilizes a defined collection in concert with a defined provider, either set on each granule or passed in via config, to queue up ingest executions for each granule or for batches of granules.

    The constructed granules object is defined by the collection passed in the configuration, and has impacts on other provided core Cumulus Tasks.

    Users of this task in a workflow are encouraged to carefully consider their configuration in context of downstream tasks and workflows.

    Task Inputs

    Each of the following sections is a high-level discussion of the intent of the various input/output/config values.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Input

    This task expects an incoming input that contains granules and information about them and their files. For the specifics, see the Cumulus Tasks page entry for the schema.

    This input is most commonly the output from a preceding DiscoverGranules or ParsePDR task.

    Cumulus Configuration

    This task does expect values to be set in the task_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Below are expanded descriptions of selected config keys:

    provider

    A Cumulus provider object for the originating provider. Will be passed along to the ingest workflow. This will be overruled by more specific provider information that may exist on a granule.

    internalBucket

    The Cumulus internal system bucket.

    granuleIngestWorkflow

    A string property that denotes the name of the ingest workflow into which granules should be queued.

    queueUrl

    A string property that denotes the URL of the queue to which scheduled execution messages are sent.

    preferredQueueBatchSize

    A number property that sets an upper bound on the size of each batch of granules queued into the payload of an ingest execution. Setting this property to a value higher than 1 allows queueing of multiple granules per ingest workflow.

    As ingest executions typically expect granules in the payload to have a common collection and common provider, this property only sets an upper bound within which batches will be created based on common collection and provider information.

    This means batches may be smaller than the preferred size if collection or provider information diverge, but never larger.

    The default value if none is specified is 1, which will queue one ingest execution per granule.

    concurrency

    A number property that determines the level of concurrency with which ingest executions are scheduled. Granules or batches of granules will be queued up into executions at this level of concurrency.

    This property is also used to limit concurrency when updating granule status to queued.

    Limiting concurrency helps to avoid throttling by the AWS Lambda API and helps to avoid encountering account Lambda concurrency limitations.

    We do not recommend increasing this value unless you are seeing Lambda.Timeout errors when queue-granules receives a large number of granules as input. However, as increasing the concurrency may lead to Lambda API or Lambda concurrency throttling errors, you may wish to consider converting the queue-granules task to an ECS activity, which does not face similar runtime constraints.

    The default value is 3.

    executionNamePrefix

    A string property that will prefix the names of scheduled executions.

    childWorkflowMeta

    An object property that will be merged into the scheduled execution input's meta field.
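
    Putting selected keys together, a hedged sketch of a QueueGranules step's task_config follows. The values, the queue URL, and the workflow name are illustrative only; only the key names come from the descriptions above:

    {
      "QueueGranules": {
        "Parameters": {
          "cma": {
            "event.$": "$",
            "task_config": {
              "provider": "{$.meta.provider}",
              "internalBucket": "{$.meta.buckets.internal.name}",
              "granuleIngestWorkflow": "IngestGranule",
              "queueUrl": "https://sqs.us-east-1.amazonaws.com/1234/my-prefix-background-queue",
              "preferredQueueBatchSize": 1,
              "concurrency": 3,
              "executionNamePrefix": "my-prefix",
              "childWorkflowMeta": { "staticKey": "staticValue" }
            }
          }
        }
      }
    }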

    Task Outputs

    This task outputs an assembled array of workflow execution ARNs for all scheduled workflow executions within the payload's running object.

    - + \ No newline at end of file diff --git a/docs/v9.9.0/workflows/cumulus-task-message-flow/index.html b/docs/v9.9.0/workflows/cumulus-task-message-flow/index.html index ed46ea2d301..1d642b00cef 100644 --- a/docs/v9.9.0/workflows/cumulus-task-message-flow/index.html +++ b/docs/v9.9.0/workflows/cumulus-task-message-flow/index.html @@ -5,14 +5,14 @@ Cumulus Tasks: Message Flow | Cumulus Documentation - +
    Version: v9.9.0

    Cumulus Tasks: Message Flow

    Cumulus Tasks make up Cumulus Workflows and are either AWS Lambda tasks or AWS Elastic Container Service (ECS) activities. Cumulus Tasks permit a payload as input to the main task application code. The task payload is additionally wrapped by the Cumulus Message Adapter. The Cumulus Message Adapter supplies additional information supporting message templating and metadata management of these workflows.

    Diagram showing how incoming and outgoing Cumulus messages for workflow steps are handled by the Cumulus Message Adapter

    The steps in this flow are detailed in sections below.

    Cumulus Message Format

    A full Cumulus Message has the following keys:

    • cumulus_meta: System runtime information that should generally not be touched outside of Cumulus library code or the Cumulus Message Adapter. Stores meta information about the workflow such as the state machine name and the current workflow execution's name. This information is used to look up the current active task. The name of the current active task is used to look up the corresponding task's config in task_config.
    • meta: Runtime information captured by the workflow operators. Stores execution-agnostic variables.
    • payload: Payload is runtime information for the tasks.

    In addition to the above keys, it may contain the following keys:

    • replace: A key generated in conjunction with the Cumulus Message Adapter. It contains the location on S3 of a message payload and a target JSON path in the message to extract it to.
    • exception: A key used to track workflow exceptions; it should not be modified outside of Cumulus library code.

    Here's a simple example of a Cumulus Message:

    {
      "task_config": {
        "inlinestr": "prefix{meta.foo}suffix",
        "array": "{[$.meta.foo]}",
        "object": "{$.meta}"
      },
      "cumulus_meta": {
        "message_source": "sfn",
        "state_machine": "arn:aws:states:us-east-1:1234:stateMachine:MySfn",
        "execution_name": "MyExecution__id-1234",
        "id": "id-1234"
      },
      "meta": {
        "foo": "bar"
      },
      "payload": {
        "anykey": "anyvalue"
      }
    }

    A message utilizing the Cumulus Remote message functionality must have at least the keys replace and cumulus_meta. Depending on configuration other portions of the message may be present, however the cumulus_meta, meta, and payload keys must be present once extraction is complete.

    {
      "replace": {
        "Bucket": "cumulus-bucket",
        "Key": "my-large-event.json",
        "TargetPath": "$"
      },
      "cumulus_meta": {}
    }

    Cumulus Message Preparation

    The event coming into a Cumulus Task is assumed to be a Cumulus Message and should first be handled by the functions described below before being passed to the task application code.

    Preparation Step 1: Fetch remote event

    Fetch remote event will fetch the full event from S3 if the cumulus message includes a replace key.

    Once "my-large-event.json" is fetched from S3, it's returned from the fetch remote event function. If no "replace" key is present, the event passed to the fetch remote event function is assumed to be a complete Cumulus Message and returned as-is.

    Preparation Step 2: Parse step function config from CMA configuration parameters

    This step determines which task is currently being executed. Note that this is different from which lambda or activity is being executed, because the same lambda or activity can be used for different tasks. The current task name is used to load the appropriate configuration from the Cumulus Message's 'task_config' configuration parameter.

    Preparation Step 3: Load nested event

    Using the config returned from the previous step, load nested event resolves templates for the final config and input to send to the task's application code.

    Task Application Code

    After message prep, the message passed to the task application code is of the form:

    {
      "input": {},
      "config": {}
    }

    Create Next Message functions

    Whatever comes out of the task application code is used to construct an outgoing Cumulus Message.

    Create Next Message Step 1: Assign outputs

    The config loaded from the Fetch step function config step may have a cumulus_message key. This can be used to "dispatch" fields from the task's application output to a destination in the final event output (via URL templating). Here's an example where the value of input.anykey would be dispatched as the value of payload.out in the final cumulus message:

    {
      "task_config": {
        "bar": "baz",
        "cumulus_message": {
          "input": "{$.payload.input}",
          "outputs": [
            {
              "source": "{$.input.anykey}",
              "destination": "{$.payload.out}"
            }
          ]
        }
      },
      "cumulus_meta": {
        "task": "Example",
        "message_source": "local",
        "id": "id-1234"
      },
      "meta": {
        "foo": "bar"
      },
      "payload": {
        "input": {
          "anykey": "anyvalue"
        }
      }
    }

    Create Next Message Step 2: Store remote event

    If the ReplaceConfiguration parameter is set, the configured key's value will be stored in S3 and the final output of the task will include a replace key that contains configuration for a future step to extract the payload on S3 back into the Cumulus Message. The replace key identifies where the large event node has been stored in S3.

    - + \ No newline at end of file diff --git a/docs/v9.9.0/workflows/developing-a-cumulus-workflow/index.html b/docs/v9.9.0/workflows/developing-a-cumulus-workflow/index.html index 8c2e97dd00b..3835ddceaa8 100644 --- a/docs/v9.9.0/workflows/developing-a-cumulus-workflow/index.html +++ b/docs/v9.9.0/workflows/developing-a-cumulus-workflow/index.html @@ -5,13 +5,13 @@ Creating a Cumulus Workflow | Cumulus Documentation - +
    Version: v9.9.0

    Creating a Cumulus Workflow

    The Cumulus workflow module

    To facilitate adding workflows to your deployment, Cumulus provides a workflow module.

    In combination with the Cumulus message, the workflow module provides a way to easily turn a Step Function definition into a Cumulus workflow, complete with:

    Using the module also ensures that your workflows will continue to be compatible with future versions of Cumulus.

    For more on the full set of current available options for the module, please consult the module README.

    Adding a new Cumulus workflow to your deployment

    To add a new Cumulus workflow to your deployment that is using the cumulus module, add a new workflow resource to your deployment directory, either in a new .tf file, or to an existing file.

    The workflow should follow a syntax similar to:

    module "my_workflow" {
    source = "https://github.com/nasa/cumulus/releases/download/vx.x.x/terraform-aws-cumulus-workflow.zip"

    prefix = "my-prefix"
    name = "MyWorkflowName"
    system_bucket = "my-internal-bucket"

    workflow_config = module.cumulus.workflow_config

    tags = { Deployment = var.prefix }

    state_machine_definition = <<JSON
    {}
    JSON
    }

    In the above example, you would add your state_machine_definition using the Amazon States Language, using tasks you've developed and Cumulus core tasks that are made available as part of the cumulus terraform module.
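
    A minimal sketch of such a definition in the Amazon States Language follows; the Resource value is a placeholder for a task ARN exposed by your deployment, not a real module output name:

    {
      "Comment": "Minimal illustrative workflow",
      "StartAt": "HelloWorld",
      "States": {
        "HelloWorld": {
          "Type": "Task",
          "Resource": "${hello_world_task_arn}",
          "End": true
        }
      }
    }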

    Please note: Cumulus follows the convention of tagging resources with the prefix variable { Deployment = var.prefix } that you pass to the cumulus module. For resources defined outside of Core, it's recommended that you adopt this convention as it makes resources and/or deployment recovery scenarios much easier to manage.

    Examples

    For a functional example of a basic workflow, please take a look at the hello_world_workflow.

    For more complete/advanced examples, please read the following cookbook entries/topics:

    - + \ No newline at end of file diff --git a/docs/v9.9.0/workflows/developing-workflow-tasks/index.html b/docs/v9.9.0/workflows/developing-workflow-tasks/index.html index 83bee94b7f3..5bdcba999db 100644 --- a/docs/v9.9.0/workflows/developing-workflow-tasks/index.html +++ b/docs/v9.9.0/workflows/developing-workflow-tasks/index.html @@ -5,13 +5,13 @@ Developing Workflow Tasks | Cumulus Documentation - +
    Version: v9.9.0

    Developing Workflow Tasks

    Workflow tasks can be either AWS Lambda Functions or ECS Activities.

    Lambda functions

    The full set of available core Lambda functions can be found in the deployed cumulus module zipfile at /tasks, as well as reference documentation here. These Lambdas can be referenced in workflows via the outputs from that module (see the cumulus-template-deploy repo for an example).

    The tasks source is located in the Cumulus repository at cumulus/tasks.

    You can also develop your own Lambda function. See the Lambda Functions page to learn more.

    ECS Activities

    ECS activities are supported via the cumulus_ecs_module available from the Cumulus release page.

    Please read the module README for configuration details.

    For assistance in creating a task definition within the module read the AWS Task Definition Docs.

    For a step-by-step example of using the cumulus_ecs_module, please see the related cookbook entry.

    Cumulus Docker Image

    ECS activities require a Docker image. Cumulus provides a Docker image (source) for Node.js 12.x+ Lambdas on Docker Hub: cumuluss/cumulus-ecs-task.

    Alternate Docker Images

    Custom docker images/runtimes are supported as are private registries. For details on configuring a private registry/image see the AWS documentation on Private Registry Authentication for Tasks.

    - + \ No newline at end of file diff --git a/docs/v9.9.0/workflows/docker/index.html b/docs/v9.9.0/workflows/docker/index.html index 2270e874435..683b8d0947f 100644 --- a/docs/v9.9.0/workflows/docker/index.html +++ b/docs/v9.9.0/workflows/docker/index.html @@ -5,7 +5,7 @@ Dockerizing Data Processing | Cumulus Documentation - + @@ -14,7 +14,7 @@ 2) validate the output (in this case just check for existence) 3) use 'ncatted' to update the resulting file to be CF-compliant 4) write out metadata generated for this file

    Process Testing

    It is important to have tests for data processing; however, in many cases data files can be large, so it is not practical to store the test data in the repository. Instead, test data is currently stored on AWS S3, and can be retrieved using the AWS CLI.

    aws s3 sync s3://cumulus-ghrc-logs/sample-data/collection-name data

    Where collection-name is the name of the data collection, such as 'avaps', or 'cpl'. For example, an abridged version of the data for CPL includes:

    ├── cpl
    │   ├── input
    │   │   ├── HS3_CPL_ATB_12203a_20120906.hdf5
    │   │   ├── HS3_CPL_OP_12203a_20120906.hdf5
    │   └── output
    │   ├── HS3_CPL_ATB_12203a_20120906.nc
    │   ├── HS3_CPL_ATB_12203a_20120906.nc.meta.xml
    │   ├── HS3_CPL_OP_12203a_20120906.nc
    │   ├── HS3_CPL_OP_12203a_20120906.nc.meta.xml

    Contained in the input directory are all possible sets of data files, while the output directory is the expected result of processing. In this case the hdf5 files are converted to NetCDF files and XML metadata files are generated.

    The docker image for a process can be used on the retrieved test data. First create a test-output directory in the newly created data directory.

    mkdir data/test-output

    Then run the docker image using docker-compose.

    docker-compose run test

    This will process the data in the data/input directory and put the output into data/test-output. Repositories also include Python-based tests that validate this newly created output against the contents of data/output. Use Python's Nose tool to run the included tests.

    nosetests

    If the data/test-output directory validates against the contents of data/output, the tests will be successful; otherwise, an error will be reported.

    - + \ No newline at end of file diff --git a/docs/v9.9.0/workflows/index.html b/docs/v9.9.0/workflows/index.html index 6d979eb9788..5a28d9cc42b 100644 --- a/docs/v9.9.0/workflows/index.html +++ b/docs/v9.9.0/workflows/index.html @@ -5,13 +5,13 @@ Workflows | Cumulus Documentation - +
    Version: v9.9.0

    Workflows

    Workflows are composed of one or more AWS Lambda Functions and ECS Activities to discover, ingest, process, manage, and archive data.

    Provider data ingest and GIBS have a set of common needs in getting data from a source system and into the cloud where they can be distributed to end users. These common needs are:

    • Data Discovery - Crawling, polling, or detecting changes from a variety of sources.
    • Data Transformation - Taking data files in their original format and extracting and transforming them into another desired format such as visible browse images.
    • Archival - Storage of the files in a location that's accessible to end users.

    The high level view of the architecture and many of the individual steps are the same, but the details of ingesting each type of collection differ. Different collection types and different providers have different needs. Not only are the individual boxes of a workflow different; the branching, error handling, and multiplicity of the arrows connecting the boxes also differ. Some need visible images rendered from component data files from multiple collections. Some need to contact the CMR with updated metadata. Some will have different retry strategies to handle availability issues with source data systems.

    AWS and other cloud vendors provide an ideal solution for parts of these problems but there needs to be a higher level solution to allow the composition of AWS components into a full featured solution. The Ingest Workflow Architecture is designed to meet the needs for Earth Science data ingest and transformation.

    Goals

    Flexibility and Composability

    The steps to ingest and process data are different for each collection within a provider. Ingest should be as flexible as possible in the rearranging of steps and configuration.

    We want to use lego-like individual steps that can be composed by an operator.

    Individual steps should ...

    • Be as ignorant as possible of the overall flow. They should not be aware of previous steps.
    • Be runnable on their own.
    • Define their input and output in simple data structures.
    • Be domain agnostic.
    • Not make assumptions about the specifics of what goes into a granule, for example.

    Scalable

    The ingest architecture needs to be scalable both to handle ingesting hundreds of millions of granules and interpret dozens of different workflows.

    Data Provenance

    • We should have traceability for how data was produced and where it comes from.
    • Use immutable representations of data. Data once received is not overwritten. Data can be removed for cleanup.
    • All software is versioned. We can trace transformation of data by tracking the immutable source data and the versioned software applied to it.

    Operator Visibility and Control

    • Operators should be able to see and understand everything that is happening in the system.
    • It should be obvious why things are happening and straightforward to diagnose problems.
    • We generally assume that the operators know best in terms of the limits on a provider's infrastructure, how often things need to be done, and details of a collection. The architecture should defer to their decisions and knowledge while providing safety nets to prevent problems.

    A Reconfigurable Workflow Architecture

    The Ingest Workflow Architecture is defined by two entity types, Workflows and Tasks. A Workflow is a set of composed Tasks to complete an objective such as ingesting a granule. Tasks are the individual steps of a Workflow that perform one job. The workflow is responsible for executing the right task based on the current state and response from the last task executed. Tasks are completely decoupled in that they don't call each other or even need to know about the presence of other tasks.

    Workflows and tasks are configured as Terraform resources, which are triggered via configured rules within Cumulus.

    Diagram showing the Step Function execution path through workflow tasks for a collection ingest

    See the Example GIBS Ingest Architecture showing how workflows and tasks are used to define the GIBS Ingest Architecture.

    Workflows

    A workflow is a provider-configured set of steps that describe the process to ingest data. Workflows are defined using AWS Step Functions.

    Benefits of AWS Step Functions

    AWS Step Functions are described in detail in the AWS documentation, but they provide several benefits which are applicable to this architecture.

    • Prebuilt solution
    • Operations Visibility
      • Visual diagram
      • Every execution is recorded with both inputs and output for every step.
    • Composability
      • Allow composing AWS Lambdas and code running in other steps. Code can be run in EC2 to interface with it or even on premise if desired.
      • Step functions allow specifying when steps run in parallel or choices between steps based on data from the previous step.
    • Flexibility
      • Step Functions are designed to make it easy to build new applications and to reconfigure existing ones. We're exposing that flexibility directly to the provider.
    • Reliability and Error Handling
      • Step functions allow configuration of retries and adding handling of error conditions.
    • Described via data
      • This makes it easy to save the step function in configuration management solutions.
      • We can build simple interfaces on top of the flexibility provided.

    Workflow Scheduler

    The scheduler is responsible for initiating a step function and passing in the relevant data for a collection. This is currently configured as an interval for each collection. The scheduler service creates the initial event by combining the collection configuration with the AWS execution context defined via the cumulus terraform module.

    Tasks

    A workflow is composed of tasks. Each task is responsible for performing a discrete step of the ingest process. These can be activities like:

    • Crawling a provider website for new data.
    • Uploading data from a provider to S3.
    • Executing a process to transform data.

    AWS Step Functions permit tasks to be code running anywhere, even on premise. We expect most tasks will be written as Lambda functions in order to take advantage of the easy deployment, scalability, and cost benefits provided by AWS Lambda.

    • Leverages Existing Work
      • The design leverages the existing work of Amazon by defining workflows using the AWS Step Function State Language. This is the language that was created for describing the state machines used in AWS Step Functions.
    • Open for Extension
      • Both meta and task_config which are used for configuring at the collection and task levels do not dictate the fields and structure of the configuration. Additional task specific JSON schemas can be used for extending the validation of individual steps.
    • Data-centric Configuration
      • The use of a single JSON configuration file allows this to be added to a workflow. We build additional support on top of the configuration file for simpler domain specific configuration or interactive GUIs.

    For more details on Task Messages and Configuration, visit Cumulus configuration and message protocol documentation.

    Ingest Deploy

    To view deployment documentation, please see the Cumulus deployment documentation.

    Tradeoffs and Benefits

    This section documents various tradeoffs and benefits of the Ingest Workflow Architecture.

    Tradeoffs

    Workflow execution is handled completely by AWS

    This means we can't add our own code into the orchestration of the workflow. We can't add new features not supported by Step Functions. We can't do things like enforce that the responses from tasks always conform to a schema, or extract the configuration for a task ahead of its execution.

    If we implemented our own orchestration, we'd be able to add all of these. We save significant amounts of development effort and gain all the features of Step Functions for this tradeoff. One workaround is to provide a library of common task capabilities. These would optionally be available to tasks that can be implemented with Node.js and are able to include the library.

    Workflow Configuration is specified in AWS Step Function States Language

    The current design combines the states language defined by AWS with Ingest specific configuration. This means our representation has a tight coupling with their standard. If they make backwards incompatible changes in the future we will have to deal with existing projects written against that.

    We avoid having to develop our own standard and code to process it. The design can support new features in AWS Step Functions without requiring changes to the Ingest library code. It is unlikely they will make a backwards incompatible change at this point. One mitigation for this is writing data transformations to a new format if that were to happen.

    Collection Configuration Flexibility vs Complexity

    The Collections Configuration File is very flexible but requires more knowledge of AWS Step Functions to configure. A person modifying this file directly would need to be comfortable editing a JSON file and configuring AWS Step Functions state transitions which address AWS resources.

    The configuration file itself is not necessarily meant to be edited by a human directly. Since we are developing a reconfigurable, composable architecture that is specified entirely in data, additional tools can be developed on top of it. The existing recipes.json files can be mapped to this format. Operational tools like a GUI can be built that provide a usable interface for customizing workflows, but it will take time to develop these tools.

    Benefits

    This section describes benefits of the Ingest Workflow Architecture.

    Simplicity

    The concepts of Workflows and Tasks are simple ones that should make sense to providers. Additionally, the implementation will only consist of a few components because the design leverages existing services and capabilities of AWS. The Ingest implementation will only consist of some reusable task code to make task implementation easier, Ingest deployment, and the Workflow Scheduler.

    Composability

    The design aims to satisfy the needs for ingest integrating different workflows for providers. It's flexible in terms of the ability to arrange tasks to meet the needs of a collection. Providers have developed and incorporated open source tools over the years. All of these are easily integrable into the workflows as tasks.

    There is low coupling between task steps. Failures of one component don't bring the whole system down. Individual tasks can be deployed separately.

    Scalability

    AWS Step Functions scale up as needed and aren't limited by a set number of servers. They also easily allow you to leverage the inherent scalability of serverless functions.

    Monitoring and Auditing

    • Every execution is captured.
    • Every task run has captured input and outputs.
    • CloudWatch Metrics can be used for monitoring many of the events with the StepFunctions. It can also generate alarms for the whole process.
    • Visual report of the entire configuration.
      • Errors and success states are highlighted visually in the flow.

    Data Provenance

    • Monitoring and auditing ensures we know the data that was given to a task.
    • Workflows are versioned and the state machines stored in AWS Step Functions are immutable. Once created they cannot change.
    • Versioning of data in S3 or using immutable records in S3 will mean we always know what data was created as the result of a step or fed into a step.

    Appendix

    Example GIBS Ingest Architecture

    This shows the GIBS Ingest Architecture as an example of the use of the Ingest Workflow Architecture.

    • The GIBS Ingest Architecture consists of two workflows per collection type. There is one for discovery and one for ingest. The final stage of discovery triggers multiple ingest workflows for each MRF granule that needs to be generated.
    • It demonstrates both lambdas as tasks and a container used for MRF generation.

    GIBS Ingest Workflows

    Diagram showing the AWS Step Function execution path for a GIBS ingest workflow

    GIBS Ingest Granules Workflow

    This shows a visualization of an execution of the ingest granules workflow in step functions. The steps highlighted in green are the ones that executed and completed successfully.

    Diagram showing the AWS Step Function execution path for a GIBS ingest granules workflow

    - + \ No newline at end of file diff --git a/docs/v9.9.0/workflows/input_output/index.html b/docs/v9.9.0/workflows/input_output/index.html index 96c8e706707..6ce233451b3 100644 --- a/docs/v9.9.0/workflows/input_output/index.html +++ b/docs/v9.9.0/workflows/input_output/index.html @@ -5,14 +5,14 @@ Workflow Inputs & Outputs | Cumulus Documentation - +
    Version: v9.9.0

    Workflow Inputs & Outputs

    General Structure

    Cumulus uses a common format for all inputs and outputs to workflows. The same format is used for input and output from workflow steps. The common format consists of a JSON object which holds all necessary information about the task execution and AWS environment. Tasks return objects identical in format to their input with the exception of a task-specific payload field. Tasks may also augment their execution metadata.

    Cumulus Message Adapter

    The Cumulus Message Adapter and Cumulus Message Adapter libraries help task developers integrate their tasks into a Cumulus workflow. These libraries adapt input and outputs from tasks into the Cumulus Message format. The Scheduler service creates the initial event message by combining the collection configuration, external resource configuration, workflow configuration, and deployment environment settings. The subsequent workflow messages between tasks must conform to the message schema. By using the Cumulus Message Adapter, individual task Lambda functions only receive the input and output specifically configured for the task, and not non-task-related message fields.

    The Cumulus Message Adapter libraries are called by the tasks with a callback function containing the business logic of the task as a parameter. They first adapt the incoming message to a format more easily consumable by Cumulus tasks, then invoke the task, and then adapt the task response back to the Cumulus message protocol to be sent to the next task.

    A task's Lambda function can be configured to include a Cumulus Message Adapter library which constructs input/output messages and resolves task configurations. The CMA can then be included in one of several ways:

    Lambda Layer

    In order to make use of this configuration, a Lambda layer must be uploaded to your account. Due to platform restrictions, Core cannot currently support sharable public layers, however you can deploy the appropriate version from the release page in two ways:

    Once you've deployed the layer, integrate the CMA layer with your Lambdas:

    • If using the cumulus module, set the cumulus_message_adapter_lambda_layer_version_arn in your .tfvars file to integrate the CMA layer with all core Cumulus lambdas.
    • If including your own Lambda or ECS task Terraform modules, specify the CMA layer ARN in the Terraform resource definitions. Also, make sure to set the CUMULUS_MESSAGE_ADAPTER_DIR environment variable for the task to /opt for the CMA integration to work properly.
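
    For example, if including your own Lambda Terraform module, a minimal sketch of the layer integration might look like the following; the role ARN variable, file names, and function name are placeholders:

    resource "aws_lambda_function" "my_task" {
      function_name = "${var.prefix}-MyTask"
      filename      = "build/my-task.zip"
      handler       = "index.handler"
      runtime       = "nodejs12.x"
      role          = var.lambda_processing_role_arn

      # Attach the deployed CMA layer and point the task at its mount path
      layers = [var.cumulus_message_adapter_lambda_layer_version_arn]

      environment {
        variables = {
          CUMULUS_MESSAGE_ADAPTER_DIR = "/opt"
        }
      }
    }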

    In the future if you wish to update/change the CMA version you will need to update the deployed CMA, and update the layer configuration for the impacted Lambdas as needed.

    Please Note: Updating/removing a layer does not change a deployed Lambda, so to update the CMA you should deploy a new version of the CMA layer, update the associated Lambda configuration to reference the new CMA version, and re-deploy your Lambdas.

    Manual Addition

    You can include the CMA package in the Lambda code in the cumulus-message-adapter sub-directory of your Lambda .zip, for any Lambda runtime that includes a Python runtime. Python 2 is included in Lambda runtimes that use Amazon Linux; however, Amazon Linux 2 does not support this directly.

    Please note: It is expected that upcoming Cumulus releases will update the CMA layer to include a python runtime.

    If you are manually adding the message adapter to your source and utilizing the CMA, you should set the Lambda's CUMULUS_MESSAGE_ADAPTER_DIR environment variable to target the installation path for the CMA.
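
    A hedged sketch of the manual approach follows; the asset name, paths, and zip layout are placeholders, so check the cumulus-message-adapter release page for the actual artifact:

    # Unpack a CMA release into a cumulus-message-adapter sub-directory of the
    # Lambda package contents:
    unzip cumulus-message-adapter.zip -d build/cumulus-message-adapter

    # Then zip the build directory as the Lambda deployment package and set the
    # Lambda's CUMULUS_MESSAGE_ADAPTER_DIR environment variable to the unpacked
    # path inside the package (e.g. /var/task/cumulus-message-adapter).
    (cd build && zip -r ../my-task.zip .)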

    CMA Input/Output

    Input to the task application code is a JSON object with the following keys:

    • input: By default, the incoming payload is the payload output from the previous task, or it can be a portion of the payload as configured for the task in the corresponding .tf workflow definition file.
    • config: Task-specific configuration object with URL templates resolved.

    Output from the task application code is placed in the payload key by default, but the config key can also be used to return just a portion of the task output.

    CMA configuration

    As of Cumulus > 1.15 and CMA > v1.1.1, configuration of the CMA is expected to be driven by AWS Step Function Parameters.

    Using the CMA package with the Lambda by any of the above mentioned methods (Lambda Layers, manual) requires configuration for its various features via a specific Step Function Parameters configuration format (see sample workflows in the examples cumulus-tf source for more examples):

    {
      "cma": {
        "event.$": "$",
        "ReplaceConfig": "{some config}",
        "task_config": "{some config}"
      }
    }

    The "event.$": "$" parameter is required as it passes the entire incoming message to the CMA client library for parsing, and the CMA itself to convert the incoming message into a Cumulus message for use in the function.

    The following are the CMA's current configuration settings:

    ReplaceConfig (Cumulus Remote Message)

    Because of the potential size of a Cumulus message, mainly the payload field, a task can be set via configuration to store a portion of its output on S3, with a Remote Message key that defines how to retrieve it and an empty JSON object {} in its place. If the portion of the message targeted exceeds the configured MaxSize (which defaults to 0 bytes), it will be written to S3.

    The CMA remote message functionality can be configured using parameters in several ways:

    Partial Message

    Setting the Path/TargetPath in the ReplaceConfig parameter (and optionally a non-default MaxSize):

    {
      "DiscoverGranules": {
        "Parameters": {
          "cma": {
            "event.$": "$",
            "ReplaceConfig": {
              "MaxSize": 1,
              "Path": "$.payload",
              "TargetPath": "$.payload"
            }
          }
        }
      }
    }

    will result in any payload output larger than the MaxSize (in bytes) being written to S3. The CMA will then mark that the key has been replaced via a replace key on the event. When the CMA picks up the replace key in future steps, it will attempt to retrieve the output from S3 and write it back to payload.

    Note that you can optionally use a different TargetPath than Path, however as the target is a JSON path there must be a key to target for replacement in the output of that step. Also note that the JSON path specified must target one node, otherwise the CMA will error, as it does not support multiple replacement targets.

    If TargetPath is omitted, it will default to the value for Path.

    Full Message

    Setting the following parameters for a lambda:

    DiscoverGranules:
      Parameters:
        cma:
          event.$: '$'
          ReplaceConfig:
            FullMessage: true

    will result in the CMA assuming the entire inbound message should be stored to S3 if it exceeds the default max size.

    This is effectively the same as doing:

    {
      "DiscoverGranules": {
        "Parameters": {
          "cma": {
            "event.$": "$",
            "ReplaceConfig": {
              "MaxSize": 0,
              "Path": "$",
              "TargetPath": "$"
            }
          }
        }
      }
    }

    Cumulus Message example

    {
      "task_config": {
        "inlinestr": "prefix{meta.foo}suffix",
        "array": "{[$.meta.foo]}",
        "object": "{$.meta}"
      },
      "cumulus_meta": {
        "message_source": "sfn",
        "state_machine": "arn:aws:states:us-east-1:1234:stateMachine:MySfn",
        "execution_name": "MyExecution__id-1234",
        "id": "id-1234"
      },
      "meta": {
        "foo": "bar"
      },
      "payload": {
        "anykey": "anyvalue"
      }
    }

    Cumulus Remote Message example

    The message may contain a reference to an S3 Bucket, Key and TargetPath as follows:

    {
      "replace": {
        "Bucket": "cumulus-bucket",
        "Key": "my-large-event.json",
        "TargetPath": "$"
      },
      "cumulus_meta": {}
    }

    task_config

    This configuration key contains the input/output configuration values for definition of inputs/outputs via URL paths. Important: these values are all relative to the JSON object configured for event.$.

    This configuration's behavior is outlined in the CMA step description below.

    The configuration should follow the format:

    {
      "FunctionName": {
        "Parameters": {
          "cma": {
            "event.$": "$",
            "other_cma_configuration": "<config object>",
            "task_config": "<task config>"
          }
        }
      }
    }

    Example:

    {
      "StepFunction": {
        "Parameters": {
          "cma": {
            "event.$": "$",
            "task_config": {
              "sfnEnd": true,
              "stack": "{$.meta.stack}",
              "bucket": "{$.meta.buckets.internal.name}",
              "stateMachine": "{$.cumulus_meta.state_machine}",
              "executionName": "{$.cumulus_meta.execution_name}",
              "cumulus_message": {
                "input": "{$}"
              }
            }
          }
        }
      }
    }

    Cumulus Message Adapter Steps

    1. Reformat AWS Step Function message into Cumulus Message

    Due to the way AWS handles Parameterized messages, when Parameters are used the CMA takes an inbound message:

    {
      "resource": "arn:aws:lambda:us-east-1:<lambda arn values>",
      "input": {
        "Other Parameter": {},
        "cma": {
          "ConfigKey": {
            "config values": "some config values"
          },
          "event": {
            "cumulus_meta": {},
            "payload": {},
            "meta": {},
            "exception": {}
          }
        }
      }
    }

    and takes the following actions:

    • Takes the object at input.cma.event and makes it the full input
    • Merges all of the keys except event under input.cma into the parent input object

    This results in the incoming message (presumably a Cumulus message), with any cma configuration parameters merged in, being passed to the CMA. All other parameterized values defined outside of the cma key are ignored.

    2. Resolve Remote Messages

    If the incoming Cumulus message has a replace key value, the CMA will attempt to pull the payload from S3.

    For example, if the incoming message contains the following:

      "meta": {
    "foo": {}
    },
    "replace": {
    "TargetPath": "$.meta.foo",
    "Bucket": "some_bucket",
    "Key": "events/some-event-id"
    }

    The CMA will attempt to pull the file stored at Bucket/Key and replace the value at TargetPath, then remove the replace object entirely and continue.

    3. Resolve URL templates in the task configuration

    In the workflow configuration (defined under the task_config key), each task has its own configuration, and it can use URL templates as values to achieve simplicity or for values only available at execution time. The Cumulus Message Adapter resolves the URL templates (relative to the event configuration key) and then passes the message to the next task. For example, given a task which has the following configuration:

    {
      "Parameters": {
        "cma": {
          "event.$": "$",
          "task_config": {
            "provider": "{$.meta.provider}",
            "inlinestr": "prefix{meta.foo}suffix",
            "array": "{[$.meta.foo]}",
            "object": "{$.meta}"
          }
        }
      }
    }

    and an incoming message that contains:

    {
      "meta": {
        "foo": "bar",
        "provider": {
          "id": "FOO_DAAC",
          "anykey": "anyvalue"
        }
      }
    }

    The corresponding Cumulus Message would contain:

    "meta": {
    "foo": "bar",
    "provider": {
    "id": "FOO_DAAC",
    "anykey": "anyvalue"
    }
    },
    "task_config": {
    "provider": "{$.meta.provider}",
    "inlinestr": "prefix{meta.foo}suffix",
    "array": "{[$.meta.foo]}",
    "object": "{$.meta}"
    }

    The message sent to the task would be:

    "config" : {
    "provider": {
    "id": "FOO_DAAC",
    "anykey": "anyvalue"
    },
    "inlinestr": "prefixbarsuffix",
    "array": ["bar"],
    "object": {
    "foo": "bar",
    "provider": {
    "id": "FOO_DAAC",
    "anykey": "anyvalue"
    }
    },
    },
    "input": "{...}"

    URL template variables replace dotted paths inside curly brackets with their corresponding value. If the Cumulus Message Adapter cannot resolve a value, it will ignore the template, leaving it verbatim in the string. While seemingly complex, this allows significant decoupling of Tasks from one another and the data that drives them. Tasks are able to easily receive runtime configuration produced by previously run tasks and domain data.

    4. Resolve task input

    By default, the incoming payload is the payload from the previous task. The task can also be configured to use a portion of the payload as its input message. For example, given a task that specifies cma.task_config.cumulus_message.input:

    ExampleTask:
      Parameters:
        cma:
          event.$: '$'
          task_config:
            cumulus_message:
              input: '{$.payload.foo}'

    The task configuration in the message would be:

    {
      "task_config": {
        "cumulus_message": {
          "input": "{$.payload.foo}"
        }
      },
      "payload": {
        "foo": {
          "anykey": "anyvalue"
        }
      }
    }

    The Cumulus Message Adapter will resolve the task input; instead of sending the whole payload as task input, the task input would be:

    {
      "input": {
        "anykey": "anyvalue"
      },
      "config": {...}
    }

    5. Resolve task output

    By default, the task's return value is the next payload. However, the workflow task configuration can specify a portion of the return value as the next payload, and can also augment values to other fields. Based on the task configuration under cma.task_config.cumulus_message.outputs, the Message Adapter uses a task's return value to output a message as configured by the task-specific config defined under cma.task_config. The Message Adapter dispatches a "source" to a "destination" as defined by URL templates stored in the task-specific cumulus_message.outputs. The value of the task's return value at the "source" URL is used to create or replace the value of the task's return value at the "destination" URL. For example, given a task that specifies cumulus_message.outputs in its workflow configuration as follows:

{
  "ExampleTask": {
    "Parameters": {
      "cma": {
        "event.$": "$",
        "task_config": {
          "cumulus_message": {
            "outputs": [
              {
                "source": "{$}",
                "destination": "{$.payload}"
              },
              {
                "source": "{$.output.anykey}",
                "destination": "{$.meta.baz}"
              }
            ]
          }
        }
      }
    }
  }
}

    The corresponding Cumulus Message would be:

{
  "task_config": {
    "cumulus_message": {
      "outputs": [
        {
          "source": "{$}",
          "destination": "{$.payload}"
        },
        {
          "source": "{$.output.anykey}",
          "destination": "{$.meta.baz}"
        }
      ]
    }
  },
  "meta": {
    "foo": "bar"
  },
  "payload": {
    "anykey": "anyvalue"
  }
}

    Given the response from the task is:

        {
    "output": {
    "anykey": "boo"
    }
    }

    The Cumulus Message Adapter would output the following Cumulus Message:

{
  "task_config": {
    "cumulus_message": {
      "outputs": [
        {
          "source": "{$}",
          "destination": "{$.payload}"
        },
        {
          "source": "{$.output.anykey}",
          "destination": "{$.meta.baz}"
        }
      ]
    }
  },
  "meta": {
    "foo": "bar",
    "baz": "boo"
  },
  "payload": {
    "output": {
      "anykey": "boo"
    }
  }
}

    6. Apply Remote Message Configuration

If the ReplaceConfig configuration parameter is defined, the CMA will evaluate the configuration options provided and, if required, write a portion of the Cumulus Message to S3 and add a replace key to the message for future steps to utilize.

Please Note: the non-user-modifiable field cumulus_meta will always be retained, regardless of the configuration.

For example, if the output message (after the output configuration has been applied) looks like:

{
  "cumulus_meta": {
    "some_key": "some_value"
  },
  "ReplaceConfig": {
    "FullMessage": true
  },
  "task_config": {
    "cumulus_message": {
      "outputs": [
        {
          "source": "{$}",
          "destination": "{$.payload}"
        },
        {
          "source": "{$.output.anykey}",
          "destination": "{$.meta.baz}"
        }
      ]
    }
  },
  "meta": {
    "foo": "bar",
    "baz": "boo"
  },
  "payload": {
    "output": {
      "anykey": "boo"
    }
  }
}

    the resultant output would look like:

{
  "cumulus_meta": {
    "some_key": "some_value"
  },
  "replace": {
    "TargetPath": "$",
    "Bucket": "some-internal-bucket",
    "Key": "events/some-event-id"
  }
}

    Additional features

    Validate task input, output and configuration messages against the schemas provided

    The Cumulus Message Adapter has the capability to validate task input, output and configuration messages against their schemas. The default location of the schemas is the schemas folder in the top level of the task and the default filenames are input.json, output.json, and config.json. The task can also configure a different schema location. If no schema can be found, the Cumulus Message Adapter will not validate the messages.
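As a hedged sketch only (the title and field names are illustrative, not taken from any particular task), a minimal schemas/input.json might look like:

{
  "title": "ExampleTaskInput",
  "type": "object",
  "required": ["granules"],
  "properties": {
    "granules": {
      "type": "array",
      "items": { "type": "object" }
    }
  }
}

A task would keep analogous output.json and config.json files alongside it for its output and configuration messages.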

    - + \ No newline at end of file diff --git a/docs/v9.9.0/workflows/lambda/index.html b/docs/v9.9.0/workflows/lambda/index.html index c41e5f68aca..d2cbdde0d88 100644 --- a/docs/v9.9.0/workflows/lambda/index.html +++ b/docs/v9.9.0/workflows/lambda/index.html @@ -5,13 +5,13 @@ Develop Lambda Functions | Cumulus Documentation - +
    Version: v9.9.0

    Develop Lambda Functions

    Develop a new Cumulus Lambda

AWS provides a great getting started guide for building Lambdas in the developer guide.

    Cumulus currently supports the following environments for Cumulus Message Adapter enabled functions:

Additionally, you may choose to include any of the other languages AWS supports as a resource, with reduced feature support.

    Deploy a Lambda

    Node.js Lambda

For a new Node.js Lambda, create a new function and add an aws_lambda_function resource to your Cumulus deployment (for examples, see example/lambdas.tf and ingest/lambda-functions.tf in the Cumulus source), either as a new .tf file or added to an existing .tf file:

    resource "aws_lambda_function" "myfunction" {
    function_name = "${var.prefix}-function"
    filename = "/path/to/zip/lambda.zip"
    source_code_hash = filebase64sha256("/path/to/zip/lambda.zip")
    handler = "index.handler"
    role = module.cumulus.lambda_processing_role_arn
    runtime = "nodejs10.x"

    vpc_config {
    subnet_ids = var.subnet_ids
    security_group_ids = var.security_group_ids
    }
    }

    Please note: This example contains the minimum set of required configuration.

Make sure to include a vpc_config that matches the information you've provided to the cumulus module if you intend to integrate the Lambda with a Cumulus deployment.

    Java Lambda

    Java Lambdas are created in much the same way as the Node.js example above.

    The source points to a folder with the compiled .class files and dependency libraries in the Lambda Java zip folder structure (details here), not an uber-jar.

    The deploy folder referenced here would contain a folder 'test_task/task/' which contains Task.class and TaskLogic.class as well as a lib folder containing dependency jars.

    Python Lambda

    Python Lambdas are created the same way as the Node.js example above.

    Cumulus Message Adapter

For Lambdas wishing to utilize the Cumulus Message Adapter (CMA), you should define a layers key on your Lambda resource with the CMA you wish to include. See the input_output docs for more on how to create/use the CMA.

    Other Lambda Options

    Cumulus supports all of the options available to you via the aws_lambda_function Terraform resource. For more information on what's available, check out the Terraform resource docs.

    Cloudwatch log groups

If you want to enable Cloudwatch logging for your Lambda resource, you'll need to add an aws_cloudwatch_log_group resource to your Lambda definition:

    resource "aws_cloudwatch_log_group" "myfunction_log_group" {
    name = "/aws/lambda/${aws_lambda_function.myfunction.function_name}"
    retention_in_days = 30
    tags = { Deployment = var.prefix }
    }
    - + \ No newline at end of file diff --git a/docs/v9.9.0/workflows/protocol/index.html b/docs/v9.9.0/workflows/protocol/index.html index 8f742d9cd8e..e2e3f915853 100644 --- a/docs/v9.9.0/workflows/protocol/index.html +++ b/docs/v9.9.0/workflows/protocol/index.html @@ -5,13 +5,13 @@ Workflow Protocol | Cumulus Documentation - +
    Version: v9.9.0

    Workflow Protocol

    Configuration and Message Use Diagram

    A diagram showing at which point in a workflow the Cumulus message is checked for conformity with the message schema and where the configuration is checked for conformity with the configuration schema

    • Configuration - The Cumulus workflow configuration defines everything needed to describe an instance of Cumulus.
    • Scheduler - This starts ingest of a collection on configured intervals.
    • Input to Step Functions - The Scheduler uses the Configuration as source data to construct the input to the Workflow.
    • AWS Step Functions - Run the workflows as kicked off by the scheduler or other processes.
    • Input to Task - The input for each task is a JSON document that conforms to the message schema.
    • Output from Task - The output of each task must conform to the message schemas as well and is used as the input for the subsequent task.
    - + \ No newline at end of file diff --git a/docs/v9.9.0/workflows/workflow-configuration-how-to/index.html b/docs/v9.9.0/workflows/workflow-configuration-how-to/index.html index d09277d9d42..339506a870b 100644 --- a/docs/v9.9.0/workflows/workflow-configuration-how-to/index.html +++ b/docs/v9.9.0/workflows/workflow-configuration-how-to/index.html @@ -5,7 +5,7 @@ Workflow Configuration How To's | Cumulus Documentation - + @@ -15,7 +15,7 @@ To take a subset of any given metadata, use the option substring.

    "url_path": "{cmrMetadata.Granule.Collection.ShortName}/{substring(file.name, 0, 3)}"

    This example will populate to "MOD09GQ/MOD"

    In addition to substring, several datetime-specific functions are available, which can parse a datetime string in the metadata and extract a certain part of it:

    "url_path": "{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}"

    or

     "url_path": "{dateFormat(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime, YYYY-MM-DD[T]HH[:]mm[:]ss)}"

    The following functions are implemented:

    • extractYear - returns the year, formatted as YYYY
    • extractMonth - returns the month, formatted as MM
    • extractDate - returns the day of the month, formatted as DD
    • extractHour - returns the hour in 24-hour format, with no leading zero
    • dateFormat - takes a second argument describing how to format the date, and passes the metadata date string and the format argument to moment().format()

Note: the move-granules step needs to be in the workflow for this template to be populated and the file moved. The cmrMetadata (CMR granule XML) needs to have been generated and stored on S3; from there, any field can be retrieved and used for a url_path.

    Adding Metadata dates and times to the URL Path

    There are a number of options to pull dates from the CMR file metadata. With this metadata:

<Granule>
  <Temporal>
    <RangeDateTime>
      <BeginningDateTime>2003-02-19T00:00:00Z</BeginningDateTime>
      <EndingDateTime>2003-02-19T23:59:59Z</EndingDateTime>
    </RangeDateTime>
  </Temporal>
</Granule>

    The following examples of url_path could be used.

    {extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the year from the full date: 2003.

    {extractMonth(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the month: 2.

    {extractDate(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the day: 19.

    {extractHour(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the hour: 0.

    Different values can be combined to create the url_path. For example

    {
    "bucket": "sample-protected-bucket",
    "name": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
    "url_path": "{cmrMetadata.Granule.Collection.ShortName}/{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)/extractDate(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}"
    }

    The final file location for the above would be s3://sample-protected-bucket/MOD09GQ/2003/19/MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.

    - + \ No newline at end of file diff --git a/docs/v9.9.0/workflows/workflow-triggers/index.html b/docs/v9.9.0/workflows/workflow-triggers/index.html index 80c5ca280bd..61f5e25e65d 100644 --- a/docs/v9.9.0/workflows/workflow-triggers/index.html +++ b/docs/v9.9.0/workflows/workflow-triggers/index.html @@ -5,13 +5,13 @@ Workflow Triggers | Cumulus Documentation - +
    Version: v9.9.0

    Workflow Triggers

    For a workflow to run, it needs to be associated with a rule (see rule configuration). The rule configuration determines how and when a workflow execution is triggered. Rules can be triggered one time, on a schedule, or by new data written to a kinesis stream.

    There are three lambda functions in the API package responsible for scheduling and starting workflows: SF scheduler, message consumer, and SF starter. Each Cumulus instance comes with a Start SF SQS queue.

The SF scheduler lambda puts a message onto the Start SF queue. This message is picked up by the Start SF lambda, and an execution is started with the body of the message as the input.

When a one-time rule is created, the schedule SF lambda is triggered. Rules that are not one-time are associated with a CloudWatch event, which manages triggering the lambdas that start the workflows.

    For a scheduled rule, the Cloudwatch event is triggered on the given schedule which calls directly to the schedule SF lambda.

    For a kinesis rule, when data is added to the kinesis stream, the Cloudwatch event is triggered, which calls the message consumer lambda. The message consumer lambda parses the kinesis message and finds all of the rules associated with that message. For each rule (which corresponds to one workflow), the schedule SF lambda is triggered to queue a message to start the workflow.

For an SNS rule, when a message is published to the SNS topic, the message consumer receives the SNS message (JSON expected), parses it into an object, starts a new execution of the workflow associated with the rule, and passes the object in the payload field of the Cumulus message.

    Diagram showing how workflows are scheduled via rules

    - + \ No newline at end of file diff --git a/docs/workflow_tasks/discover_granules/index.html b/docs/workflow_tasks/discover_granules/index.html index 4b0ac3e4f98..09c2f19e8c8 100644 --- a/docs/workflow_tasks/discover_granules/index.html +++ b/docs/workflow_tasks/discover_granules/index.html @@ -5,7 +5,7 @@ Discover Granules | Cumulus Documentation - + @@ -21,7 +21,7 @@ included in a granule's file list. That is, no such filtering based on filename occurs as described above.

    When set on the task configuration, the value applies to all collections during discovery. Otherwise, this property may be set on individual collections.

    Concurrency

    A number property that determines the level of concurrency with which granule duplicate checks are performed when duplicateGranuleHandling is skip or error.

    Limiting concurrency helps to avoid throttling by the AWS Lambda API and helps to avoid encountering account Lambda concurrency limitations.

    We do not recommend increasing this value unless you are seeing Lambda.Timeout errors when discover-granules discovers a large number of granules with skip or error duplicate handling. However, as increasing the concurrency may lead to Lambda API or Lambda concurrency throttling errors, you may wish to consider converting the discover-granules task to an ECS activity, which does not face similar runtime constraints.

    The default value is 3.
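As an illustrative sketch only (assuming the configuration key is concurrency, mirroring the queue-granules config described later in this documentation), the property might be set alongside duplicateGranuleHandling in the task's workflow configuration like so:

{
  "task_config": {
    "duplicateGranuleHandling": "skip",
    "concurrency": 10
  }
}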

    Task Outputs

    This task outputs an assembled array of Cumulus granule objects as the payload for the next task, and returns only the expected payload for the next task.

    - + \ No newline at end of file diff --git a/docs/workflow_tasks/files_to_granules/index.html b/docs/workflow_tasks/files_to_granules/index.html index f76dd84dc72..279b3aaf1ae 100644 --- a/docs/workflow_tasks/files_to_granules/index.html +++ b/docs/workflow_tasks/files_to_granules/index.html @@ -5,13 +5,13 @@ Files To Granules | Cumulus Documentation - +
    Version: v15.0.2

    Files To Granules

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions and more can be found on the auto-generated Cumulus Tasks page.

    Summary

This task utilizes the incoming config.inputGranules and the task input list of S3 URIs, along with the rest of the configuration objects, to sort the list of incoming files into a list of granule objects.

Please note: files passed in without metadata previously defined in config.inputGranules will have the following keys added:

    • size
    • bucket
    • key
    • fileName

    It is primarily intended to support compatibility with the standard output of a processing task, and convert that output into a granule object accepted as input by the majority of other Cumulus tasks.

    Task Inputs

    Input

    This task expects an incoming input that contains an array of 'staged' S3 URIs to move to their final archive location.

    For the specifics, see the Cumulus Tasks page entry for the schema.

    Configuration

    This task does expect values to be set in the workflow_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Below are expanded descriptions of selected config keys:

    inputGranules

    An array of Cumulus granule objects.

    This object will be used to define metadata values for the move granules task, and is the basis for the updated object that will be added to the output.
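As a rough, hedged sketch (the bucket, key, and granuleId values are placeholders), the task might receive something like the following, matching each incoming S3 URI to a granule defined in config.inputGranules:

{
  "input": [
    "s3://example-staging-bucket/stage/MOD09GQ.A2017025.h21v00.006.2017034065104.hdf"
  ],
  "config": {
    "inputGranules": [
      {
        "granuleId": "MOD09GQ.A2017025.h21v00.006.2017034065104",
        "files": []
      }
    ]
  }
}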

    Task Outputs

    This task outputs an assembled array of Cumulus granule objects as the payload for the next task, and returns only the expected payload for the next task.

    - + \ No newline at end of file diff --git a/docs/workflow_tasks/lzards_backup/index.html b/docs/workflow_tasks/lzards_backup/index.html index 93373e39c30..0d2aa6dc2f3 100644 --- a/docs/workflow_tasks/lzards_backup/index.html +++ b/docs/workflow_tasks/lzards_backup/index.html @@ -5,13 +5,13 @@ LZARDS Backup | Cumulus Documentation - +
    Version: v15.0.2

    LZARDS Backup

    The LZARDS backup task takes an array of granules and initiates backup requests to the LZARDS API, which will be handled asynchronously by LZARDS.

    Deployment

    The LZARDS backup task is not automatically deployed with Cumulus. To deploy the task through the Cumulus module, first you must specify a lzards_launchpad_passphrase in your terraform variables (e.g. variables.tf) like so:

    variable "lzards_launchpad_passphrase" {
    type = string
    default = ""
    }

    Then you can specify a value for your lzards_launchpad_passphrase in terraform.tfvars like so:

lzards_launchpad_passphrase = "your-passphrase"

    Lastly, you need to make sure that the lzards_launchpad_passphrase is passed into the Cumulus module (in main.tf) like so:

    lzards_launchpad_passphrase  = var.lzards_launchpad_passphrase

    In short, deploying the LZARDS task requires configuring a passphrase variable and ensuring that your TF configuration passes that variable into the Cumulus module.

Additional terraform configuration for the LZARDS task can be found in the cumulus module's variables.tf file, where the relevant variables are prefixed with lzards_. You can add these variables to your deployment using the same process outlined above for lzards_launchpad_passphrase.

    Task Inputs

    Input

    This task expects an array of granules as input.

    For the specifics, see the Cumulus Tasks page entry for the schema.

    Configuration

    This task does expect values to be set in the workflow_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Task Outputs

    Output

    The LZARDS task outputs a composite object containing:

    • the input granules array, and
    • a backupResults object that describes the results of LZARDS backup attempts.

    For the specifics, see the Cumulus Tasks page entry for the schema.

    - + \ No newline at end of file diff --git a/docs/workflow_tasks/move_granules/index.html b/docs/workflow_tasks/move_granules/index.html index 942db7201e5..9be45816dc3 100644 --- a/docs/workflow_tasks/move_granules/index.html +++ b/docs/workflow_tasks/move_granules/index.html @@ -5,13 +5,13 @@ Move Granules | Cumulus Documentation - +
    Version: v15.0.2

    Move Granules

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions and more can be found on the auto-generated Cumulus Tasks page.

    Summary

    This task utilizes the incoming event.input array of Cumulus granule objects to do the following:

    • Move granules from their 'staging' location to the final location (as configured in the Sync Granules task)

    • Update the event.input object with the new file locations.

• If the granule has an ECHO10/UMM CMR file (.cmr.xml or .cmr.json) included in the event.input:

      • Update that file's access locations

      • Add it to the appropriate access URL category for the CMR filetype as defined by granule CNM filetype.

      • Set the CMR file to 'metadata' in the output granules object and add it to the granule files if it's not already present.

        Please note: Granules without a valid CNM type set in the granule file type field in event.input will be treated as "data" in the updated CMR metadata file

• The task then outputs an updated list of granule objects.

    Task Inputs

    Input

    This task expects an incoming input that contains a list of 'staged' S3 URIs to move to their final archive location. If CMR metadata is to be updated for a granule, it must also be included in the input.

    For the specifics, see the Cumulus Tasks page entry for the schema.

    Configuration

    This task does expect values to be set in the workflow_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Input

    This task expects event.input to provide an array of Cumulus granule objects. The files listed for each granule represent the files to be acted upon as described in summary.

    Task Outputs

    This task outputs an assembled array of Cumulus granule objects with post-move file locations as the payload for the next task, and returns only the expected payload for the next task. If a CMR file has been specified for a granule object, the CMR resources related to the granule files will be updated according to the updated granule file metadata.

    Examples

    See the SIPS workflow cookbook for an example of this task in a workflow

    - + \ No newline at end of file diff --git a/docs/workflow_tasks/parse_pdr/index.html b/docs/workflow_tasks/parse_pdr/index.html index d9f713bd72c..fa04239f665 100644 --- a/docs/workflow_tasks/parse_pdr/index.html +++ b/docs/workflow_tasks/parse_pdr/index.html @@ -5,13 +5,13 @@ Parse PDR | Cumulus Documentation - +
    Version: v15.0.2

    Parse PDR

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions and more can be found on the auto-generated Cumulus Tasks page.

    Summary

    The purpose of this task is to do the following with the incoming PDR object:

    • Stage it to an internal S3 bucket

    • Parse the PDR

    • Archive the PDR and remove the staged file if successful

• Output a payload object containing metadata about the parsed PDR (e.g. total size of all files, file counts, etc.) and a granules object

The constructed granules object is created using PDR metadata to determine values like data type and version, and collection definitions to determine a file storage location based on the extracted data type and version number.

    Granule file types are converted from the PDR spec types to CNM types according to the following translation table:

      HDF: 'data',
    HDF-EOS: 'data',
    SCIENCE: 'data',
    BROWSE: 'browse',
    METADATA: 'metadata',
    BROWSE_METADATA: 'metadata',
    QA_METADATA: 'metadata',
    PRODHIST: 'qa',
    QA: 'metadata',
    TGZ: 'data',
    LINKAGE: 'data'

Files missing file types will have none assigned; files with invalid types will result in a PDR parse failure.

    Task Inputs

    Input

    This task expects an incoming input that contains name and path information about the PDR to be parsed. For the specifics, see the Cumulus Tasks page entry for the schema.

    Configuration

    This task does expect values to be set in the workflow_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Below are expanded descriptions of selected config keys:

    Provider

    A Cumulus provider object. Used to define connection information for retrieving the PDR.

    Bucket

    Defines the bucket where the 'pdrs' folder for parsed PDRs will be stored.

    Collection

    A Cumulus collection object. Used to define granule file groupings and granule metadata for discovered files.

    Task Outputs

This task outputs a single payload output object containing metadata about the parsed PDR (e.g. filesCount, totalSize, etc.), a pdr object with information for later steps, and the generated array of granule objects.
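As a hedged sketch only (key names other than filesCount and totalSize are assumptions for illustration, not taken from the schema), the output payload might resemble:

{
  "filesCount": 2,
  "totalSize": 1024,
  "pdr": {
    "name": "example.PDR",
    "path": "/example/path"
  },
  "granules": [
    {
      "granuleId": "example-granule",
      "dataType": "EXAMPLE_TYPE",
      "version": "001",
      "files": []
    }
  ]
}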

    Examples

    See the SIPS workflow cookbook for an example of this task in a workflow

    - + \ No newline at end of file diff --git a/docs/workflow_tasks/queue_granules/index.html b/docs/workflow_tasks/queue_granules/index.html index 39800d85890..8af494359a5 100644 --- a/docs/workflow_tasks/queue_granules/index.html +++ b/docs/workflow_tasks/queue_granules/index.html @@ -5,14 +5,14 @@ Queue Granules | Cumulus Documentation - +
    Version: v15.0.2

    Queue Granules

    This task utilizes the Cumulus Message Adapter to interpret and construct incoming and outgoing messages.

    Links to the npm package, task input, output and configuration schema definitions, and more can be found on the auto-generated Cumulus Tasks page.

    Summary

    The purpose of this task is to schedule ingest of granules that were discovered on a remote host, whether via the DiscoverGranules task or the ParsePDR task.

The task uses a defined collection in concert with a defined provider, either set on each granule or passed in via config, to queue up ingest executions for each granule or for batches of granules.

The constructed granules object is defined by the collection passed in the configuration, and has impacts on other provided core Cumulus Tasks.

    Users of this task in a workflow are encouraged to carefully consider their configuration in context of downstream tasks and workflows.

    Task Inputs

Each of the following sections is a high-level discussion of the intent of the various input/output/config values.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Input

    This task expects an incoming input that contains granules and information about them and their files. For the specifics, see the Cumulus Tasks page entry for the schema.

    This input is most commonly the output from a preceding DiscoverGranules or ParsePDR task.

    Cumulus Configuration

    This task does expect values to be set in the task_config CMA parameters for the workflows. A schema exists that defines the requirements for the task.

    For the most recent config.json schema, please see the Cumulus Tasks page entry for the schema.

    Below are expanded descriptions of selected config keys:

    provider

    A Cumulus provider object for the originating provider. Will be passed along to the ingest workflow. This will be overruled by more specific provider information that may exist on a granule.

    internalBucket

    The Cumulus internal system bucket.

    granuleIngestWorkflow

    A string property that denotes the name of the ingest workflow into which granules should be queued.

    queueUrl

    A string property that denotes the URL of the queue to which scheduled execution messages are sent.

    preferredQueueBatchSize

    A number property that sets an upper bound on the size of each batch of granules queued into the payload of an ingest execution. Setting this property to a value higher than 1 allows queueing of multiple granules per ingest workflow.

    As ingest executions typically expect granules in the payload to have a common collection and common provider, this property only sets an upper bound within which batches will be created based on common collection and provider information.

    This means batches may be smaller than the preferred size if collection or provider information diverge, but never larger.

    The default value if none is specified is 1, which will queue one ingest execution per granule.

    concurrency

    A number property that determines the level of concurrency with which ingest executions are scheduled. Granules or batches of granules will be queued up into executions at this level of concurrency.

    This property is also used to limit concurrency when updating granule status to queued.

    Limiting concurrency helps to avoid throttling by the AWS Lambda API and helps to avoid encountering account Lambda concurrency limitations.

    We do not recommend increasing this value unless you are seeing Lambda.Timeout errors when queue-granules receives a large number of granules as input. However, as increasing the concurrency may lead to Lambda API or Lambda concurrency throttling errors, you may wish to consider converting the queue-granules task to an ECS activity, which does not face similar runtime constraints.

    The default value is 3.

    executionNamePrefix

    A string property that will prefix the names of scheduled executions.

    childWorkflowMeta

    An object property that will be merged into the scheduled execution input's meta field.
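Putting several of these keys together, a hedged sketch of a queue-granules task_config might look like the following (every value is an illustrative placeholder, not a required or recommended setting):

{
  "task_config": {
    "provider": "{$.meta.provider}",
    "internalBucket": "{$.meta.buckets.internal.name}",
    "granuleIngestWorkflow": "IngestGranule",
    "queueUrl": "{$.meta.queues.startSF}",
    "preferredQueueBatchSize": 1,
    "concurrency": 3,
    "executionNamePrefix": "example-prefix",
    "childWorkflowMeta": {
      "staticValue": "example-static-value"
    }
  }
}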

    Task Outputs

    This task outputs an assembled array of workflow execution ARNs for all scheduled workflow executions within the payload's running object.

    - + \ No newline at end of file diff --git a/docs/workflows/cumulus-task-message-flow/index.html b/docs/workflows/cumulus-task-message-flow/index.html index 0dca4259350..83f64247c4a 100644 --- a/docs/workflows/cumulus-task-message-flow/index.html +++ b/docs/workflows/cumulus-task-message-flow/index.html @@ -5,14 +5,14 @@ Cumulus Tasks: Message Flow | Cumulus Documentation - +
    Version: v15.0.2

    Cumulus Tasks: Message Flow

Cumulus Tasks make up Cumulus Workflows and are either AWS Lambda tasks or AWS Elastic Container Service (ECS) activities. Cumulus Tasks permit a payload as input to the main task application code. The task payload is additionally wrapped by the Cumulus Message Adapter, which supplies additional information supporting message templating and metadata management of these workflows.

    Diagram showing how incoming and outgoing Cumulus messages for workflow steps are handled by the Cumulus Message Adapter

    The steps in this flow are detailed in sections below.

    Cumulus Message Format

    A full Cumulus Message has the following keys:

    • cumulus_meta: System runtime information that should generally not be touched outside of Cumulus library code or the Cumulus Message Adapter. Stores meta information about the workflow such as the state machine name and the current workflow execution's name. This information is used to look up the current active task. The name of the current active task is used to look up the corresponding task's config in task_config.
    • meta: Runtime information captured by the workflow operators. Stores execution-agnostic variables.
    • payload: Payload is runtime information for the tasks.

    In addition to the above keys, it may contain the following keys:

• replace: A key generated in conjunction with the Cumulus Message Adapter. It contains the location on S3 of a message payload and a target JSON path in the message to extract it to.
• exception: A key used to track workflow exceptions; it should not be modified outside of Cumulus library code.

    Here's a simple example of a Cumulus Message:

{
  "task_config": {
    "inlinestr": "prefix{meta.foo}suffix",
    "array": "{[$.meta.foo]}",
    "object": "{$.meta}"
  },
  "cumulus_meta": {
    "message_source": "sfn",
    "state_machine": "arn:aws:states:us-east-1:1234:stateMachine:MySfn",
    "execution_name": "MyExecution__id-1234",
    "id": "id-1234"
  },
  "meta": {
    "foo": "bar"
  },
  "payload": {
    "anykey": "anyvalue"
  }
}

A message utilizing the Cumulus Remote Message functionality must have at least the keys replace and cumulus_meta. Depending on configuration, other portions of the message may be present; however, the cumulus_meta, meta, and payload keys must be present once extraction is complete.

    {
    "replace": {
    "Bucket": "cumulus-bucket",
    "Key": "my-large-event.json",
    "TargetPath": "$"
    },
    "cumulus_meta": {}
    }

    Cumulus Message Preparation

    The event coming into a Cumulus Task is assumed to be a Cumulus Message and should first be handled by the functions described below before being passed to the task application code.

    Preparation Step 1: Fetch remote event

    Fetch remote event will fetch the full event from S3 if the cumulus message includes a replace key.

    Once "my-large-event.json" is fetched from S3, it's returned from the fetch remote event function. If no "replace" key is present, the event passed to the fetch remote event function is assumed to be a complete Cumulus Message and returned as-is.

    Preparation Step 2: Parse step function config from CMA configuration parameters

This step determines which task is currently being executed. Note that this is different from which lambda or activity is being executed, because the same lambda or activity can be used for different tasks. The current task name is used to load the appropriate configuration from the Cumulus Message's 'task_config' configuration parameter.

    Preparation Step 3: Load nested event

    Using the config returned from the previous step, load nested event resolves templates for the final config and input to send to the task's application code.

    Task Application Code

    After message prep, the message passed to the task application code is of the form:

    {
    "input": {},
    "config": {}
    }

    Create Next Message functions

    Whatever comes out of the task application code is used to construct an outgoing Cumulus Message.

    Create Next Message Step 1: Assign outputs

The config loaded in the parse step function config step may have a cumulus_message key. This can be used to "dispatch" fields from the task's application output to a destination in the final event output (via URL templating). Here's an example where the value of input.anykey would be dispatched as the value of payload.out in the final Cumulus message:

{
  "task_config": {
    "bar": "baz",
    "cumulus_message": {
      "input": "{$.payload.input}",
      "outputs": [
        {
          "source": "{$.input.anykey}",
          "destination": "{$.payload.out}"
        }
      ]
    }
  },
  "cumulus_meta": {
    "task": "Example",
    "message_source": "local",
    "id": "id-1234"
  },
  "meta": {
    "foo": "bar"
  },
  "payload": {
    "input": {
      "anykey": "anyvalue"
    }
  }
}
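For illustration, assume the task application code simply returns the object it received, i.e. an object containing "input": {"anykey": "anyvalue"}. The assign-outputs step would then resolve the source {$.input.anykey} against that return value and write it to payload.out, so the outgoing message (trimmed here to the relevant fields; task_config and cumulus_meta are carried through unchanged) would contain:

{
  "meta": {
    "foo": "bar"
  },
  "payload": {
    "out": "anyvalue"
  }
}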

    Create Next Message Step 2: Store remote event

If the ReplaceConfig parameter is set, the configured key's value will be stored in S3, and the final output of the task will include a replace key that contains configuration for a future step to extract the payload on S3 back into the Cumulus Message. The replace key identifies where the large event node has been stored in S3.

    - + \ No newline at end of file diff --git a/docs/workflows/developing-a-cumulus-workflow/index.html b/docs/workflows/developing-a-cumulus-workflow/index.html index b4a2d8e5234..ff63981ee81 100644 --- a/docs/workflows/developing-a-cumulus-workflow/index.html +++ b/docs/workflows/developing-a-cumulus-workflow/index.html @@ -5,13 +5,13 @@ Creating a Cumulus Workflow | Cumulus Documentation - +
    Version: v15.0.2

    Creating a Cumulus Workflow

    The Cumulus workflow module

To facilitate adding workflows to your deployment, Cumulus provides a workflow module.

    In combination with the Cumulus message, the workflow module provides a way to easily turn a Step Function definition into a Cumulus workflow, complete with:

    Using the module also ensures that your workflows will continue to be compatible with future versions of Cumulus.

    For more on the full set of current available options for the module, please consult the module README.

    Adding a new Cumulus workflow to your deployment

    To add a new Cumulus workflow to your deployment that is using the cumulus module, add a new workflow resource to your deployment directory, either in a new .tf file, or to an existing file.

    The workflow should follow a syntax similar to:

    module "my_workflow" {
    source = "https://github.com/nasa/cumulus/releases/download/vx.x.x/terraform-aws-cumulus-workflow.zip"

    prefix = "my-prefix"
    name = "MyWorkflowName"
    system_bucket = "my-internal-bucket"

    workflow_config = module.cumulus.workflow_config

    tags = { Deployment = var.prefix }

    state_machine_definition = <<JSON
    {}
    JSON
    }

    In the above example, you would add your state_machine_definition using the Amazon States Language, using tasks you've developed and Cumulus core tasks that are made available as part of the cumulus terraform module.
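As a minimal sketch of such a definition (the state name is arbitrary and the Resource value is a placeholder for a Lambda ARN you would supply, for example via Terraform interpolation), a single-step state machine might look like:

{
  "Comment": "Example single-step workflow",
  "StartAt": "HelloWorld",
  "States": {
    "HelloWorld": {
      "Type": "Task",
      "Resource": "${example_lambda_arn}",
      "End": true
    }
  }
}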

    Please note: Cumulus follows the convention of tagging resources with the prefix variable { Deployment = var.prefix } that you pass to the cumulus module. For resources defined outside of Core, it's recommended that you adopt this convention as it makes resources and/or deployment recovery scenarios much easier to manage.

    Examples

    For a functional example of a basic workflow, please take a look at the hello_world_workflow.

    For more complete/advanced examples, please read the following cookbook entries/topics:

    - + \ No newline at end of file diff --git a/docs/workflows/developing-workflow-tasks/index.html b/docs/workflows/developing-workflow-tasks/index.html index 17211552df0..bcea7988ce8 100644 --- a/docs/workflows/developing-workflow-tasks/index.html +++ b/docs/workflows/developing-workflow-tasks/index.html @@ -5,13 +5,13 @@ Developing Workflow Tasks | Cumulus Documentation - +
    Version: v15.0.2

    Developing Workflow Tasks

    Workflow tasks can be either AWS Lambda Functions or ECS Activities.

    Lambda functions

    The full set of available core Lambda functions can be found in the deployed cumulus module zipfile at /tasks, as well as reference documentation here. These Lambdas can be referenced in workflows via the outputs from that module (see the cumulus-template-deploy repo for an example).

    The tasks source is located in the Cumulus repository at cumulus/tasks.

    You can also develop your own Lambda function. See the Lambda Functions page to learn more.

    ECS Activities

    ECS activities are supported via the cumulus_ecs_module available from the Cumulus release page.

    Please read the module README for configuration details.

    For assistance in creating a task definition within the module read the AWS Task Definition Docs.

    For a step-by-step example of using the cumulus_ecs_module, please see the related cookbook entry.

    Cumulus Docker Image

ECS activities require a docker image. Cumulus provides a docker image (source) for node 12.x+ lambdas on dockerhub: cumuluss/cumulus-ecs-task.

    Alternate Docker Images

    Custom docker images/runtimes are supported as are private registries. For details on configuring a private registry/image see the AWS documentation on Private Registry Authentication for Tasks.

    - + \ No newline at end of file diff --git a/docs/workflows/docker/index.html b/docs/workflows/docker/index.html index b1594e6b57f..3270b15635b 100644 --- a/docs/workflows/docker/index.html +++ b/docs/workflows/docker/index.html @@ -5,7 +5,7 @@ Dockerizing Data Processing | Cumulus Documentation - + @@ -14,7 +14,7 @@ 2) validate the output (in this case just check for existence) 3) use 'ncatted' to update the resulting file to be CF-compliant 4) write out metadata generated for this file

    Process Testing

It is important to have tests for data processing; however, in many cases data files can be large, so it is not practical to store the test data in the repository. Instead, test data is currently stored on AWS S3 and can be retrieved using the AWS CLI.

    aws s3 sync s3://cumulus-ghrc-logs/sample-data/collection-name data

    Where collection-name is the name of the data collection, such as 'avaps', or 'cpl'. For example, an abridged version of the data for CPL includes:

    ├── cpl
    │   ├── input
    │   │   ├── HS3_CPL_ATB_12203a_20120906.hdf5
    │   │   ├── HS3_CPL_OP_12203a_20120906.hdf5
    │   └── output
    │   ├── HS3_CPL_ATB_12203a_20120906.nc
    │   ├── HS3_CPL_ATB_12203a_20120906.nc.meta.xml
    │   ├── HS3_CPL_OP_12203a_20120906.nc
    │   ├── HS3_CPL_OP_12203a_20120906.nc.meta.xml

    Contained in the input directory are all possible sets of data files, while the output directory is the expected result of processing. In this case the hdf5 files are converted to NetCDF files and XML metadata files are generated.

    The docker image for a process can be used on the retrieved test data. First create a test-output directory in the newly created data directory.

    mkdir data/test-output

    Then run the docker image using docker-compose.

    docker-compose run test

This will process the data in the data/input directory and put the output into data/test-output. Repositories also include Python-based tests which will validate this newly created output against the contents of data/output. Use Python's Nose tool to run the included tests.

    nosetests

If the data/test-output directory validates against the contents of data/output, the tests will be successful; otherwise an error will be reported.

    - + \ No newline at end of file diff --git a/docs/workflows/index.html b/docs/workflows/index.html index 9b630c0a818..4c341adc682 100644 --- a/docs/workflows/index.html +++ b/docs/workflows/index.html @@ -5,13 +5,13 @@ Workflows | Cumulus Documentation - +
    Version: v15.0.2

    Workflows

Workflows are composed of one or more AWS Lambda Functions and ECS Activities to discover, ingest, process, manage, and archive data.

    Provider data ingest and GIBS have a set of common needs in getting data from a source system and into the cloud where they can be distributed to end users. These common needs are:

    • Data Discovery - Crawling, polling, or detecting changes from a variety of sources.
    • Data Transformation - Taking data files in their original format and extracting and transforming them into another desired format such as visible browse images.
    • Archival - Storage of the files in a location that's accessible to end users.

    The high level view of the architecture and many of the individual steps are the same but the details of ingesting each type of collection differs. Different collection types and different providers have different needs. The individual boxes of a workflow are not only different. The branching, error handling, and multiplicity of the arrows connecting the boxes are also different. Some need visible images rendered from component data files from multiple collections. Some need to contact the CMR with updated metadata. Some will have different retry strategies to handle availability issues with source data systems.

    AWS and other cloud vendors provide an ideal solution for parts of these problems but there needs to be a higher level solution to allow the composition of AWS components into a full featured solution. The Ingest Workflow Architecture is designed to meet the needs for Earth Science data ingest and transformation.

    Goals

    Flexibility and Composability

The steps to ingest and process data are different for each collection within a provider. Ingest should be as flexible as possible in the rearranging of steps and configuration.

    We want to use lego-like individual steps that can be composed by an operator.

    Individual steps should ...

    • Be as ignorant as possible of the overall flow. They should not be aware of previous steps.
    • Be runnable on their own.
    • Define their input and output in simple data structures.
    • Be domain agnostic.
• Not make assumptions about specifics, for example what goes into a granule.

    Scalable

The ingest architecture needs to be scalable, both to handle ingesting hundreds of millions of granules and to interpret dozens of different workflows.

    Data Provenance

    • We should have traceability for how data was produced and where it comes from.
    • Use immutable representations of data. Data once received is not overwritten. Data can be removed for cleanup.
    • All software is versioned. We can trace transformation of data by tracking the immutable source data and the versioned software applied to it.

    Operator Visibility and Control

    • Operators should be able to see and understand everything that is happening in the system.
    • It should be obvious why things are happening and straightforward to diagnose problems.
• We generally assume that the operators know best in terms of the limits on a provider's infrastructure, how often things need to be done, and details of a collection. The architecture should defer to their decisions and knowledge while providing safety nets to prevent problems.

    A Reconfigurable Workflow Architecture

    The Ingest Workflow Architecture is defined by two entity types, Workflows and Tasks. A Workflow is a set of composed Tasks to complete an objective such as ingesting a granule. Tasks are the individual steps of a Workflow that perform one job. The workflow is responsible for executing the right task based on the current state and response from the last task executed. Tasks are completely decoupled in that they don't call each other or even need to know about the presence of other tasks.

    Workflows and tasks are configured as Terraform resources, which are triggered via configured rules within Cumulus.

    Diagram showing the Step Function execution path through workflow tasks for a collection ingest

    See the Example GIBS Ingest Architecture showing how workflows and tasks are used to define the GIBS Ingest Architecture.

    Workflows

    A workflow is a provider-configured set of steps that describe the process to ingest data. Workflows are defined using AWS Step Functions.

    Benefits of AWS Step Functions

AWS Step Functions are described in detail in the AWS documentation, but they provide several benefits applicable to this architecture.

    • Prebuilt solution
    • Operations Visibility
      • Visual diagram
      • Every execution is recorded with both inputs and output for every step.
    • Composability
      • Allow composing AWS Lambdas and code running in other steps. Code can be run in EC2 to interface with it or even on premise if desired.
      • Step functions allow specifying when steps run in parallel or choices between steps based on data from the previous step.
    • Flexibility
  • Step functions are designed to make it easy to build new applications and to reconfigure them. We're exposing that flexibility directly to the provider.
    • Reliability and Error Handling
      • Step functions allow configuration of retries and adding handling of error conditions.
    • Described via data
      • This makes it easy to save the step function in configuration management solutions.
      • We can build simple interfaces on top of the flexibility provided.

    Workflow Scheduler

    The scheduler is responsible for initiating a step function and passing in the relevant data for a collection. This is currently configured as an interval for each collection. The scheduler service creates the initial event by combining the collection configuration with the AWS execution context defined via the cumulus terraform module.

    Tasks

    A workflow is composed of tasks. Each task is responsible for performing a discrete step of the ingest process. These can be activities like:

    • Crawling a provider website for new data.
    • Uploading data from a provider to S3.
    • Executing a process to transform data.

    AWS Step Functions permit tasks to be code running anywhere, even on premise. We expect most tasks will be written as Lambda functions in order to take advantage of the easy deployment, scalability, and cost benefits provided by AWS Lambda.

    • Leverages Existing Work
      • The design leverages the existing work of Amazon by defining workflows using the AWS Step Function State Language. This is the language that was created for describing the state machines used in AWS Step Functions.
    • Open for Extension
      • Both meta and task_config which are used for configuring at the collection and task levels do not dictate the fields and structure of the configuration. Additional task specific JSON schemas can be used for extending the validation of individual steps.
    • Data-centric Configuration
      • The use of a single JSON configuration file allows this to be added to a workflow. We build additional support on top of the configuration file for simpler domain specific configuration or interactive GUIs.

    For more details on Task Messages and Configuration, visit Cumulus configuration and message protocol documentation.

    Ingest Deploy

    To view deployment documentation, please see the Cumulus deployment documentation.

    Tradeoffs, and Benefits

    This section documents various tradeoffs and benefits of the Ingest Workflow Architecture.

    Tradeoffs

    Workflow execution is handled completely by AWS

This means we can't add our own code into the orchestration of the workflow. We can't add new features not supported by Step Functions. We can't do things like enforce that the responses from tasks always conform to a schema or extract the configuration for a task ahead of its execution.

If we implemented our own orchestration, we'd be able to add all of these. We save significant amounts of development effort and gain all the features of Step Functions in exchange for this trade-off. One workaround is to provide a library of common task capabilities. These would optionally be available to tasks that can be implemented with Node.js and are able to include the library.

    Workflow Configuration is specified in AWS Step Function States Language

The current design combines the states language defined by AWS with Ingest-specific configuration. This means our representation has a tight coupling with their standard. If they make backwards-incompatible changes in the future, we will have to deal with existing projects written against that.

We avoid having to develop our own standard and code to process it. The design can support new features in AWS Step Functions without needing to change the Ingest library code. It is unlikely they will make a backwards-incompatible change at this point. One mitigation for this would be writing data transformations to a new format if that were to happen.

    Collection Configuration Flexibility vs Complexity

The Collections Configuration File is very flexible but requires more knowledge of AWS Step Functions to configure. A person modifying this file directly would need to be comfortable editing a JSON file and configuring AWS Step Functions state transitions which address AWS resources.

The configuration file itself is not necessarily meant to be edited by a human directly. Since we are developing a reconfigurable, composable architecture that is specified entirely in data, additional tools can be developed on top of it. The existing recipes.json files can be mapped to this format. Operational tools like a GUI can be built that provide a usable interface for customizing workflows, but it will take time to develop these tools.

    Benefits

    This section describes benefits of the Ingest Workflow Architecture.

    Simplicity

    The concepts of Workflows and Tasks are simple ones that should make sense to providers. Additionally, the implementation will only consist of a few components because the design leverages existing services and capabilities of AWS. The Ingest implementation will only consist of some reusable task code to make task implementation easier, Ingest deployment, and the Workflow Scheduler.

    Composability

    The design aims to satisfy the needs for ingest integrating different workflows for providers. It's flexible in terms of the ability to arrange tasks to meet the needs of a collection. Providers have developed and incorporated open source tools over the years. All of these are easily integrable into the workflows as tasks.

    There is low coupling between task steps. Failures of one component don't bring the whole system down. Individual tasks can be deployed separately.

    Scalability

AWS Step Functions scale up as needed and aren't limited by a set number of servers. They also easily allow you to leverage the inherent scalability of serverless functions.

    Monitoring and Auditing

    • Every execution is captured.
    • Every task run has captured input and outputs.
    • CloudWatch Metrics can be used for monitoring many of the events with the StepFunctions. It can also generate alarms for the whole process.
    • Visual report of the entire configuration.
      • Errors and success states are highlighted visually in the flow.

    Data Provenance

    • Monitoring and auditing ensures we know the data that was given to a task.
    • Workflows are versioned and the state machines stored in AWS Step Functions are immutable. Once created they cannot change.
    • Versioning of data in S3 or using immutable records in S3 will mean we always know what data was created as the result of a step or fed into a step.

    Appendix

    Example GIBS Ingest Architecture

    This shows the GIBS Ingest Architecture as an example of the use of the Ingest Workflow Architecture.

    • The GIBS Ingest Architecture consists of two workflows per collection type. There is one for discovery and one for ingest. The final stage of discovery triggers multiple ingest workflows for each MRF granule that needs to be generated.
    • It demonstrates both lambdas as tasks and a container used for MRF generation.

    GIBS Ingest Workflows

    Diagram showing the AWS Step Function execution path for a GIBS ingest workflow

    GIBS Ingest Granules Workflow

This shows a visualization of an execution of the ingest granules workflow in step functions. The steps highlighted in green are the ones that executed and completed successfully.

    Diagram showing the AWS Step Function execution path for a GIBS ingest granules workflow

    - + \ No newline at end of file diff --git a/docs/workflows/input_output/index.html b/docs/workflows/input_output/index.html index 4a1e2a53e96..89d10e25ade 100644 --- a/docs/workflows/input_output/index.html +++ b/docs/workflows/input_output/index.html @@ -5,14 +5,14 @@ Workflow Inputs & Outputs | Cumulus Documentation - +
    Version: v15.0.2

    Workflow Inputs & Outputs

    General Structure

    Cumulus uses a common format for all inputs and outputs to workflows. The same format is used for input and output from workflow steps. The common format consists of a JSON object which holds all necessary information about the task execution and AWS environment. Tasks return objects identical in format to their input with the exception of a task-specific payload field. Tasks may also augment their execution metadata.

    Cumulus Message Adapter

    The Cumulus Message Adapter and Cumulus Message Adapter libraries help task developers integrate their tasks into a Cumulus workflow. These libraries adapt input and outputs from tasks into the Cumulus Message format. The Scheduler service creates the initial event message by combining the collection configuration, external resource configuration, workflow configuration, and deployment environment settings. The subsequent workflow messages between tasks must conform to the message schema. By using the Cumulus Message Adapter, individual task Lambda functions only receive the input and output specifically configured for the task, and not non-task-related message fields.

    The Cumulus Message Adapter libraries are called by the tasks with a callback function containing the business logic of the task as a parameter. They first adapt the incoming message to a format more easily consumable by Cumulus tasks, then invoke the task, and then adapt the task response back to the Cumulus message protocol to be sent to the next task.

    A task's Lambda function can be configured to include a Cumulus Message Adapter library which constructs input/output messages and resolves task configurations. The CMA can then be included in one of several ways:

    Lambda Layer

In order to make use of this configuration, a Lambda layer must be uploaded to your account. Due to platform restrictions, Core cannot currently support sharable public layers; however, you can deploy the appropriate version from the release page in two ways:

    Once you've deployed the layer, integrate the CMA layer with your Lambdas:

    • If using the cumulus module, set the cumulus_message_adapter_lambda_layer_version_arn in your .tfvars file to integrate the CMA layer with all core Cumulus lambdas (a minimal .tfvars sketch follows this list).
    • If including your own Lambda or ECS task Terraform modules, specify the CMA layer ARN in the Terraform resource definitions. Also, make sure to set the CUMULUS_MESSAGE_ADAPTER_DIR environment variable for the task to /opt for the CMA integration to work properly.
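
    To make the first option concrete, a minimal .tfvars sketch follows; the ARN shown is a placeholder and should be replaced with the ARN of the CMA layer version actually deployed to your account.

    # terraform.tfvars -- the layer ARN below is illustrative only
    cumulus_message_adapter_lambda_layer_version_arn = "arn:aws:lambda:us-east-1:123456789012:layer:Cumulus_Message_Adapter:1"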

    If you later wish to update or change the CMA version, you will need to update the deployed CMA and update the layer configuration for the impacted Lambdas as needed.

    Please Note: Updating/removing a layer does not change a deployed Lambda, so to update the CMA you should deploy a new version of the CMA layer, update the associated Lambda configuration to reference the new CMA version, and re-deploy your Lambdas.

    Manual Addition

    You can include the CMA package in the Lambda code itself, in a cumulus-message-adapter sub-directory of your Lambda .zip, for any Lambda runtime that includes a Python runtime. Python 2 is included in Lambda runtimes that use Amazon Linux; however, Amazon Linux 2 does not support this directly.

    Please note: it is expected that upcoming Cumulus releases will update the CMA layer to include a Python runtime.

    If you are manually adding the message adapter to your source and utilizing the CMA, you should set the Lambda's CUMULUS_MESSAGE_ADAPTER_DIR environment variable to target the installation path for the CMA.

    CMA Input/Output

    Input to the task application code is a JSON object with the following keys:

    • input: By default, the incoming payload is the payload output from the previous task, or it can be a portion of the payload as configured for the task in the corresponding .tf workflow definition file.
    • config: Task-specific configuration object with URL templates resolved.

    Output from the task application code is returned and placed in the payload key of the outgoing message by default, but the task configuration can also be used to return just a portion of the task output (see the output-resolution step below).
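
    Putting this together, a task handler receives an object like the sketch below and returns its result, which the CMA folds back into the message; the values are placeholders drawn from examples later on this page.

    {
      "input": {
        "anykey": "anyvalue"
      },
      "config": {
        "provider": {
          "id": "FOO_DAAC",
          "anykey": "anyvalue"
        }
      }
    }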

    CMA configuration

    As of Cumulus > 1.15 and CMA > v1.1.1, configuration of the CMA is expected to be driven by AWS Step Function Parameters.

    Using the CMA package with a Lambda by any of the above-mentioned methods (Lambda layer, manual addition) requires configuring its various features via a specific Step Function Parameters format (see the sample workflows in the example cumulus-tf source for more examples):

    {
      "cma": {
        "event.$": "$",
        "ReplaceConfig": "{some config}",
        "task_config": "{some config}"
      }
    }

    The "event.$": "$" parameter is required as it passes the entire incoming message to the CMA client library for parsing, and the CMA itself to convert the incoming message into a Cumulus message for use in the function.

    The following are the CMA's current configuration settings:

    ReplaceConfig (Cumulus Remote Message)

    Because of the potential size of a Cumulus message, mainly the payload field, a task can be configured to store a portion of its output on S3, with a Remote Message key that defines how to retrieve it and an empty JSON object {} left in its place. If the portion of the message targeted exceeds the configured MaxSize (which defaults to 0 bytes), it will be written to S3.

    The CMA remote message functionality can be configured using parameters in several ways:

    Partial Message

    Setting the Path/TargetPath in the ReplaceConfig parameter (and optionally a non-default MaxSize):

    {
      "DiscoverGranules": {
        "Parameters": {
          "cma": {
            "event.$": "$",
            "ReplaceConfig": {
              "MaxSize": 1,
              "Path": "$.payload",
              "TargetPath": "$.payload"
            }
          }
        }
      }
    }

    will result in any payload output larger than the MaxSize (in bytes) being written to S3. The CMA will then mark that the key has been replaced via a replace key on the event. When the CMA picks up the replace key in future steps, it will attempt to retrieve the output from S3 and write it back to payload.

    Note that you can optionally use a different TargetPath than Path; however, as the target is a JSON path, there must be a key to target for replacement in the output of that step. Also note that the JSON path specified must target one node, otherwise the CMA will error, as it does not support multiple replacement targets.

    If TargetPath is omitted, it will default to the value for Path.

    Full Message

    Setting the following parameters for a lambda:

    DiscoverGranules:
      Parameters:
        cma:
          event.$: '$'
          ReplaceConfig:
            FullMessage: true

    will result in the CMA assuming the entire inbound message should be stored to S3 if it exceeds the default max size.

    This is effectively the same as doing:

    {
      "DiscoverGranules": {
        "Parameters": {
          "cma": {
            "event.$": "$",
            "ReplaceConfig": {
              "MaxSize": 0,
              "Path": "$",
              "TargetPath": "$"
            }
          }
        }
      }
    }

    Cumulus Message example

    {
      "task_config": {
        "inlinestr": "prefix{meta.foo}suffix",
        "array": "{[$.meta.foo]}",
        "object": "{$.meta}"
      },
      "cumulus_meta": {
        "message_source": "sfn",
        "state_machine": "arn:aws:states:us-east-1:1234:stateMachine:MySfn",
        "execution_name": "MyExecution__id-1234",
        "id": "id-1234"
      },
      "meta": {
        "foo": "bar"
      },
      "payload": {
        "anykey": "anyvalue"
      }
    }

    Cumulus Remote Message example

    The message may contain a reference to an S3 Bucket, Key and TargetPath as follows:

    {
      "replace": {
        "Bucket": "cumulus-bucket",
        "Key": "my-large-event.json",
        "TargetPath": "$"
      },
      "cumulus_meta": {}
    }

    task_config

    This configuration key contains the input/output configuration values that define task inputs and outputs via URL (JSON path) templates. Important: these values are all relative to the JSON object configured for event.$.

    This configuration's behavior is outlined in the CMA step description below.

    The configuration should follow the format:

    {
      "FunctionName": {
        "Parameters": {
          "cma": {
            "event.$": "$",
            "other_cma_configuration": "<config object>",
            "task_config": "<task config>"
          }
        }
      }
    }

    Example:

    {
      "StepFunction": {
        "Parameters": {
          "cma": {
            "event.$": "$",
            "task_config": {
              "sfnEnd": true,
              "stack": "{$.meta.stack}",
              "bucket": "{$.meta.buckets.internal.name}",
              "stateMachine": "{$.cumulus_meta.state_machine}",
              "executionName": "{$.cumulus_meta.execution_name}",
              "cumulus_message": {
                "input": "{$}"
              }
            }
          }
        }
      }
    }

    Cumulus Message Adapter Steps

    1. Reformat AWS Step Function message into Cumulus Message

    Due to the way AWS handles Parameterized messages, when Parameters are used the CMA takes an inbound message:

    {
      "resource": "arn:aws:lambda:us-east-1:<lambda arn values>",
      "input": {
        "Other Parameter": {},
        "cma": {
          "ConfigKey": {
            "config values": "some config values"
          },
          "event": {
            "cumulus_meta": {},
            "payload": {},
            "meta": {},
            "exception": {}
          }
        }
      }
    }

    and takes the following actions:

    • Takes the object at input.cma.event and makes it the full input
    • Merges all of the keys except event under input.cma into the parent input object

    This results in the incoming message (presumably a Cumulus message), with any cma configuration parameters merged in, being passed to the CMA. All other parameterized values defined outside of the cma key are ignored.
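
    Applied to the inbound message above, those two actions would leave the CMA working with roughly the following object (only the keys present in the example are shown):

    {
      "ConfigKey": {
        "config values": "some config values"
      },
      "cumulus_meta": {},
      "payload": {},
      "meta": {},
      "exception": {}
    }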

    2. Resolve Remote Messages

    If the incoming Cumulus message has a replace key value, the CMA will attempt to pull the payload from S3.

    For example, if the incoming message contains the following:

      "meta": {
    "foo": {}
    },
    "replace": {
    "TargetPath": "$.meta.foo",
    "Bucket": "some_bucket",
    "Key": "events/some-event-id"
    }

    The CMA will attempt to pull the file stored at Bucket/Key and replace the value at TargetPath, then remove the replace object entirely and continue.
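
    For instance, if the object stored at some_bucket/events/some-event-id contained {"foo_count": 1000} (an assumed value purely for illustration), the message after this step would be:

    {
      "meta": {
        "foo": {
          "foo_count": 1000
        }
      }
    }

    Note that the replace key itself is gone, so downstream steps see an ordinary Cumulus message.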

    3. Resolve URL templates in the task configuration

    In the workflow configuration (defined under the task_config key), each task has its own configuration, and it can use a URL template as a value to achieve simplicity or to reference values only available at execution time. The Cumulus Message Adapter resolves the URL templates (relative to the event configuration key) and then passes the message to the next task. For example, given a task which has the following configuration:

    {
      "Parameters": {
        "cma": {
          "event.$": "$",
          "task_config": {
            "provider": "{$.meta.provider}",
            "inlinestr": "prefix{meta.foo}suffix",
            "array": "{[$.meta.foo]}",
            "object": "{$.meta}"
          }
        }
      }
    }

    and an incoming message that contains:

    {
      "meta": {
        "foo": "bar",
        "provider": {
          "id": "FOO_DAAC",
          "anykey": "anyvalue"
        }
      }
    }

    The corresponding Cumulus Message would contain:

    "meta": {
    "foo": "bar",
    "provider": {
    "id": "FOO_DAAC",
    "anykey": "anyvalue"
    }
    },
    "task_config": {
    "provider": "{$.meta.provider}",
    "inlinestr": "prefix{meta.foo}suffix",
    "array": "{[$.meta.foo]}",
    "object": "{$.meta}"
    }

    The message sent to the task would be:

    "config" : {
    "provider": {
    "id": "FOO_DAAC",
    "anykey": "anyvalue"
    },
    "inlinestr": "prefixbarsuffix",
    "array": ["bar"],
    "object": {
    "foo": "bar",
    "provider": {
    "id": "FOO_DAAC",
    "anykey": "anyvalue"
    }
    },
    },
    "input": "{...}"

    URL template variables replace dotted paths inside curly brackets with their corresponding value. If the Cumulus Message Adapter cannot resolve a value, it will ignore the template, leaving it verbatim in the string. While seemingly complex, this allows significant decoupling of Tasks from one another and the data that drives them. Tasks are able to easily receive runtime configuration produced by previously run tasks and domain data.
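
    As a small illustration of that fallback behavior, assume a task_config entry references a path that is not present in the message; the $.meta.missing path below is hypothetical:

    {
      "Parameters": {
        "cma": {
          "event.$": "$",
          "task_config": {
            "maybe": "{$.meta.missing}"
          }
        }
      }
    }

    If the incoming message has no meta.missing key, the task's config would still contain the literal string "{$.meta.missing}" rather than a resolved value.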

    4. Resolve task input

    By default, the incoming payload is the payload from the previous task. The task can also be configured to use a portion of the payload as its input message. For example, given that a task specifies cma.task_config.cumulus_message.input:

    ExampleTask:
      Parameters:
        cma:
          event.$: '$'
          task_config:
            cumulus_message:
              input: '{$.payload.foo}'

    The task configuration in the message would be:

    {
      "task_config": {
        "cumulus_message": {
          "input": "{$.payload.foo}"
        }
      },
      "payload": {
        "foo": {
          "anykey": "anyvalue"
        }
      }
    }

    The Cumulus Message Adapter will resolve the task input; instead of sending the whole payload as task input, the task input would be:

    {
      "input": {
        "anykey": "anyvalue"
      },
      "config": {...}
    }

    5. Resolve task output

    By default, the task's return value becomes the next payload. However, the workflow task configuration can specify that only a portion of the return value becomes the next payload, and it can also map values into other fields. Based on the task configuration under cma.task_config.cumulus_message.outputs, the Message Adapter uses the task's return value to build the outgoing message. The Message Adapter dispatches a "source" to a "destination" as defined by URL templates stored in the task-specific cumulus_message.outputs: the value of the task's return value at the "source" path is used to create or replace the value at the "destination" path of the outgoing message. For example, given a task that specifies cumulus_message.outputs in its workflow configuration as follows:

    {
      "ExampleTask": {
        "Parameters": {
          "cma": {
            "event.$": "$",
            "task_config": {
              "cumulus_message": {
                "outputs": [
                  {
                    "source": "{$}",
                    "destination": "{$.payload}"
                  },
                  {
                    "source": "{$.output.anykey}",
                    "destination": "{$.meta.baz}"
                  }
                ]
              }
            }
          }
        }
      }
    }

    The corresponding Cumulus Message would be:

    {
      "task_config": {
        "cumulus_message": {
          "outputs": [
            {
              "source": "{$}",
              "destination": "{$.payload}"
            },
            {
              "source": "{$.output.anykey}",
              "destination": "{$.meta.baz}"
            }
          ]
        }
      },
      "meta": {
        "foo": "bar"
      },
      "payload": {
        "anykey": "anyvalue"
      }
    }

    Given the response from the task is:

    {
      "output": {
        "anykey": "boo"
      }
    }

    The Cumulus Message Adapter would output the following Cumulus Message:

    {
      "task_config": {
        "cumulus_message": {
          "outputs": [
            {
              "source": "{$}",
              "destination": "{$.payload}"
            },
            {
              "source": "{$.output.anykey}",
              "destination": "{$.meta.baz}"
            }
          ]
        }
      },
      "meta": {
        "foo": "bar",
        "baz": "boo"
      },
      "payload": {
        "output": {
          "anykey": "boo"
        }
      }
    }

    6. Apply Remote Message Configuration

    If the ReplaceConfig configuration parameter is defined, the CMA will evaluate the configuration options provided and, if required, write a portion of the Cumulus Message to S3 and add a replace key to the message for future steps to utilize.

    Please note: the non-user-modifiable field cumulus_meta will always be retained, regardless of the configuration.

    For example, if the output message (after the output configuration has been applied) looks like:

    {
      "cumulus_meta": {
        "some_key": "some_value"
      },
      "ReplaceConfig": {
        "FullMessage": true
      },
      "task_config": {
        "cumulus_message": {
          "outputs": [
            {
              "source": "{$}",
              "destination": "{$.payload}"
            },
            {
              "source": "{$.output.anykey}",
              "destination": "{$.meta.baz}"
            }
          ]
        }
      },
      "meta": {
        "foo": "bar",
        "baz": "boo"
      },
      "payload": {
        "output": {
          "anykey": "boo"
        }
      }
    }

    the resultant output would look like:

    {
      "cumulus_meta": {
        "some_key": "some_value"
      },
      "replace": {
        "TargetPath": "$",
        "Bucket": "some-internal-bucket",
        "Key": "events/some-event-id"
      }
    }

    Additional features

    Validate task input, output and configuration messages against the schemas provided

    The Cumulus Message Adapter has the capability to validate task input, output and configuration messages against their schemas. The default location of the schemas is the schemas folder in the top level of the task and the default filenames are input.json, output.json, and config.json. The task can also configure a different schema location. If no schema can be found, the Cumulus Message Adapter will not validate the messages.
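
    For example, a task that wants its input validated might ship a schemas/input.json such as the following sketch; the required field names here are hypothetical and would reflect your task's actual contract.

    {
      "title": "ExampleTaskInput",
      "type": "object",
      "required": ["granules"],
      "properties": {
        "granules": {
          "type": "array",
          "items": { "type": "object" }
        }
      }
    }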

    Version: v15.0.2

    Develop Lambda Functions

    Develop a new Cumulus Lambda

    AWS provides a great getting started guide for building Lambdas in its developer guide.

    Cumulus currently supports the following environments for Cumulus Message Adapter enabled functions: Node.js, Java, and Python (see the language-specific sections below).

    Additionally, you may choose to include any of the other languages AWS supports as a resource, with reduced feature support.

    Deploy a Lambda

    Node.js Lambda

    For a new Node.js Lambda, create a new function and add an aws_lambda_function resource to your Cumulus deployment (for examples, see example/lambdas.tf and ingest/lambda-functions.tf in the Cumulus source), either as a new .tf file or added to an existing .tf file:

    resource "aws_lambda_function" "myfunction" {
    function_name = "${var.prefix}-function"
    filename = "/path/to/zip/lambda.zip"
    source_code_hash = filebase64sha256("/path/to/zip/lambda.zip")
    handler = "index.handler"
    role = module.cumulus.lambda_processing_role_arn
    runtime = "nodejs10.x"

    vpc_config {
    subnet_ids = var.subnet_ids
    security_group_ids = var.security_group_ids
    }
    }

    Please note: This example contains the minimum set of required configuration.

    Make sure to include a vpc_config that matches the information you've provided to the cumulus module if you intend to integrate the Lambda with a Cumulus deployment.

    Java Lambda

    Java Lambdas are created in much the same way as the Node.js example above.

    The source points to a folder with the compiled .class files and dependency libraries laid out in the Lambda Java zip folder structure (see the AWS Lambda documentation for details), not an uber-jar.

    The deploy folder referenced here would contain a folder 'test_task/task/' which contains Task.class and TaskLogic.class as well as a lib folder containing dependency jars.
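
    A hedged Terraform sketch for such a Java Lambda is shown below; the zip path, runtime version, and handler method name are assumptions for illustration (the handler follows AWS's package.Class::method convention and must match your compiled classes).

    resource "aws_lambda_function" "my_java_task" {
      function_name    = "${var.prefix}-java-task"
      # The zip contains the deploy folder contents described above (classes plus lib/).
      filename         = "/path/to/zip/deploy.zip"
      source_code_hash = filebase64sha256("/path/to/zip/deploy.zip")
      handler          = "test_task.task.Task::handleRequest" # hypothetical handler method
      role             = module.cumulus.lambda_processing_role_arn
      runtime          = "java11" # assumed runtime version

      vpc_config {
        subnet_ids         = var.subnet_ids
        security_group_ids = var.security_group_ids
      }
    }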

    Python Lambda

    Python Lambdas are created the same way as the Node.js example above.

    Cumulus Message Adapter

    For Lambdas wishing to utilize the Cumulus Message Adapter (CMA), you should define a layers key on your Lambda resource with the CMA version you wish to include, as sketched below. See the Workflow Inputs & Outputs documentation for more on how to create and use the CMA.
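
    As a sketch, extending the Node.js example above to attach the CMA layer might look like the following; the layer ARN variable name and the /opt value for CUMULUS_MESSAGE_ADAPTER_DIR follow the layer-integration notes in the Workflow Inputs & Outputs documentation.

    resource "aws_lambda_function" "my_cma_task" {
      function_name    = "${var.prefix}-cma-task"
      filename         = "/path/to/zip/lambda.zip"
      source_code_hash = filebase64sha256("/path/to/zip/lambda.zip")
      handler          = "index.handler"
      role             = module.cumulus.lambda_processing_role_arn
      runtime          = "nodejs10.x"

      # Attach the deployed CMA layer and point the adapter at its mount path.
      layers = [var.cumulus_message_adapter_lambda_layer_version_arn]

      environment {
        variables = {
          CUMULUS_MESSAGE_ADAPTER_DIR = "/opt"
        }
      }

      vpc_config {
        subnet_ids         = var.subnet_ids
        security_group_ids = var.security_group_ids
      }
    }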

    Other Lambda Options

    Cumulus supports all of the options available to you via the aws_lambda_function Terraform resource. For more information on what's available, check out the Terraform resource docs.

    Cloudwatch log groups

    If you want to enable CloudWatch logging for your Lambda resource, you'll need to add an aws_cloudwatch_log_group resource to your Lambda definition:

    resource "aws_cloudwatch_log_group" "myfunction_log_group" {
    name = "/aws/lambda/${aws_lambda_function.myfunction.function_name}"
    retention_in_days = 30
    tags = { Deployment = var.prefix }
    }
    Version: v15.0.2

    Workflow Protocol

    Configuration and Message Use Diagram

    A diagram showing at which point in a workflow the Cumulus message is checked for conformity with the message schema and where the configuration is checked for conformity with the configuration schema

    • Configuration - The Cumulus workflow configuration defines everything needed to describe an instance of Cumulus.
    • Scheduler - This starts ingest of a collection on configured intervals.
    • Input to Step Functions - The Scheduler uses the Configuration as source data to construct the input to the Workflow.
    • AWS Step Functions - Run the workflows as kicked off by the scheduler or other processes.
    • Input to Task - The input for each task is a JSON document that conforms to the message schema.
    • Output from Task - The output of each task must conform to the message schemas as well and is used as the input for the subsequent task.
    Workflow Configuration How To's

    To take a subset of any given metadata, use the option substring.

    "url_path": "{cmrMetadata.Granule.Collection.ShortName}/{substring(file.fileName, 0, 3)}"

    This example will populate to "MOD09GQ/MOD"

    In addition to substring, several datetime-specific functions are available, which can parse a datetime string in the metadata and extract a certain part of it:

    "url_path": "{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}"

    or

     "url_path": "{dateFormat(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime, YYYY-MM-DD[T]HH[:]mm[:]ss)}"

    The following functions are implemented:

    • extractYear - returns the year, formatted as YYYY
    • extractMonth - returns the month, formatted as MM
    • extractDate - returns the day of the month, formatted as DD
    • extractHour - returns the hour in 24-hour format, with no leading zero
    • dateFormat - takes a second argument describing how to format the date, and passes the metadata date string and the format argument to moment().format()

    Note: the move-granules step needs to be in the workflow for this template to be populated and the file moved. The cmrMetadata, i.e. the CMR granule XML, needs to have been generated and stored on S3; from there, any field can be retrieved and used for a url_path.

    Adding Metadata dates and times to the URL Path

    There are a number of options to pull dates from the CMR file metadata. With this metadata:

    <Granule>
      <Temporal>
        <RangeDateTime>
          <BeginningDateTime>2003-02-19T00:00:00Z</BeginningDateTime>
          <EndingDateTime>2003-02-19T23:59:59Z</EndingDateTime>
        </RangeDateTime>
      </Temporal>
    </Granule>

    The following examples of url_path could be used.

    {extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the year from the full date: 2003.

    {extractMonth(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the month: 2.

    {extractDate(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the day: 19.

    {extractHour(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)} will pull the hour: 0.

    Different values can be combined to create the url_path. For example:

    {
      "bucket": "sample-protected-bucket",
      "name": "MOD09GQ.A2017025.h21v00.006.2017034065104.hdf",
      "url_path": "{cmrMetadata.Granule.Collection.ShortName}/{extractYear(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}/{extractDate(cmrMetadata.Granule.Temporal.RangeDateTime.BeginningDateTime)}"
    }

    The final file location for the above would be s3://sample-protected-bucket/MOD09GQ/2003/19/MOD09GQ.A2017025.h21v00.006.2017034065104.hdf.

    Version: v15.0.2

    Workflow Triggers

    For a workflow to run, it needs to be associated with a rule (see rule configuration). The rule configuration determines how and when a workflow execution is triggered. Rules can be triggered one time, on a schedule, by new data written to a Kinesis stream, or by a message published to an SNS topic (as described below).
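
    For illustration, a scheduled rule might look roughly like the sketch below; the field names follow the rule configuration referenced above, while the specific values are assumptions.

    {
      "name": "nightly_discover",
      "workflow": "DiscoverGranulesWorkflow",
      "provider": "FOO_DAAC",
      "collection": {
        "name": "MOD09GQ",
        "version": "006"
      },
      "rule": {
        "type": "scheduled",
        "value": "cron(0 6 * * ? *)"
      },
      "state": "ENABLED"
    }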

    There are three lambda functions in the API package responsible for scheduling and starting workflows: SF scheduler, message consumer, and SF starter. Each Cumulus instance comes with a Start SF SQS queue.

    The SF scheduler lambda puts a message onto the Start SF queue. This message is picked up by the Start SF lambda, and an execution is started with the body of the message as the input.

    When a one-time rule is created, the schedule SF lambda is triggered. Rules that are not one-time are associated with a CloudWatch event, which manages triggering the lambdas that start the workflows.

    For a scheduled rule, the Cloudwatch event is triggered on the given schedule which calls directly to the schedule SF lambda.

    For a kinesis rule, when data is added to the kinesis stream, the Cloudwatch event is triggered, which calls the message consumer lambda. The message consumer lambda parses the kinesis message and finds all of the rules associated with that message. For each rule (which corresponds to one workflow), the schedule SF lambda is triggered to queue a message to start the workflow.

    For an SNS rule, when a message is published to the SNS topic, the message consumer receives the SNS message (JSON expected), parses it into an object, starts a new execution of the workflow associated with the rule, and passes the object in the payload field of the Cumulus message.

    Diagram showing how workflows are scheduled via rules

    - + \ No newline at end of file diff --git a/index.html b/index.html index d8c8469f0d4..427447a8ada 100644 --- a/index.html +++ b/index.html @@ -5,13 +5,13 @@ Cumulus Documentation - +
    - + \ No newline at end of file diff --git a/search/index.html b/search/index.html index b41f10dbe76..3e7ab07541d 100644 --- a/search/index.html +++ b/search/index.html @@ -5,13 +5,13 @@ Search the documentation | Cumulus Documentation - + - + \ No newline at end of file diff --git a/versions/index.html b/versions/index.html index 6f1b6a0177e..e2cbdbe0577 100644 --- a/versions/index.html +++ b/versions/index.html @@ -5,13 +5,13 @@ Versions | Cumulus Documentation - + - + \ No newline at end of file