job_template_launch/main.py

# Copyright 2017 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
The app for the 'frontend' service, which handles cron job requests to
fetch tweets and store them in the Datastore.
"""

import datetime
import logging
import os

from google.appengine.ext import ndb
import twitter
import webapp2

from googleapiclient.discovery import build
from oauth2client.client import GoogleCredentials


class Tweet(ndb.Model):
  """Define the Tweet model."""
  user = ndb.StringProperty()
  text = ndb.StringProperty()
  created_at = ndb.DateTimeProperty()
  tid = ndb.IntegerProperty()
  urls = ndb.StringProperty(repeated=True)


class LaunchJob(webapp2.RequestHandler):
  """Launch the Dataflow pipeline using a job template."""

  def get(self):
    is_cron = self.request.headers.get('X-Appengine-Cron', False)
    # logging.info("is_cron is %s", is_cron)
    # Comment out the following check to allow non-cron-initiated requests.
    if not is_cron:
      return 'Blocked.'
    # These env vars are set in app.yaml.
    PROJECT = os.environ['PROJECT']
    BUCKET = os.environ['BUCKET']
    TEMPLATE = os.environ['TEMPLATE_NAME']

    # Because we're using the same job name each time, if you try to launch one
    # job while another is still running, the second will fail.
    JOBNAME = PROJECT + '-twproc-template'

    credentials = GoogleCredentials.get_application_default()
    service = build('dataflow', 'v1b3', credentials=credentials)

    BODY = {
            "jobName": "{jobname}".format(jobname=JOBNAME),
            "gcsPath": "gs://{bucket}/templates/{template}".format(
                bucket=BUCKET, template=TEMPLATE),
            "parameters": {"timestamp": str(datetime.datetime.utcnow())},
             "environment": {
                "tempLocation": "gs://{bucket}/temp".format(bucket=BUCKET),
                "zone": "us-central1-f"
             }
        }

    dfrequest = service.projects().templates().create(
        projectId=PROJECT, body=BODY)
    dfresponse = dfrequest.execute()
    logging.info(dfresponse)
    self.response.write('Done')


class FetchTweets(webapp2.RequestHandler):
  """Fetch home timeline tweets from the given twitter account."""

  def get(self):

    # set up the twitter client. These env vars are set in app.yaml.
    consumer_key = os.environ['CONSUMER_KEY']
    consumer_secret = os.environ['CONSUMER_SECRET']
    access_token = os.environ['ACCESS_TOKEN']
    access_token_secret = os.environ['ACCESS_TOKEN_SECRET']

    api = twitter.Api(consumer_key=consumer_key,
                      consumer_secret=consumer_secret,
                      access_token_key=access_token,
                      access_token_secret=access_token_secret)

    last_id = None
    public_tweets = None

    # see if we can get the id of the most recent tweet stored.
    tweet_entities = ndb.gql('select * from Tweet order by tid desc limit 1')
    last_id = None
    for te in tweet_entities:
      last_id = te.tid
      break
    if last_id:
      logging.info("last id is: %s", last_id)

    public_tweets = []
    # grab tweets from the home timeline of the auth'd account.
    try:
      if last_id:
        public_tweets = api.GetHomeTimeline(count=200, since_id=last_id)
      else:
        public_tweets = api.GetHomeTimeline(count=20)
        logging.warning("Could not get last tweet id from datastore.")
    except Exception as e:
      logging.warning("Error getting tweets: %s", e)

    # store the retrieved tweets in the datastore
    logging.info("got %s tweets", len(public_tweets))
    for tweet in public_tweets:
      tw = Tweet()
      # logging.info("text: %s, %s", tweet.text, tweet.user.screen_name)
      tw.text = tweet.text
      tw.user = tweet.user.screen_name
      tw.created_at = datetime.datetime.strptime(
          tweet.created_at, "%a %b %d %H:%M:%S +0000 %Y")
      tw.tid = tweet.id
      urls = tweet.urls
      urllist = []
      for u in urls:
        urllist.append(u.expanded_url)
      tw.urls = urllist
      tw.key = ndb.Key(Tweet, tweet.id)
      tw.put()

    self.response.write('Done')


class MainPage(webapp2.RequestHandler):
  def get(self):
    self.response.write('nothing to see.')


app = webapp2.WSGIApplication(
    [('/', MainPage), ('/timeline', FetchTweets),
     ('/launchtemplatejob', LaunchJob)],
    debug=True)