diff --git a/docs/bigquery-usage.rst b/docs/bigquery-usage.rst
index 87b948f7eba9..9bf03710f8ea 100644
--- a/docs/bigquery-usage.rst
+++ b/docs/bigquery-usage.rst
@@ -236,3 +236,196 @@ Delete a table:
    >>> dataset = client.dataset('dataset_name')
    >>> table = dataset.table(name='person_ages')
    >>> table.delete() # API request
+
+Jobs
+----
+
+Jobs describe actions performed on data in BigQuery tables:
+
+- Load data into a table
+- Run a query against data in one or more tables
+- Extract data from a table
+- Copy a table
+
+List jobs for a project:
+
+.. doctest::
+
+   >>> from gcloud import bigquery
+   >>> client = bigquery.Client()
+   >>> jobs = client.jobs() # API request
+   >>> [(job.job_id, job.type, job.created, job.state) for job in jobs]
+   [('e3344fba-09df-4ae0-8337-fddee34b3840', 'insert', datetime.datetime(2015, 7, 23, 9, 30, 20, 268260, tzinfo=<UTC>), 'done')]
+
+Querying data (synchronous)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Run a query which can be expected to complete within bounded time:
+
+.. doctest::
+
+   >>> import time
+   >>> from gcloud import bigquery
+   >>> client = bigquery.Client()
+   >>> query = """\
+   SELECT count(*) AS age_count FROM dataset_name.person_ages
+   """
+   >>> results = client.query(query, timeout_ms=1000)
+   >>> retry_count = 100
+   >>> while retry_count > 0 and not results.job_complete:
+   ...     retry_count -= 1
+   ...     time.sleep(10)
+   ...     results.reload() # API request
+   >>> results.schema
+   [{'name': 'age_count', 'type': 'integer', 'mode': 'nullable'}]
+   >>> results.rows
+   [(15,)]
+
+.. note::
+
+   If the query takes longer than the allowed timeout, ``results.job_complete``
+   will be ``False``; we therefore poll until the query completes.
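+
+The polling pattern above can be factored into a small helper.  The sketch
+below is illustrative only and not part of the ``gcloud.bigquery`` API; the
+``poll_query_results`` name and its parameters are assumptions, and it uses
+only the ``job_complete``, ``reload()`` and ``rows`` members shown above:
+
+.. doctest::
+
+   >>> import time
+   >>> def poll_query_results(results, retries=100, interval=10):
+   ...     # Illustrative helper, not part of gcloud.bigquery: re-check the
+   ...     # query results until the job completes or retries are exhausted.
+   ...     while retries > 0 and not results.job_complete:
+   ...         retries -= 1
+   ...         time.sleep(interval)
+   ...         results.reload() # API request
+   >>> results = client.query(query, timeout_ms=1000)
+   >>> poll_query_results(results)
+   >>> results.rows
+   [(15,)]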
+
+Querying data (asynchronous)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Background a query, loading the results into a table:
+
+.. doctest::
+
+   >>> from gcloud import bigquery
+   >>> client = bigquery.Client()
+   >>> query = """\
+   SELECT first_name + ' ' + last_name AS full_name,
+      FLOOR(DATEDIFF(CURRENT_DATE(), birth_date) / 365) AS age
+   FROM dataset_name.persons
+   """
+   >>> dataset = client.dataset('dataset_name')
+   >>> table = dataset.table(name='person_ages')
+   >>> job = client.query_async(query,
+   ...                          destination=table,
+   ...                          write_disposition='truncate')
+   >>> job.job_id
+   'e3344fba-09df-4ae0-8337-fddee34b3840'
+   >>> job.type
+   'query'
+   >>> job.created
+   None
+   >>> job.state
+   None
+
+.. note::
+
+   - ``gcloud.bigquery`` generates a UUID for each job.
+   - The ``created`` and ``state`` fields are not set until the job
+     is submitted to the BigQuery back-end.
+
+Then, begin executing the job on the server:
+
+.. doctest::
+
+   >>> job.submit() # API call
+   >>> job.created
+   datetime.datetime(2015, 7, 23, 9, 30, 20, 268260, tzinfo=<UTC>)
+   >>> job.state
+   'running'
+
+Poll until the job is complete:
+
+.. doctest::
+
+   >>> import time
+   >>> retry_count = 100
+   >>> while retry_count > 0 and job.state == 'running':
+   ...     retry_count -= 1
+   ...     time.sleep(10)
+   ...     job.reload() # API call
+   >>> job.state
+   'done'
+   >>> job.ended
+   datetime.datetime(2015, 7, 23, 9, 30, 21, 334792, tzinfo=<UTC>)
+
+Inserting data (synchronous)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Load data synchronously from a local CSV file into a new table:
+
+.. doctest::
+
+   >>> from gcloud import bigquery
+   >>> client = bigquery.Client()
+   >>> dataset = client.dataset('dataset_name')
+   >>> table = dataset.table(name='person_ages')
+   >>> with open('/path/to/person_ages.csv', 'rb') as file_obj:
+   ...     job = table.load_from_file(
+   ...         file_obj,
+   ...         source_format='CSV',
+   ...         skip_leading_rows=1,
+   ...         write_disposition='truncate',
+   ...         ) # API request
+   >>> job.job_id
+   'e3344fba-09df-4ae0-8337-fddee34b3840'
+   >>> job.type
+   'load'
+   >>> job.created
+   datetime.datetime(2015, 7, 23, 9, 30, 20, 268260, tzinfo=<UTC>)
+   >>> job.state
+   'done'
+   >>> job.ended
+   datetime.datetime(2015, 7, 23, 9, 30, 21, 334792, tzinfo=<UTC>)
+
+Inserting data (asynchronous)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Start a job loading data asynchronously from a set of CSV files, located on
+Google Cloud Storage, appending rows into an existing table.  First, create
+the job locally:
+
+.. doctest::
+
+   >>> from gcloud import bigquery
+   >>> client = bigquery.Client()
+   >>> dataset = client.dataset('dataset_name')
+   >>> table = dataset.table(name='person_ages')
+   >>> job = table.load_from_storage(bucket_name='bucket-name',
+   ...                               object_name_glob='object-prefix*',
+   ...                               source_format='CSV',
+   ...                               skip_leading_rows=1,
+   ...                               write_disposition='truncate')
+   >>> job.job_id
+   'e3344fba-09df-4ae0-8337-fddee34b3840'
+   >>> job.type
+   'load'
+   >>> job.created
+   None
+   >>> job.state
+   None
+
+.. note::
+
+   - ``gcloud.bigquery`` generates a UUID for each job.
+   - The ``created`` and ``state`` fields are not set until the job
+     is submitted to the BigQuery back-end.
+
+Then, begin executing the job on the server:
+
+.. doctest::
+
+   >>> job.submit() # API call
+   >>> job.created
+   datetime.datetime(2015, 7, 23, 9, 30, 20, 268260, tzinfo=<UTC>)
+   >>> job.state
+   'running'
+
+Poll until the job is complete:
+
+.. doctest::
+
+   >>> import time
+   >>> retry_count = 100
+   >>> while retry_count > 0 and job.state == 'running':
+   ...     retry_count -= 1
+   ...     time.sleep(10)
+   ...     job.reload() # API call
+   >>> job.state
+   'done'
+   >>> job.ended
+   datetime.datetime(2015, 7, 23, 9, 30, 21, 334792, tzinfo=<UTC>)
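+
+The submit-and-poll pattern used by both asynchronous examples can likewise be
+wrapped in a helper.  The sketch below is illustrative only and not part of the
+``gcloud.bigquery`` API; the ``run_job`` name and its parameters are
+assumptions, and it relies solely on the ``submit()``, ``reload()`` and
+``state`` members shown above:
+
+.. doctest::
+
+   >>> import time
+   >>> def run_job(job, retries=100, interval=10):
+   ...     # Illustrative helper, not part of gcloud.bigquery: submit the job
+   ...     # and poll until it leaves the 'running' state or retries run out.
+   ...     job.submit() # API call
+   ...     while retries > 0 and job.state == 'running':
+   ...         retries -= 1
+   ...         time.sleep(interval)
+   ...         job.reload() # API call
+   ...     return job.state
+   >>> run_job(job)
+   'done'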