import platform
import re
import subprocess
- import sys
import time
- from typing import Any, Callable, Dict, Iterable, Optional
+ from typing import Any, Callable, Dict, Iterable, Optional, Set
import uuid

import pytest

TIMEOUT_SEC = 30 * 60  # 30 minutes in seconds
POLL_INTERVAL_SEC = 60  # 1 minute in seconds
+ LIST_PAGE_SIZE = 100

HYPHEN_NAME_RE = re.compile(r"[^\w\d-]+")
UNDERSCORE_NAME_RE = re.compile(r"[^\w\d_]+")
@@ -55,6 +55,18 @@ def hyphen_name(name: str) -> str:
    def underscore_name(name: str) -> str:
        return UNDERSCORE_NAME_RE.sub("_", Utils.hyphen_name(name))

+     @staticmethod
+     def wait_until(
+         is_done: Callable[[], bool],
+         timeout_sec: int = TIMEOUT_SEC,
+         poll_interval_sec: int = POLL_INTERVAL_SEC,
+     ) -> bool:
+         for _ in range(0, timeout_sec, poll_interval_sec):
+             if is_done():
+                 return True
+             time.sleep(poll_interval_sec)
+         return False
+
    @staticmethod
    def storage_bucket(name: str) -> str:
        from google.cloud import storage
@@ -84,25 +96,40 @@ def bigquery_dataset(name: str, project: str = PROJECT) -> str:

        bigquery_client = bigquery.Client()

+         dataset_name = Utils.underscore_name(name)
        dataset = bigquery_client.create_dataset(
-             bigquery.Dataset(f"{project}.{Utils.underscore_name(name)}")
+             bigquery.Dataset(f"{project}.{dataset_name}")
        )

        logging.info(f"Created bigquery_dataset: {dataset.full_dataset_id}")
-         yield dataset.full_dataset_id
+         yield dataset_name

        bigquery_client.delete_dataset(
-             dataset.full_dataset_id.replace(":", "."), delete_contents=True
+             f"{project}.{dataset_name}", delete_contents=True
        )
        logging.info(f"Deleted bigquery_dataset: {dataset.full_dataset_id}")

    @staticmethod
-     def bigquery_query(query: str) -> Iterable[Dict[str, Any]]:
+     def bigquery_table_exists(
+         dataset_name: str, table_name: str, project: str = PROJECT
+     ) -> bool:
+         from google.cloud import bigquery
+         from google.cloud.exceptions import NotFound
+
+         bigquery_client = bigquery.Client()
+         try:
+             bigquery_client.get_table(f"{project}.{dataset_name}.{table_name}")
+             return True
+         except NotFound:
+             return False
+
+     @staticmethod
+     def bigquery_query(query: str, region: str = REGION) -> Iterable[Dict[str, Any]]:
        from google.cloud import bigquery

        bigquery_client = bigquery.Client()
        logging.info(f"Bigquery query: {query}")
-         for row in bigquery_client.query(query):
+         for row in bigquery_client.query(query, location=region):
            yield dict(row)

    @staticmethod
@@ -122,7 +149,7 @@ def pubsub_topic(name: str, project: str = PROJECT) -> str:
        # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492
        cmd = ["gcloud", "pubsub", "--project", project, "topics", "delete", topic.name]
        logging.info(f"{cmd}")
-         subprocess.run(cmd, check=True)
+         subprocess.check_call(cmd)
        logging.info(f"Deleted pubsub_topic: {topic.name}")

    @staticmethod
@@ -156,7 +183,7 @@ def pubsub_subscription(
            subscription.name,
        ]
        logging.info(f"{cmd}")
-         subprocess.run(cmd, check=True)
+         subprocess.check_call(cmd)
        logging.info(f"Deleted pubsub_subscription: {subscription.name}")

    @staticmethod
@@ -207,7 +234,7 @@ def cloud_build_submit(
        """Sends a Cloud Build job, if an image_name is provided it will be deleted at teardown."""
        cmd = ["gcloud", "auth", "configure-docker"]
        logging.info(f"{cmd}")
-         subprocess.run(cmd, check=True)
+         subprocess.check_call(cmd)

        if substitutions:
            cmd_substitutions = [
@@ -229,7 +256,7 @@ def cloud_build_submit(
                    source,
                ]
                logging.info(f"{cmd}")
-                 subprocess.run(cmd, check=True)
+                 subprocess.check_call(cmd)
                logging.info(f"Cloud build finished successfully: {config}")
                yield f.read()
            except Exception as e:
@@ -247,7 +274,7 @@ def cloud_build_submit(
                source,
            ]
            logging.info(f"{cmd}")
-             subprocess.run(cmd, check=True)
+             subprocess.check_call(cmd)
            logging.info(f"Created image: gcr.io/{project}/{image_name}:{UUID}")
            yield f"{image_name}:{UUID}"
        else:
@@ -265,9 +292,17 @@ def cloud_build_submit(
                "--quiet",
            ]
            logging.info(f"{cmd}")
-             subprocess.run(cmd, check=True)
+             subprocess.check_call(cmd)
            logging.info(f"Deleted image: gcr.io/{project}/{image_name}:{UUID}")

+     @staticmethod
+     def dataflow_job_url(
+         job_id: str,
+         project: str = PROJECT,
+         region: str = REGION,
+     ) -> str:
+         return f"https://console.cloud.google.com/dataflow/jobs/{region}/{job_id}?project={project}"
+
    @staticmethod
    def dataflow_jobs_list(
        project: str = PROJECT, page_size: int = 30
@@ -294,103 +329,83 @@ def dataflow_jobs_list(
                yield job

    @staticmethod
-     def dataflow_jobs_get(
-         job_id: Optional[str] = None,
-         job_name: Optional[str] = None,
-         project: str = PROJECT,
-         list_page_size: int = 30,
-     ) -> Optional[Dict[str, Any]]:
+     def dataflow_job_id(
+         job_name: str, project: str = PROJECT, list_page_size: int = LIST_PAGE_SIZE
+     ) -> str:
+         for job in Utils.dataflow_jobs_list(project, list_page_size):
+             if job["name"] == job_name:
+                 logging.info(f"Found Dataflow job: {job}")
+                 return job["id"]
+         raise ValueError(f"Dataflow job not found: job_name={job_name}")
+
+     @staticmethod
+     def dataflow_jobs_get(job_id: str, project: str = PROJECT) -> Dict[str, Any]:
        from googleapiclient.discovery import build

        dataflow = build("dataflow", "v1b3")

-         if job_id:
-             # For more info see:
-             # https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/get
-             request = (
-                 dataflow.projects()
-                 .jobs()
-                 .get(
-                     projectId=project,
-                     jobId=job_id,
-                     view="JOB_VIEW_SUMMARY",
-                 )
+         # For more info see:
+         # https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/get
+         request = (
+             dataflow.projects()
+             .jobs()
+             .get(
+                 projectId=project,
+                 jobId=job_id,
+                 view="JOB_VIEW_SUMMARY",
            )
-             # If the job is not found, this throws an HttpError exception.
-             job = request.execute()
-             logging.info(f"Found Dataflow job: {job}")
-             return job
-
-         elif job_name:
-             for job in Utils.dataflow_jobs_list(project, list_page_size):
-                 if job["name"] == job_name:
-                     logging.info(f"Found Dataflow job: {job}")
-                     return job
-             raise ValueError(f"Dataflow job not found: job_name={job_name}")
-
-         else:
-             raise ValueError("must specify either `job_id` or `job_name`")
+         )
+         # If the job is not found, this throws an HttpError exception.
+         return request.execute()

    @staticmethod
    def dataflow_jobs_wait(
-         job_id: Optional[str] = None,
-         job_name: Optional[str] = None,
+         job_id: str = None,
+         job_name: str = None,
        project: str = PROJECT,
        region: str = REGION,
-         until_status: str = "JOB_STATE_DONE",
-         list_page_size: int = 100,
+         target_states: Set[str] = {"JOB_STATE_DONE"},
+         list_page_size: int = LIST_PAGE_SIZE,
        timeout_sec: str = TIMEOUT_SEC,
        poll_interval_sec: int = POLL_INTERVAL_SEC,
    ) -> Optional[str]:
        """For a list of all the valid states:
        https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs#Job.JobState
        """

-         # Wait until we reach the desired status, or the job finished in some way.
-         target_status = {
-             until_status,
+         assert job_id or job_name, "required to pass either a job_id or a job_name"
+         if not job_id:
+             job_id = Utils.dataflow_job_id(job_name, project, list_page_size)
+
+         finish_states = {
            "JOB_STATE_DONE",
            "JOB_STATE_FAILED",
            "JOB_STATE_CANCELLED",
            "JOB_STATE_DRAINED",
        }
        logging.info(
-             f"Waiting for Dataflow job until {target_status}: job_id={job_id}, job_name={job_name}"
+             f"Waiting for Dataflow job {job_id} until {target_states}\n"
+             + Utils.dataflow_job_url(job_id, project, region)
        )
-         status = None
-         for _ in range(0, timeout_sec, poll_interval_sec):
+
+         def job_is_done() -> bool:
            try:
-                 job = Utils.dataflow_jobs_get(
-                     job_id=job_id,
-                     job_name=job_name,
-                     project=project,
-                     list_page_size=list_page_size,
-                 )
-                 status = job["currentState"]
-                 if status in target_status:
-                     logging.info(
-                         f"Job status {status} in {target_status}, done waiting"
-                     )
-                     return status
-                 elif status == "JOB_STATE_FAILED":
+                 job = Utils.dataflow_jobs_get(job_id, project)
+                 state = job["currentState"]
+                 if state in target_states:
+                     logging.info(f"Dataflow job found with state {state}")
+                     return True
+                 elif state in finish_states:
                    raise RuntimeError(
-                         "Dataflow job failed:\n"
-                         f"https://console.cloud.google.com/dataflow/jobs/{region}/{job_id}?project={project}"
+                         f"Dataflow job finished with state {state}, but we were expecting {target_states}\n"
+                         + Utils.dataflow_job_url(job_id, project, region)
                    )
-                 logging.info(
-                     f"Job status {status} not in {target_status}, retrying in {poll_interval_sec} seconds"
-                 )
+                 return False
            except Exception as e:
                logging.exception(e)
-             time.sleep(poll_interval_sec)
-         if status is None:
-             raise RuntimeError(
-                 f"Dataflow job not found: timeout_sec={timeout_sec}, target_status={target_status}, job_id={job_id}, job_name={job_name}"
-             )
-         else:
-             raise RuntimeError(
-                 f"Dataflow job finished in status {status} but expected {target_status}: job_id={job_id}, job_name={job_name}"
-             )
+             return False
+
+         Utils.wait_until(job_is_done, timeout_sec, poll_interval_sec)

    @staticmethod
    def dataflow_jobs_cancel(
@@ -416,10 +431,20 @@ def dataflow_jobs_cancel(
                f"--region={region}",
            ]
            logging.info(f"{cmd}")
-             subprocess.run(cmd, check=True)
+             subprocess.check_call(cmd)

            # After draining the job, we must wait until the job has actually finished.
-             Utils.dataflow_jobs_wait(job_id, project=project, region=region)
+             Utils.dataflow_jobs_wait(
+                 job_id,
+                 target_states={
+                     "JOB_STATE_DONE",
+                     "JOB_STATE_FAILED",
+                     "JOB_STATE_CANCELLED",
+                     "JOB_STATE_DRAINED",
+                 },
+                 project=project,
+                 region=region,
+             )

        else:
            # https://cloud.google.com/sdk/gcloud/reference/dataflow/jobs/cancel
@@ -433,7 +458,7 @@ def dataflow_jobs_cancel(
                f"--region={region}",
            ]
            logging.info(f"{cmd}")
-             subprocess.run(cmd, check=True)
+             subprocess.check_call(cmd)

        logging.info(f"Cancelled Dataflow job: {job_id}")

@@ -459,7 +484,7 @@ def dataflow_flex_template_build(
            f"--metadata-file={metadata_file}",
        ]
        logging.info(f"{cmd}")
-         subprocess.run(cmd, check=True)
+         subprocess.check_call(cmd)

        logging.info(f"dataflow_flex_template_build: {template_gcs_path}")
        yield template_gcs_path
@@ -497,32 +522,19 @@ def dataflow_flex_template_run(
        ]
        logging.info(f"{cmd}")

-         try:
-             # The `capture_output` option was added in Python 3.7, so we must
-             # pass the `stdout` and `stderr` options explicitly to support 3.6.
-             # https://docs.python.org/3/library/subprocess.html#subprocess.run
-             p = subprocess.run(
-                 cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
-             )
-             stdout = p.stdout.decode("utf-8")
-             stderr = p.stderr.decode("utf-8")
-             logging.info(f"Launched Dataflow Flex Template job: {unique_job_name}")
-         except subprocess.CalledProcessError as e:
-             logging.info(e, file=sys.stderr)
-             stdout = e.stdout.decode("utf-8")
-             stderr = e.stderr.decode("utf-8")
-         finally:
-             logging.info("--- stderr ---")
-             logging.info(stderr)
-             logging.info("--- stdout ---")
-             logging.info(stdout)
-             logging.info("--- end ---")
-             return yaml.safe_load(stdout)["job"]["id"]
+         stdout = subprocess.check_output(cmd).decode("utf-8")
+         logging.info(f"Launched Dataflow Flex Template job: {unique_job_name}")
+         job_id = yaml.safe_load(stdout)["job"]["id"]
+         logging.info(f"Dataflow Flex Template job id: {job_id}")
+         logging.info(f">> {Utils.dataflow_job_url(job_id, project, region)}")
+         yield job_id
+
+         Utils.dataflow_jobs_cancel(job_id)


@pytest.fixture(scope="session")
def utils() -> Utils:
    logging.getLogger().setLevel(logging.INFO)
    logging.info(f"Test unique identifier: {UUID}")
-     subprocess.run(["gcloud", "version"])
+     subprocess.check_call(["gcloud", "version"])
    return Utils()
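
Reviewer note, not part of the diff: a minimal sketch of how a sample test might combine the helpers added above, via the `utils` session fixture. The test name, job name, dataset, and table below are placeholders invented for illustration; only `dataflow_job_id`, `dataflow_jobs_wait`, `wait_until`, `bigquery_table_exists`, and `dataflow_jobs_cancel` come from this change.

def test_streaming_sample(utils) -> None:
    # Placeholder job name; in a real test the job is launched by the sample under test.
    job_id = utils.dataflow_job_id(job_name="my-sample-job")

    # Wait for the job to reach a running state instead of the default JOB_STATE_DONE.
    utils.dataflow_jobs_wait(job_id=job_id, target_states={"JOB_STATE_RUNNING"})

    # Reuse the generic poller to wait for the pipeline's output table to appear.
    assert utils.wait_until(
        lambda: utils.bigquery_table_exists("my_dataset", "my_output_table")
    )

    # Tear down the streaming job once the assertions are done.
    utils.dataflow_jobs_cancel(job_id)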