13 | 13 | """
14 | 14 | import os
15 | 15 | import time
   | 16 | +import numpy as np
16 | 17 | from argparse import ArgumentParser
17 | 18 |
18 | 19 | import torch
19 | 20 | from transformers import AutoTokenizer, AutoModelForCausalLM
20 | 21 |
21 | 22 | from openvino.inference_engine import IECore
22 | 23 |
   | 24 | +def generate_greedy_pytorch(tokens, model, n):
   | 25 | +    complete_seq = tokens.permute((1, 0)).tolist()
   | 26 | +    for _ in range(n):
   | 27 | +        out = model(tokens)
   | 28 | +        next_tokens = torch.argmax(out.logits[:, -1], dim=-1).unsqueeze(1)
   | 29 | +        tokens = torch.cat([tokens, next_tokens], dim=-1)
   | 30 | +        tokens = tokens[:, 1:]  # drop the oldest token to keep the input length fixed
   | 31 | +        complete_seq.extend(next_tokens.permute((1, 0)).tolist())
   | 32 | +    return np.array(complete_seq).T.tolist()
   | 33 | +
   | 34 | +
   | 35 | +def generate_greedy_openvino(tokens, exec_net, n, logits_dict_key="2859"):
   | 36 | +    complete_seq = tokens.T.tolist()
   | 37 | +    for _ in range(n):
   | 38 | +        out = exec_net.infer(inputs={"0": tokens})[logits_dict_key]
   | 39 | +        next_tokens = np.argmax(out[:, -1], axis=-1).reshape(-1, 1)
   | 40 | +        tokens = np.hstack((tokens, next_tokens))
   | 41 | +        tokens = tokens[:, 1:]  # same fixed-length sliding window as the PyTorch loop
   | 42 | +        complete_seq.extend(next_tokens.T.tolist())
   | 43 | +    return np.array(complete_seq).T.tolist()
   | 44 | +
   | 45 | +
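Note on the PyTorch helper above: with batch size 1 it should match HuggingFace's built-in greedy search, up to the point where the sliding-window truncation starts dropping context. A minimal cross-check, assuming a transformers version whose `generate` accepts `max_length` (the exact argument set varies across releases):

    # reference greedy decoding via the built-in generate API (do_sample=False => greedy)
    ref = model.generate(input_encoder["input_ids"],
                         max_length=input_encoder["input_ids"].shape[1] + 40,
                         do_sample=False)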
23 | 46 | if __name__ == '__main__':
24 | 47 |     parser = ArgumentParser()
25 |    | -    parser.add_argument("--model", help="Path to an .xml file with a trained model.", default = "gpt2.xml", type=str)
   | 48 | +    parser.add_argument("--model", help="Path to an .xml file with a trained model.", default="./gpt2.xml", type=str)
   | 49 | +    parser.add_argument("--g", help="if set, also test text generation", action="store_true")
   | 50 | +    args = parser.parse_args()
26 | 51 |
27 | 52 |     print("-"*70)
28 | 53 |     print("Loading Pytorch model")
29 | 54 |     tokenizer = AutoTokenizer.from_pretrained("gpt2")
30 | 55 |     model = AutoModelForCausalLM.from_pretrained("gpt2")
31 | 56 |     with open("text.en", "r") as f:
32 | 57 |         text = f.read()
33 |    | -    input_encoder = tokenizer([text for _ in range(100)], return_tensors="pt")
34 |    | -    print(":: -->", input_encoder["input_ids"].size())
   | 58 | +    input_encoder = tokenizer([text + tokenizer.eos_token for _ in range(1)], return_tensors="pt")
   | 59 | +    print("Text shape:", input_encoder["input_ids"].size())
35 | 60 |
36 | 61 |     st = time.time()
37 | 62 |     model(input_encoder["input_ids"])
38 |    | -    print(f"Pytorch inference in {time.time() - st:.5f}s")
39 |    | -    del model, tokenizer
   | 63 | +    print(f":: Pytorch inference in {time.time() - st:.5f}s")
   | 64 | +    if args.g:
   | 65 | +        print("-"*70)
   | 66 | +        print("Testing generation")
   | 67 | +        st = time.time()
   | 68 | +        out = generate_greedy_pytorch(input_encoder["input_ids"], model, n=40)
   | 69 | +        out = tokenizer.decode(out[0])
   | 70 | +        print(f":: Pytorch generation (40 steps) took {time.time() - st:.3f}s")
   | 71 | +    del model
40 | 72 |
41 |    | -    args = parser.parse_args()
42 | 73 |     print("-"*70)
43 | 74 |     model_xml = args.model
44 | 75 |     model_bin = os.path.splitext(model_xml)[0] + ".bin"

54 | 85 |     print("Loading IR to the plugin...")
55 | 86 |     exec_net = ie.load_network(network=net, device_name="CPU", num_requests=2)
56 | 87 |     print(f"exec_net: {exec_net}")
57 |    | -    print("-"*70)
58 | 88 |
59 | 89 |     # this is a bit tricky. The input name to the model is the input name from the ONNX graph;
60 | 90 |     # IECore makes a networkX graph of the "computation graph" and when we run .infer

63 | 93 |     # suspect. Happy Hunting!
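Rather than guessing the blob names the comment above warns about, you can enumerate them from the parsed network. A short sketch, assuming a recent OpenVINO release where the read network exposes `input_info` (older releases expose `net.inputs` instead); the names printed depend on your particular ONNX export:

    # list the network's input and output blob names (and output shapes)
    for name in net.input_info:
        print("input: ", name)
    for name, data in net.outputs.items():
        print("output:", name, data.shape)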
64 | 94 |     inputs = input_encoder["input_ids"].tolist()
65 | 95 |     st = time.time()
66 |    | -    out = exec_net.infer(inputs={"0": [1 for _ in range(127)]})
67 |    | -    print(f"OpenVino inference in {time.time() - st:.5f}s")
   | 96 | +    out = exec_net.infer(inputs={"0": inputs})
   | 97 | +
   | 98 | +    # out is a dictionary with one entry per graph output, so you will need to
   | 99 | +    # determine which one holds the logits by checking for the correct shape:
   | 100 | +    # for k in list(out.keys()):
   | 101 | +    #     print(k, "-->", out[k].shape)
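For GPT-2 the logits blob is the one whose last dimension is the vocabulary size (50257), which gives a way to pick the key programmatically instead of hard-coding "2859" (an export-specific name); `logits_key` below is an illustrative variable, not part of the original script:

    # pick the output whose last dimension is the GPT-2 vocab size
    logits_key = next(k for k, v in out.items() if v.shape[-1] == 50257)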
   | 102 | +
   | 103 | +    print(f":: OpenVino inference in {time.time() - st:.5f}s")
   | 104 | +
   | 105 | +    if args.g:
   | 106 | +        print("-"*70)
   | 107 | +        print("Testing generation")
   | 108 | +        st = time.time()
   | 109 | +        out = generate_greedy_openvino(input_encoder["input_ids"].numpy(), exec_net, n=40)
   | 110 | +        out = tokenizer.decode(out[0])
   | 111 | +        print(f":: OpenVino generation (40 steps) took {time.time() - st:.3f}s")
   | 112 | +
68 | 113 |     print("-"*70)
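Assuming the script is saved as, e.g., `gpt2_openvino.py` (a hypothetical name) next to the converted `gpt2.xml`/`gpt2.bin` pair and the `text.en` prompt file, the timing comparison plus both generation tests would be run as:

    python gpt2_openvino.py --model ./gpt2.xml --g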