13 | 13 | """
14 | 14 | import os
15 | 15 | import time
   | 16 | +import numpy as np
16 | 17 | from argparse import ArgumentParser
17 | 18 |
18 | 19 | import torch
19 | 20 | from transformers import AutoTokenizer, AutoModelForCausalLM
20 | 21 |
21 | 22 | from openvino.inference_engine import IECore
22 | 23 |
   | 24 | +def generate_greedy_pytorch(tokens, model, n):
   | 25 | +    complete_seq = tokens.permute((1, 0)).tolist()
   | 26 | +    for _ in range(n):
   | 27 | +        out = model(tokens)
   | 28 | +        next_tokens = torch.argmax(out.logits[:, -1], dim=-1).unsqueeze(1)
   | 29 | +        tokens = torch.cat([tokens, next_tokens], dim=-1)
   | 30 | +        tokens = tokens[:, 1:]  # drop the oldest token to keep the input length fixed
   | 31 | +        complete_seq.extend(next_tokens.permute((1, 0)).tolist())
   | 32 | +    return np.array(complete_seq).T.tolist()
   | 33 | +
   | 34 | +
   | 35 | +def generate_greedy_openvino(tokens, exec_net, n, logits_dict_key="2859"):
   | 36 | +    complete_seq = tokens.T.tolist()
   | 37 | +    for _ in range(n):
   | 38 | +        out = exec_net.infer(inputs={"0": tokens})[logits_dict_key]
   | 39 | +        next_tokens = np.argmax(out[:, -1], axis=-1).reshape(-1, 1)
   | 40 | +        tokens = np.hstack((tokens, next_tokens))
   | 41 | +        tokens = tokens[:, 1:]  # same fixed-length sliding window as the PyTorch loop
   | 42 | +        complete_seq.extend(next_tokens.T.tolist())
   | 43 | +    return np.array(complete_seq).T.tolist()
   | 44 | +
   | 45 | +
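Note on the PyTorch helper above: with batch size 1 it should match HuggingFace's built-in greedy search, up to the point where the sliding-window truncation starts dropping context. A minimal cross-check, assuming a transformers version whose `generate` accepts `max_length` (the exact argument set varies across releases):

    # reference greedy decoding via the built-in generate API (do_sample=False => greedy)
    ref = model.generate(input_encoder["input_ids"],
                         max_length=input_encoder["input_ids"].shape[1] + 40,
                         do_sample=False)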
23 | 46 | if __name__ == '__main__':
24 | 47 |     parser = ArgumentParser()
25 |    | -    parser.add_argument("--model", help="Path to an .xml file with a trained model.", default = "gpt2.xml", type=str)
   | 48 | +    parser.add_argument("--model", help="Path to an .xml file with a trained model.", default="./gpt2.xml", type=str)
   | 49 | +    parser.add_argument("--g", help="if set, also test text generation", action="store_true")
   | 50 | +    args = parser.parse_args()
26 | 51 |
27 | 52 |     print("-"*70)
28 | 53 |     print("Loading Pytorch model")
29 | 54 |     tokenizer = AutoTokenizer.from_pretrained("gpt2")
30 | 55 |     model = AutoModelForCausalLM.from_pretrained("gpt2")
31 | 56 |     with open("text.en", "r") as f:
32 | 57 |         text = f.read()
33 |    | -    input_encoder = tokenizer([text for _ in range(100)], return_tensors="pt")
34 |    | -    print(":: -->", input_encoder["input_ids"].size())
   | 58 | +    input_encoder = tokenizer([text + tokenizer.eos_token for _ in range(1)], return_tensors="pt")
   | 59 | +    print("Text shape:", input_encoder["input_ids"].size())
35 | 60 |
36 | 61 |     st = time.time()
37 | 62 |     model(input_encoder["input_ids"])
38 |    | -    print(f"Pytorch inference in {time.time() - st:.5f}s")
39 |    | -    del model, tokenizer
   | 63 | +    print(f":: Pytorch inference in {time.time() - st:.5f}s")
   | 64 | +    if args.g:
   | 65 | +        print("-"*70)
   | 66 | +        print("Testing generation")
   | 67 | +        st = time.time()
   | 68 | +        out = generate_greedy_pytorch(input_encoder["input_ids"], model, n=40)
   | 69 | +        out = tokenizer.decode(out[0])
   | 70 | +        print(f":: Pytorch generation (40 steps) took {time.time() - st:.3f}s")
   | 71 | +    del model
40 | 72 |
41 |    | -    args = parser.parse_args()
42 | 73 |     print("-"*70)
43 | 74 |     model_xml = args.model
44 | 75 |     model_bin = os.path.splitext(model_xml)[0] + ".bin"

54 | 85 |     print("Loading IR to the plugin...")
55 | 86 |     exec_net = ie.load_network(network=net, device_name="CPU", num_requests=2)
56 | 87 |     print(f"exec_net: {exec_net}")
57 |    | -    print("-"*70)
58 | 88 |
59 | 89 |     # this is a bit tricky. The input name to the model is the input name from the ONNX graph;
60 | 90 |     # IECore makes a networkX graph of the "computation graph" and when we run .infer

63 | 93 |     # suspect. Happy Hunting!
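Rather than guessing the blob names the comment above warns about, you can enumerate them from the parsed network. A short sketch, assuming a recent OpenVINO release where the read network exposes `input_info` (older releases expose `net.inputs` instead); the names printed depend on your particular ONNX export:

    # list the network's input and output blob names (and output shapes)
    for name in net.input_info:
        print("input: ", name)
    for name, data in net.outputs.items():
        print("output:", name, data.shape)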
64 | 94 |     inputs = input_encoder["input_ids"].tolist()
65 | 95 |     st = time.time()
66 |    | -    out = exec_net.infer(inputs={"0": [1 for _ in range(127)]})
67 |    | -    print(f"OpenVino inference in {time.time() - st:.5f}s")
   | 96 | +    out = exec_net.infer(inputs={"0": inputs})
   | 97 | +
   | 98 | +    # out is a dictionary with one entry per graph output, so you will need to
   | 99 | +    # determine which one holds the logits by checking for the correct shape:
   | 100 | +    # for k in list(out.keys()):
   | 101 | +    #     print(k, "-->", out[k].shape)
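For GPT-2 the logits blob is the one whose last dimension is the vocabulary size (50257), which gives a way to pick the key programmatically instead of hard-coding "2859" (an export-specific name); `logits_key` below is an illustrative variable, not part of the original script:

    # pick the output whose last dimension is the GPT-2 vocab size
    logits_key = next(k for k, v in out.items() if v.shape[-1] == 50257)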
   | 102 | +
   | 103 | +    print(f":: OpenVino inference in {time.time() - st:.5f}s")
   | 104 | +
   | 105 | +    if args.g:
   | 106 | +        print("-"*70)
   | 107 | +        print("Testing generation")
   | 108 | +        st = time.time()
   | 109 | +        out = generate_greedy_openvino(input_encoder["input_ids"].numpy(), exec_net, n=40)
   | 110 | +        out = tokenizer.decode(out[0])
   | 111 | +        print(f":: OpenVino generation (40 steps) took {time.time() - st:.3f}s")
   | 112 | +
68 | 113 |     print("-"*70)
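Assuming the script is saved as, e.g., `gpt2_openvino.py` (a hypothetical name) next to the converted `gpt2.xml`/`gpt2.bin` pair and the `text.en` prompt file, the timing comparison plus both generation tests would be run as:

    python gpt2_openvino.py --model ./gpt2.xml --g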