@@ -78,7 +78,7 @@ DLDataType RAI_GetDLDataTypeFromTF(TF_DataType dtype) {
   return (DLDataType){ .bits = 0 };
 }
 
-RAI_Tensor* RAI_TensorCreateFromTFTensor(TF_Tensor* tensor) {
+RAI_Tensor* RAI_TensorCreateFromTFTensor(TF_Tensor* tensor, size_t batch_offset, size_t batch_size) {
   RAI_Tensor* ret = RedisModule_Calloc(1, sizeof(*ret));
 
   DLContext ctx = (DLContext){
@@ -88,25 +88,30 @@ RAI_Tensor* RAI_TensorCreateFromTFTensor(TF_Tensor *tensor) {
 
   size_t ndims = TF_NumDims(tensor);
 
+  int64_t total_batch_size = TF_Dim(tensor, 0);
+
   int64_t* shape = RedisModule_Calloc(ndims, sizeof(*shape));
   int64_t* strides = RedisModule_Calloc(ndims, sizeof(*strides));
   for (int64_t i = 0; i < ndims; ++i) {
     shape[i] = TF_Dim(tensor, i);
     strides[i] = 1;
   }
+  shape[0] = batch_size;
   for (int64_t i = ndims - 2; i >= 0; --i) {
     strides[i] *= strides[i + 1] * shape[i + 1];
   }
 
+  size_t sample_bytesize = TF_TensorByteSize(tensor) / total_batch_size;
+
   // FIXME: In TF, RunSession allocates memory for output tensors
   // This means that we either memcpy the tensor data and let
   // Redis be responsible for the memory, or we reuse the TF
   // allocated memory, which might not be optimal down the road
   // Note: on YOLO this has no impact on perf
 #ifdef RAI_COPY_RUN_OUTPUT
-  size_t len = TF_TensorByteSize(tensor);
+  size_t len = sample_bytesize * batch_size;
   char* data = RedisModule_Calloc(len, sizeof(*data));
-  memcpy(data, TF_TensorData(tensor), len);
+  memcpy(data, TF_TensorData(tensor) + sample_bytesize * batch_offset, len);
 #endif
 
   // TODO: use manager_ctx to ensure TF tensor doesn't get deallocated
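The hunk above slices one request's samples out of a batched TF output: sample_bytesize is the per-row byte count (total bytes divided by the tensor's first dimension), and the memcpy starts batch_offset rows into the TF buffer and copies batch_size rows. The following is a minimal standalone sketch of that arithmetic only; copy_batch_slice and its parameter names are illustrative and not part of RedisAI or the TensorFlow C API.

    /* Sketch: carve one request's rows out of a batched, contiguous buffer. */
    #include <string.h>

    static void copy_batch_slice(const char *src, char *dst,
                                 size_t total_bytes, size_t total_rows,
                                 size_t row_offset, size_t nrows) {
        size_t row_bytes = total_bytes / total_rows;  /* bytes per sample */
        memcpy(dst,
               src + row_bytes * row_offset,          /* skip earlier requests' rows */
               row_bytes * nrows);                    /* copy this request's rows */
    }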
@@ -160,6 +165,64 @@ TF_Tensor* RAI_TFTensorFromTensor(RAI_Tensor* t){
 #endif /* RAI_COPY_RUN_INPUT */
 }
 
+TF_Tensor* RAI_TFTensorFromTensors(RAI_Tensor** ts, size_t count) {
+
+  if (count == 0) {
+    return NULL;
+  }
+
+  size_t batch_size = 0;
+
+  for (size_t i = 0; i < count; i++) {
+    batch_size += ts[i]->tensor.dl_tensor.shape[0];
+  }
+
+  RAI_Tensor* t0 = ts[0];
+
+  int ndim = t0->tensor.dl_tensor.ndim;
+  int64_t batched_shape[ndim];
+
+  for (size_t i = 0; i < ndim; i++) {
+    batched_shape[i] = t0->tensor.dl_tensor.shape[i];
+  }
+
+  batched_shape[0] = batch_size;
+
+  TF_Tensor* out = TF_AllocateTensor(
+      RAI_GetTFDataTypeFromDL(t0->tensor.dl_tensor.dtype),
+      batched_shape,
+      t0->tensor.dl_tensor.ndim,
+      RAI_TensorByteSize(t0) / t0->tensor.dl_tensor.shape[0] * batch_size);
+
+  size_t offset = 0;
+  for (size_t i = 0; i < count; i++) {
+    size_t tbytesize = RAI_TensorByteSize(ts[i]);
+    memcpy(TF_TensorData(out) + offset, ts[i]->tensor.dl_tensor.data, tbytesize);
+    offset += tbytesize;
+  }
+
+  return out;
+
+  // #ifdef RAI_COPY_RUN_INPUT
+  // TF_Tensor* out = TF_AllocateTensor(
+  //     RAI_GetTFDataTypeFromDL(t->tensor.dl_tensor.dtype),
+  //     t->tensor.dl_tensor.shape,
+  //     t->tensor.dl_tensor.ndim,
+  //     RAI_TensorByteSize(t));
+  // memcpy(TF_TensorData(out), t->tensor.dl_tensor.data, TF_TensorByteSize(out));
+  // return out;
+  // #else
+  // return TF_NewTensor(
+  //     RAI_GetTFDataTypeFromDL(t->tensor.dl_tensor.dtype),
+  //     t->tensor.dl_tensor.shape,
+  //     t->tensor.dl_tensor.ndim,
+  //     t->tensor.dl_tensor.data,
+  //     RAI_TensorByteSize(t),
+  //     &RAI_TFDeallocator,
+  //     NULL);
+  // #endif /* RAI_COPY_RUN_INPUT */
+}
+
 
 RAI_Model* RAI_ModelCreateTF(RAI_Backend backend, const char* devicestr,
                              size_t ninputs, const char** inputs,
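RAI_TFTensorFromTensors concatenates the per-request tensors along dimension 0: it sums the first-dimension sizes, allocates a single TF tensor with the batched shape, and appends each input's bytes at a running offset. The sketch below shows the same append-at-offset pattern on plain byte buffers, assuming all inputs share the same trailing dimensions; concat_along_dim0 and its arguments are made-up names, not RedisAI functions.

    /* Sketch: concatenate same-layout buffers along the first axis. */
    #include <stdlib.h>
    #include <string.h>

    char *concat_along_dim0(char **bufs, const size_t *nbytes, size_t count) {
        size_t total = 0;
        for (size_t i = 0; i < count; i++) total += nbytes[i];
        char *out = malloc(total);                     /* one buffer for the whole batch */
        if (out == NULL) return NULL;
        size_t offset = 0;
        for (size_t i = 0; i < count; i++) {
            memcpy(out + offset, bufs[i], nbytes[i]);  /* append each input block */
            offset += nbytes[i];
        }
        return out;
    }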
@@ -359,17 +422,41 @@ void RAI_ModelFreeTF(RAI_Model* model, RAI_Error* error) {
 
 int RAI_ModelRunTF(RAI_ModelRunCtx* mctx, RAI_Error* error) {
   TF_Status* status = TF_NewStatus();
-  const size_t ninputs = array_len(mctx->inputs);
-  const size_t noutputs = array_len(mctx->outputs);
+  const size_t nbatches = array_len(mctx->batches);
+
+  if (nbatches == 0) {
+    return 1;
+  }
+
+  const size_t ninputs = array_len(mctx->batches[0].inputs);
+  const size_t noutputs = array_len(mctx->batches[0].outputs);
   TF_Tensor* inputTensorsValues[ninputs];
   TF_Output inputs[ninputs];
   TF_Tensor* outputTensorsValues[noutputs];
   TF_Output outputs[noutputs];
 
-  for (size_t i = 0; i < ninputs; ++i) {
-    inputTensorsValues[i] = RAI_TFTensorFromTensor(mctx->inputs[i].tensor);
+  size_t batch_sizes[nbatches];
+  size_t batch_offsets[nbatches];
+  if (array_len(mctx->batches[0].inputs) > 0) {
+    for (size_t b = 0; b < nbatches; ++b) {
+      batch_sizes[b] = RAI_TensorDim(mctx->batches[b].inputs[0].tensor, 0);
+    }
+    batch_offsets[0] = 0;
+    for (size_t b = 1; b < nbatches; ++b) {
+      batch_offsets[b] = batch_offsets[b - 1] + batch_sizes[b - 1];
+    }
+  }
+
+  for (size_t i = 0; i < ninputs; ++i) {
+    RAI_Tensor* batched_input_tensors[nbatches];
+
+    for (size_t b = 0; b < nbatches; ++b) {
+      batched_input_tensors[b] = mctx->batches[b].inputs[i].tensor;
+    }
+    // inputTensorsValues[i] = RAI_TFTensorFromTensor(mctx->inputs[i].tensor);
+    inputTensorsValues[i] = RAI_TFTensorFromTensors(batched_input_tensors, nbatches);
     TF_Output port;
-    port.oper = TF_GraphOperationByName(mctx->model->model, mctx->inputs[i].name);
+    port.oper = TF_GraphOperationByName(mctx->model->model, mctx->batches[0].inputs[i].name);
     port.index = 0;
     if (port.oper == NULL) {
       return 1;
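Here batch_sizes holds each queued request's row count (the first dimension of its first input tensor) and batch_offsets the number of rows that precede it in the concatenated input, i.e. an exclusive prefix sum. A small standalone sketch of that computation, with illustrative names only, is:

    /* Sketch: exclusive prefix sums turn per-request row counts into row offsets. */
    #include <stddef.h>

    void batch_offsets_from_sizes(const size_t *sizes, size_t *offsets, size_t n) {
        size_t acc = 0;
        for (size_t b = 0; b < n; b++) {
            offsets[b] = acc;   /* rows that come before request b */
            acc += sizes[b];
        }
    }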
@@ -379,7 +466,7 @@ int RAI_ModelRunTF(RAI_ModelRunCtx* mctx, RAI_Error *error) {
 
   for (size_t i = 0; i < noutputs; ++i) {
     TF_Output port;
-    port.oper = TF_GraphOperationByName(mctx->model->model, mctx->outputs[i].name);
+    port.oper = TF_GraphOperationByName(mctx->model->model, mctx->batches[0].outputs[i].name);
     port.index = 0;
     if (port.oper == NULL) {
       return 1;
@@ -406,11 +493,13 @@ int RAI_ModelRunTF(RAI_ModelRunCtx* mctx, RAI_Error *error) {
     return 1;
   }
 
-  for (size_t i = 0; i < noutputs; ++i) {
-    RAI_Tensor* output_tensor = RAI_TensorCreateFromTFTensor(outputTensorsValues[i]);
+  for (size_t i = 0; i < noutputs; ++i) {
+    for (size_t b = 0; b < nbatches; b++) {
+      RAI_Tensor* output_tensor = RAI_TensorCreateFromTFTensor(outputTensorsValues[i], batch_offsets[b], batch_sizes[b]);
+      mctx->batches[b].outputs[i].tensor = RAI_TensorGetShallowCopy(output_tensor);
+      RAI_TensorFree(output_tensor);
+    }
     TF_DeleteTensor(outputTensorsValues[i]);
-    mctx->outputs[i].tensor = RAI_TensorGetShallowCopy(output_tensor);
-    RAI_TensorFree(output_tensor);
   }
 
   // TODO: add (make sure we deallocate once)
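After the session runs, each output tensor is split back per request: batch b receives rows [batch_offsets[b], batch_offsets[b] + batch_sizes[b]) as a shallow-copied RAI_Tensor, and the shared TF tensor is deleted once per output. A toy example with assumed row counts {2, 3} (values chosen purely for illustration) shows the resulting row ranges:

    /* Sketch: which rows of a 5-row concatenated output each request gets. */
    #include <stdio.h>

    int main(void) {
        size_t batch_sizes[]   = {2, 3};
        size_t batch_offsets[] = {0, 2};   /* exclusive prefix sums of the sizes */
        for (size_t b = 0; b < 2; b++) {
            printf("batch %zu gets rows [%zu, %zu)\n",
                   b, batch_offsets[b], batch_offsets[b] + batch_sizes[b]);
        }
        return 0;
    }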