-
Notifications
You must be signed in to change notification settings - Fork 51
/
Copy pathtest_pretrained_models.py
152 lines (124 loc) · 6.04 KB
/
test_pretrained_models.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import contextlib
import os
import sys
import unittest
from io import BytesIO

# NOTE(review): the third-party imports below keep their original relative
# order on purpose. TensorFlow and PyTorch can be sensitive to which one is
# loaded first, so do not re-sort them without re-running the suite.
from movinets import MoViNet
from movinets.config import _C
import tensorflow as tf
import numpy as np
from six.moves import urllib
from PIL import Image
from einops import rearrange
import torch
import tensorflow as tf
import tensorflow_hub as hub
# Model configurations for the six MoViNet variants, ordered A0..A5 so that
# ``movinets[i]`` is the configuration of variant ``A{i}``.
movinets = [getattr(_C.MODEL, f"MoViNetA{index}") for index in range(6)]
class TestPretrainedModels(unittest.TestCase):
    """Compare the pretrained PyTorch MoViNet models with the TF-Hub models.

    Each test builds a short synthetic video from one downloaded image, runs
    it through the TF-Hub model and through the PyTorch ``MoViNet`` loaded
    with ``pretrained=True``, and asserts that both outputs agree within
    ``_RTOL`` / ``_ATOL``.

    The tests need network access (image download, TF-Hub model download and,
    presumably, the PyTorch pretrained weights).
    """

    _IMAGE_URL = 'https://upload.wikimedia.org/wikipedia/commons/8/84/Ski_Famille_-_Family_Ski_Holidays.jpg'
    # Square input side length used for variant A{i}; index-aligned with the
    # module-level ``movinets`` list.
    _RESOLUTIONS = [172, 172, 224, 256, 290, 320]
    # The streaming test only exercises the first three variants (A0..A2).
    _NUM_STREAM_MODELS = 3
    # Tolerances passed to ``np.allclose`` when comparing model outputs.
    _RTOL = 1e-06
    _ATOL = 1e-4

    @staticmethod
    @contextlib.contextmanager
    def _silenced_stderr():
        """Temporarily send Python-level ``sys.stderr`` writes to the null device.

        The sink is closed and the previous ``sys.stderr`` is restored on
        exit, including when the body raises.
        """
        with open(os.devnull, 'w') as sink, contextlib.redirect_stderr(sink):
            yield

    @classmethod
    def _download_image(cls):
        """Return the raw bytes of the reference image at ``_IMAGE_URL``."""
        with urllib.request.urlopen(cls._IMAGE_URL) as response:
            return response.read()

    @staticmethod
    def _make_video(raw_image, height, width, num_frames):
        """Build the same synthetic clip in TensorFlow and PyTorch layouts.

        Frame ``k`` (1-based) is the resized image, scaled to [0, 1] and
        divided by ``k``; the first frame is left undivided.

        Args:
            raw_image: Encoded image bytes as returned by ``_download_image``.
            height: Frame height in pixels.
            width: Frame width in pixels.
            num_frames: Number of frames in the clip (at least 1).

        Returns:
            A ``(video, video_2)`` pair: ``video`` is a float32 TensorFlow
            tensor laid out ``[1, num_frames, height, width, 3]`` and
            ``video_2`` is the same data as a torch tensor laid out
            ``[1, 3, num_frames, height, width]``.
        """
        # PIL's resize takes (width, height), while the resulting array is
        # (height, width, channels). Forcing RGB guarantees the 3 channels
        # that the reshape below requires.
        image = Image.open(BytesIO(raw_image)).convert('RGB').resize((width, height))
        frame = tf.reshape(np.array(image), [1, 1, height, width, 3])
        frame = tf.cast(frame, tf.float32) / 255.
        frames = [frame] + [frame / k for k in range(2, num_frames + 1)]
        video = tf.concat(frames, axis=1)
        video_2 = rearrange(torch.from_numpy(video.numpy()), "b t h w c-> b c t h w")
        return video, video_2

    def _assert_close(self, actual, expected, what):
        """Assert two arrays agree within the class tolerances.

        ``what`` is included in the failure message to identify the check.
        """
        self.assertTrue(
            np.allclose(actual, expected, rtol=self._RTOL, atol=self._ATOL),
            msg=f"{what}: outputs differ beyond rtol={self._RTOL}, atol={self._ATOL}",
        )

    def testBasePretrainedModels(self):
        """Non-causal (base) models A0..A5 must match their TF-Hub counterparts."""
        raw_image = self._download_image()
        inputs = tf.keras.layers.Input(
            shape=[None, None, None, 3],
            dtype=tf.float32)
        with self._silenced_stderr():
            for i, side in enumerate(self._RESOLUTIONS):
                # One sub-test per variant so a failure names the model and
                # the remaining variants are still checked.
                with self.subTest(model=f"a{i}"):
                    video, video_2 = self._make_video(raw_image, side, side, num_frames=2)

                    encoder = hub.KerasLayer(
                        f"https://tfhub.dev/tensorflow/movinet/a{i}/base/kinetics-600/classification/3")
                    # Important: due to a bug in the tf.nn.conv3d CPU implementation, we must
                    # compile with tf.function to enforce correct behavior. Otherwise, the output
                    # on CPU may be incorrect.
                    # NOTE(review): newer TensorFlow releases call this argument
                    # ``jit_compile``; left unchanged to match the TF version in use.
                    encoder.call = tf.function(encoder.call, experimental_compile=True)
                    # [batch_size, 600]
                    outputs = encoder(dict(image=inputs))
                    model_tf = tf.keras.Model(inputs, outputs)
                    output_tf = model_tf(video)
                    # Drop the reference before the PyTorch model is built.
                    del model_tf

                    model = MoViNet(movinets[i], causal=False, pretrained=True)
                    model.eval()
                    with torch.no_grad():
                        model.clean_activation_buffers()
                        output = model(video_2)
                    del model

                    self._assert_close(
                        output.detach().numpy(), output_tf.numpy(),
                        f"MoViNet-A{i} base, PyTorch vs TF-Hub")

    def testStreamPretrainedModels(self):
        """Causal (stream) models A0..A2 must match TF-Hub and be chunk-invariant.

        Two properties are checked for each variant:
        * feeding the clip to the PyTorch model one frame at a time yields the
          same final output as feeding the whole clip at once;
        * the whole-clip PyTorch output matches the final output of the
          TF-Hub streaming model run frame by frame.
        """
        raw_image = self._download_image()
        with self._silenced_stderr():
            for i, side in enumerate(self._RESOLUTIONS[:self._NUM_STREAM_MODELS]):
                with self.subTest(model=f"a{i}"):
                    video, video_2 = self._make_video(raw_image, side, side, num_frames=3)

                    encoder = hub.KerasLayer(
                        f"https://tfhub.dev/tensorflow/movinet/a{i}/stream/kinetics-600/classification/3")
                    image_input = tf.keras.layers.Input(
                        shape=[None, None, None, 3],
                        dtype=tf.float32,
                        name='image')
                    # Define the state inputs, which is a dict that maps state names to tensors.
                    init_states_fn = encoder.resolved_object.signatures['init_states']
                    state_shapes = {
                        name: ([s if s > 0 else None for s in state.shape], state.dtype)
                        for name, state in init_states_fn(tf.constant([0, 0, 0, 0, 3])).items()
                    }
                    states_input = {
                        name: tf.keras.Input(shape[1:], dtype=dtype, name=name)
                        for name, (shape, dtype) in state_shapes.items()
                    }
                    # The inputs to the model are the states and the video
                    inputs = {**states_input, 'image': image_input}
                    outputs = encoder(inputs)
                    model_tf = tf.keras.Model(inputs, outputs, name='movinet')

                    # Split the video into individual frames.
                    # Note: we can also split into larger clips as well (e.g., 8-frame clips).
                    # Running on larger clips will slightly reduce latency overhead, but
                    # will consume more memory.
                    frames = tf.split(video, video.shape[1], axis=1)
                    # Initialize the dict of states. All state tensors are initially zeros.
                    states = init_states_fn(tf.shape(video))
                    # Run the model prediction by looping over each frame. The video
                    # classification is simply the last output of the model.
                    output_tf = None
                    for frame in frames:
                        output_tf, states = model_tf({**states, 'image': frame})
                    del model_tf

                    model = MoViNet(movinets[i], causal=True, pretrained=True)
                    model.eval()
                    with torch.no_grad():
                        # Whole clip in a single call.
                        model.clean_activation_buffers()
                        output = model(video_2)
                        # Same clip, one frame per call, starting from clean buffers.
                        model.clean_activation_buffers()
                        output_2 = None
                        for t in range(video_2.shape[2]):
                            output_2 = model(video_2[:, :, t:t + 1])
                    del model

                    self._assert_close(
                        output.detach().numpy(), output_2.numpy(),
                        f"MoViNet-A{i} stream, whole clip vs frame by frame")
                    self._assert_close(
                        output.detach().numpy(), output_tf.numpy(),
                        f"MoViNet-A{i} stream, PyTorch vs TF-Hub")
# Run the whole suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()