OOM error when running image style transfer with larger images
Source: 6-9 Image style transfer training loop implementation
Erlla
2019-06-18
The content image is 2000x1126 and the style image is 1500x1730.
Training on an RTX 2070 with 8 GB of VRAM; num_steps is 15, and after producing one output image the run fails with an OOM error.
Here is the code:
import os
import time

import numpy as np
import tensorflow as tf
from PIL import Image

# Per-channel means (in BGR order) used to preprocess VGG16 inputs
VGG_MEAN = [103.939, 116.779, 123.68]
class VGGNet:
    def __init__(self, data_dict):
        self.data_dict = data_dict  # pretrained VGG16 parameters loaded from vgg16.npy

    def get_conv_filter(self, name):
        # Fetch the convolution kernel of layer `name`
        return tf.constant(self.data_dict[name][0], name='conv')

    def get_fc_weight(self, name):
        # Fetch the weight matrix of fully connected layer `name`
        return tf.constant(self.data_dict[name][0], name='fc')

    def get_bias(self, name):
        # Fetch the bias vector of layer `name`
        return tf.constant(self.data_dict[name][1], name='bias')

    def conv_layer(self, x, name):
        """Create a convolution layer.

        x: input tensor of shape [batch, height, width, channel]
        name: layer name, used to look up the pretrained parameters
        """
        with tf.name_scope(name):
            conv_w = self.get_conv_filter(name)  # fetch the kernel
            conv_b = self.get_bias(name)         # fetch the bias
            h = tf.nn.conv2d(x, conv_w, [1, 1, 1, 1], padding='SAME')  # [1,1,1,1]: stride per dimension
            h = tf.nn.bias_add(h, conv_b)
            # The two steps above replace tf.layers.conv2d() so the
            # pretrained kernels can be plugged in directly
            h = tf.nn.relu(h)
            return h

    def pooling_layer(self, x, name):
        # Max pooling; pooling layers need no VGG16 parameters
        return tf.nn.max_pool(x, ksize=[1, 2, 2, 1],
                              strides=[1, 2, 2, 1],
                              padding='SAME',
                              name=name)

    def fc_layer(self, x, name, activation=tf.nn.relu):
        # Fully connected layer with an optional activation function
        with tf.name_scope(name):
            fc_w = self.get_fc_weight(name)
            fc_b = self.get_bias(name)
            h = tf.matmul(x, fc_w)
            h = tf.nn.bias_add(h, fc_b)
            if activation is None:
                return h
            return activation(h)

    def flatten_layer(self, x, name):
        with tf.name_scope(name):
            # x_shape: [batch, height, width, channel]
            x_shape = x.get_shape().as_list()
            dim = 1
            for d in x_shape[1:]:
                dim *= d  # multiply the last three dimensions together
            x = tf.reshape(x, [-1, dim])
            return x
    def build(self, x_rgb):
        start_time = time.time()
        print('building model ......')
        # Convert RGB to BGR and subtract the per-channel VGG means
        r, g, b = tf.split(x_rgb, [1, 1, 1], axis=3)
        x_bgr = tf.concat([b - VGG_MEAN[0],
                           g - VGG_MEAN[1],
                           r - VGG_MEAN[2]], axis=3)
        # assert x_bgr.get_shape().as_list()[1:] == [224, 224, 3]
        # Rebuild the convolutional part of VGG16
        # stage 1
        self.conv1_1 = self.conv_layer(x_bgr, 'conv1_1')
        self.conv1_2 = self.conv_layer(self.conv1_1, 'conv1_2')
        self.pool1 = self.pooling_layer(self.conv1_2, 'pool1')
        # stage 2
        self.conv2_1 = self.conv_layer(self.pool1, 'conv2_1')
        self.conv2_2 = self.conv_layer(self.conv2_1, 'conv2_2')
        self.pool2 = self.pooling_layer(self.conv2_2, 'pool2')
        # stage 3
        self.conv3_1 = self.conv_layer(self.pool2, 'conv3_1')
        self.conv3_2 = self.conv_layer(self.conv3_1, 'conv3_2')
        self.conv3_3 = self.conv_layer(self.conv3_2, 'conv3_3')
        self.pool3 = self.pooling_layer(self.conv3_3, 'pool3')
        # stage 4
        self.conv4_1 = self.conv_layer(self.pool3, 'conv4_1')
        self.conv4_2 = self.conv_layer(self.conv4_1, 'conv4_2')
        self.conv4_3 = self.conv_layer(self.conv4_2, 'conv4_3')
        self.pool4 = self.pooling_layer(self.conv4_3, 'pool4')
        # stage 5
        self.conv5_1 = self.conv_layer(self.pool4, 'conv5_1')
        self.conv5_2 = self.conv_layer(self.conv5_1, 'conv5_2')
        self.conv5_3 = self.conv_layer(self.conv5_2, 'conv5_3')
        self.pool5 = self.pooling_layer(self.conv5_3, 'pool5')
        '''
        # flatten
        self.flatten5 = self.flatten_layer(self.pool5, 'flatten')
        # fully connected
        self.fc6 = self.fc_layer(self.flatten5, 'fc6')
        self.fc7 = self.fc_layer(self.fc6, 'fc7')
        self.fc8 = self.fc_layer(self.fc7, 'fc8', activation=None)  # fc8 has no activation
        self.prob = tf.nn.softmax(self.fc8, name='prob')
        '''
        print('building finished: %4ds' % (time.time() - start_time))
vgg_16_npy_path = './vgg16.npy'
content_img_path = './source_image/content.jpg'  # path of the content image
style_image_path = './source_image/style.png'    # path of the style image
num_steps = 15       # number of style-transfer training steps
learning_rate = 15   # learning rate
lambda_c = 0.1       # coefficient of the content loss
lambda_s = 500       # coefficient of the style loss
output_dir = './run_style_transfer'  # output directory
if not os.path.exists(output_dir):
    os.mkdir(output_dir)

def initial_result(shape, mean, stddev):
    # The result image is a trainable variable initialised with Gaussian noise
    initial = tf.truncated_normal(shape, mean=mean, stddev=stddev)
    return tf.Variable(initial)

def read_img(img_name):
    img = Image.open(img_name)
    np_img = np.array(img)
    np_img = np.asarray([np_img], dtype=np.int32)  # add a batch dimension
    print(np_img.shape)
    return np_img
# Compute a Gram matrix; used for the style loss
def gram_matrix(x):
    """
    x: [1, height, width, channel]
    """
    b, h, w, ch = x.get_shape().as_list()
    features = tf.reshape(x, [b, h * w, ch])
    # adjoint_a=True transposes `features`, so the product has shape
    # [b, ch, ch]; normalise by the number of entries per channel map
    gram = tf.matmul(features, features, adjoint_a=True) / tf.constant(ch * w * h, tf.float32)
    return gram
def get_image_shape(img_name):
    # np.array(PIL image) has shape (height, width, channels)
    img = Image.open(img_name)
    np_img = np.array(img)
    img_height = np_img.shape[0]
    img_width = np_img.shape[1]
    print(np_img.shape)
    return img_height, img_width

content_img_height, content_img_width = get_image_shape(content_img_path)
style_img_height, style_img_width = get_image_shape(style_image_path)

# Initial result image: Gaussian noise with mean 127.5 and stddev 20
result = initial_result((1, content_img_height, content_img_width, 3), 127.5, 20)

# Load the content and style images
content_val = read_img(content_img_path)
style_val = read_img(style_image_path)

content = tf.placeholder(tf.float32, shape=[1, content_img_height, content_img_width, 3])
style = tf.placeholder(tf.float32, shape=[1, style_img_height, style_img_width, 3])
data_dict = np.load(vgg_16_npy_path, encoding='latin1').item()  # on NumPy >= 1.16.3 this also needs allow_pickle=True

# Three copies of the network: one each for the content image, the style
# image and the trainable result image
vgg_for_content = VGGNet(data_dict)
vgg_for_style = VGGNet(data_dict)
vgg_for_result = VGGNet(data_dict)
vgg_for_content.build(content)
vgg_for_style.build(style)
vgg_for_result.build(result)

# Content features of the content image
content_feature = [
    vgg_for_content.conv1_2,
    # vgg_for_content.conv2_2,
    # vgg_for_content.conv3_3,
    # vgg_for_content.conv4_3,
    # vgg_for_content.conv5_3
]
# Content features of the result image
result_content_feature = [
    vgg_for_result.conv1_2,
    # vgg_for_result.conv2_2,
    # vgg_for_result.conv3_3,
    # vgg_for_result.conv4_3,
    # vgg_for_result.conv5_3
]
# Style features of the style image
style_feature = [
    # vgg_for_style.conv1_2,
    # vgg_for_style.conv2_2,
    # vgg_for_style.conv3_3,
    vgg_for_style.conv4_3,
    # vgg_for_style.conv5_3
]
style_gram = [gram_matrix(feature) for feature in style_feature]
# Style features of the result image
result_style_feature = [
    # vgg_for_result.conv1_2,
    # vgg_for_result.conv2_2,
    # vgg_for_result.conv3_3,
    vgg_for_result.conv4_3,
    # vgg_for_result.conv5_3
]
# Gram matrices of the result image, used for the style loss
result_style_gram = [gram_matrix(feature) for feature in result_style_feature]

# Content loss: mean squared difference of the content features
content_loss = tf.zeros(1, tf.float32)
for c, c_ in zip(content_feature, result_content_feature):
    content_loss += tf.reduce_mean((c - c_) ** 2, [1, 2, 3])

# Style loss: mean squared difference of the Gram matrices
style_loss = tf.zeros(1, tf.float32)
for s, s_ in zip(style_gram, result_style_gram):
    style_loss += tf.reduce_mean((s - s_) ** 2, [1, 2])

loss = content_loss * lambda_c + style_loss * lambda_s
train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
init_op = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init_op)
    for step in range(num_steps):
        loss_value, content_loss_value, style_loss_value, _ = sess.run(
            [loss, content_loss, style_loss, train_op],
            feed_dict={content: content_val, style: style_val})
        print('step:%d, loss_value:%8.4f, content_loss:%8.4f, style_loss:%8.4f'
              % (step + 1, loss_value[0], content_loss_value[0], style_loss_value[0]))
        # Save the current result image after every step
        result_img_path = os.path.join(output_dir, 'result-%05d.jpg' % (step + 1))
        result_val = result.eval(session=sess)[0]
        result_val = np.clip(result_val, 0, 255)
        img_arr = np.asarray(result_val, np.uint8)
        img = Image.fromarray(img_arr)  # convert the array back to an image
        img.save(result_img_path)       # save the image
Here is the error output:
---------------------------------------------------------------------------
ResourceExhaustedError Traceback (most recent call last)
E:\anaconda\lib\site-packages\tensorflow\python\client\session.py in _do_call(self, fn, *args)
1291 try:
-> 1292 return fn(*args)
1293 except errors.OpError as e:
E:\anaconda\lib\site-packages\tensorflow\python\client\session.py in _run_fn(feed_dict, fetch_list, target_list, options, run_metadata)
1276 return self._call_tf_sessionrun(
-> 1277 options, feed_dict, fetch_list, target_list, run_metadata)
1278
E:\anaconda\lib\site-packages\tensorflow\python\client\session.py in _call_tf_sessionrun(self, options, feed_dict, fetch_list, target_list, run_metadata)
1366 self._session, options, feed_dict, fetch_list, target_list,
-> 1367 run_metadata)
1368
ResourceExhaustedError: OOM when allocating tensor with shape[1,64,1730,1500] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
[[{{node conv1_2_1/Conv2D}} = Conv2D[T=DT_FLOAT, data_format="NCHW", dilations=[1, 1, 1, 1], padding="SAME", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/device:GPU:0"](conv1_1_1/Relu, conv1_2/conv)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
During handling of the above exception, another exception occurred:
ResourceExhaustedError Traceback (most recent call last)
<ipython-input-5-84df424d3c3a> in <module>
5 sess.run(init_op) # 运行sess
6 for step in range(num_steps): # loss content_loss等为计算目标
----> 7 loss_value, content_loss_value, style_loss_value,_ =sess.run([loss, content_loss, style_loss, train_op],feed_dict={content:content_val,style:style_val})
8 print('step:%d, loss_value:%8.4f,content_value:%8.4f,style_loss:%8.4f' \
9 %(step+1, loss_value[0], content_loss_value[0], style_loss_value[0]))
E:\anaconda\lib\site-packages\tensorflow\python\client\session.py in run(self, fetches, feed_dict, options, run_metadata)
885 try:
886 result = self._run(None, fetches, feed_dict, options_ptr,
--> 887 run_metadata_ptr)
888 if run_metadata:
889 proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)
E:\anaconda\lib\site-packages\tensorflow\python\client\session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
1108 if final_fetches or final_targets or (handle and feed_dict_tensor):
1109 results = self._do_run(handle, final_targets, final_fetches,
-> 1110 feed_dict_tensor, options, run_metadata)
1111 else:
1112 results = []
E:\anaconda\lib\site-packages\tensorflow\python\client\session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
1284 if handle is None:
1285 return self._do_call(_run_fn, feeds, fetches, targets, options,
-> 1286 run_metadata)
1287 else:
1288 return self._do_call(_prun_fn, handle, feeds, fetches)
E:\anaconda\lib\site-packages\tensorflow\python\client\session.py in _do_call(self, fn, *args)
1306 self._config.experimental.client_handles_error_formatting):
1307 message = error_interpolation.interpolate(message, self._graph)
-> 1308 raise type(e)(node_def, op, message)
1309
1310 def _extend_graph(self):
ResourceExhaustedError: OOM when allocating tensor with shape[1,64,1730,1500] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
[[{{node conv1_2_1/Conv2D}} = Conv2D[T=DT_FLOAT, data_format="NCHW", dilations=[1, 1, 1, 1], padding="SAME", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/device:GPU:0"](conv1_1_1/Relu, conv1_2/conv)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
Caused by op 'conv1_2_1/Conv2D', defined at:
File "E:\anaconda\lib\runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "E:\anaconda\lib\runpy.py", line 85, in _run_code
exec(code, run_globals)
File "E:\anaconda\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
app.launch_new_instance()
File "E:\anaconda\lib\site-packages\traitlets\config\application.py", line 658, in launch_instance
app.start()
File "E:\anaconda\lib\site-packages\ipykernel\kernelapp.py", line 505, in start
self.io_loop.start()
File "E:\anaconda\lib\site-packages\tornado\platform\asyncio.py", line 132, in start
self.asyncio_loop.run_forever()
File "E:\anaconda\lib\asyncio\base_events.py", line 528, in run_forever
self._run_once()
File "E:\anaconda\lib\asyncio\base_events.py", line 1764, in _run_once
handle._run()
File "E:\anaconda\lib\asyncio\events.py", line 88, in _run
self._context.run(self._callback, *self._args)
File "E:\anaconda\lib\site-packages\tornado\ioloop.py", line 758, in _run_callback
ret = callback()
File "E:\anaconda\lib\site-packages\tornado\stack_context.py", line 300, in null_wrapper
return fn(*args, **kwargs)
File "E:\anaconda\lib\site-packages\tornado\gen.py", line 1233, in inner
self.run()
File "E:\anaconda\lib\site-packages\tornado\gen.py", line 1147, in run
yielded = self.gen.send(value)
File "E:\anaconda\lib\site-packages\ipykernel\kernelbase.py", line 357, in process_one
yield gen.maybe_future(dispatch(*args))
File "E:\anaconda\lib\site-packages\tornado\gen.py", line 326, in wrapper
yielded = next(result)
File "E:\anaconda\lib\site-packages\ipykernel\kernelbase.py", line 267, in dispatch_shell
yield gen.maybe_future(handler(stream, idents, msg))
File "E:\anaconda\lib\site-packages\tornado\gen.py", line 326, in wrapper
yielded = next(result)
File "E:\anaconda\lib\site-packages\ipykernel\kernelbase.py", line 534, in execute_request
user_expressions, allow_stdin,
File "E:\anaconda\lib\site-packages\tornado\gen.py", line 326, in wrapper
yielded = next(result)
File "E:\anaconda\lib\site-packages\ipykernel\ipkernel.py", line 294, in do_execute
res = shell.run_cell(code, store_history=store_history, silent=silent)
File "E:\anaconda\lib\site-packages\ipykernel\zmqshell.py", line 536, in run_cell
return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
File "E:\anaconda\lib\site-packages\IPython\core\interactiveshell.py", line 2819, in run_cell
raw_cell, store_history, silent, shell_futures)
File "E:\anaconda\lib\site-packages\IPython\core\interactiveshell.py", line 2845, in _run_cell
return runner(coro)
File "E:\anaconda\lib\site-packages\IPython\core\async_helpers.py", line 67, in _pseudo_sync_runner
coro.send(None)
File "E:\anaconda\lib\site-packages\IPython\core\interactiveshell.py", line 3020, in run_cell_async
interactivity=interactivity, compiler=compiler, result=result)
File "E:\anaconda\lib\site-packages\IPython\core\interactiveshell.py", line 3185, in run_ast_nodes
if (yield from self.run_code(code, result)):
File "E:\anaconda\lib\site-packages\IPython\core\interactiveshell.py", line 3267, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-4-c6ef382594f4>", line 58, in <module>
vgg_for_style.build(style)
File "<ipython-input-2-717411ff30c8>", line 70, in build
self.conv1_2 = self.conv_layer(self.conv1_1, 'conv1_2')
File "<ipython-input-2-717411ff30c8>", line 23, in conv_layer
h = tf.nn.conv2d(x, conv_w, [1,1,1,1],padding='SAME') # [1,1,1,1]在各个维度上卷积的步长
File "E:\anaconda\lib\site-packages\tensorflow\python\ops\gen_nn_ops.py", line 1044, in conv2d
data_format=data_format, dilations=dilations, name=name)
File "E:\anaconda\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper
op_def=op_def)
File "E:\anaconda\lib\site-packages\tensorflow\python\util\deprecation.py", line 488, in new_func
return func(*args, **kwargs)
File "E:\anaconda\lib\site-packages\tensorflow\python\framework\ops.py", line 3272, in create_op
op_def=op_def)
File "E:\anaconda\lib\site-packages\tensorflow\python\framework\ops.py", line 1768, in __init__
self._traceback = tf_stack.extract_stack()
ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[1,64,1730,1500] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
[[{{node conv1_2_1/Conv2D}} = Conv2D[T=DT_FLOAT, data_format="NCHW", dilations=[1, 1, 1, 1], padding="SAME", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/device:GPU:0"](conv1_1_1/Relu, conv1_2/conv)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
1 Answer
-
Hi, this is caused by the images you are using being too large. On a single GPU the only fix is to reduce the image size. Otherwise you could consider splitting the model across GPUs; look up how tf.device is used.
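As a rough sanity check on the numbers: the tensor named in the traceback, shape [1, 64, 1730, 1500] in float32, already takes 64 × 1730 × 1500 × 4 bytes ≈ 664 MB, and the script builds three VGG graphs (content, style, result), each holding a dozen or so feature maps of similar scale, plus the gradient buffers Adam keeps for the result image, so 8 GB fills up quickly.

A minimal sketch of the resizing approach, assuming you cap the longer edge at 512 px (the helper name read_img_resized and the max_side value are illustrative choices, not part of the course code); convert('RGB') also guards against a 4-channel PNG style image:

from PIL import Image
import numpy as np

def read_img_resized(img_name, max_side=512):
    # Shrink the image so its longer edge is at most max_side pixels;
    # lower max_side further if OOM persists.
    img = Image.open(img_name).convert('RGB')
    w, h = img.size
    scale = max_side / max(w, h)
    if scale < 1.0:
        img = img.resize((int(w * scale), int(h * scale)), Image.LANCZOS)
    np_img = np.asarray(img, dtype=np.float32)
    return np_img[np.newaxis, ...]  # shape [1, height, width, 3]

If you load images this way, the placeholder shapes and the initial result variable must be derived from the resized arrays (e.g. from content_val.shape) so all shapes stay consistent.

If you do have a second GPU, tf.device can pin parts of the graph to different devices. A rough sketch, replacing the three build(...) calls in the script above and assuming two visible GPUs; which split balances memory best depends on your machine:

with tf.device('/gpu:0'):
    vgg_for_content.build(content)  # content and result branches on GPU 0
    vgg_for_result.build(result)
with tf.device('/gpu:1'):
    vgg_for_style.build(style)      # style branch on GPU 1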
2020-04-16