TensorFlow 2.x版训练脚本时间很长,能帮忙看看哪里有问题么?
来源:5-5 Train部分代码编写

weixin_慕用9360114
2024-12-18
import tensorflow as tf
from tensorflow.python import keras
from keras import layers, models, optimizers, callbacks
import readcifar10
import os
定义模型
def model(input_shape=(32, 32, 3), num_classes=10):
    """Build a small VGG-style CNN for CIFAR-10 classification.

    Args:
        input_shape: Shape of one input image (H, W, C); defaults to CIFAR-10's 32x32x3.
        num_classes: Number of output classes (defaults to 10 for CIFAR-10).

    Returns:
        An uncompiled tf.keras.Model whose final Dense layer outputs raw
        logits (no softmax) — pair it with a from_logits=True loss.
    """
    inputs = tf.keras.Input(shape=input_shape)
    # Block 1: three 3x3 convs at 32 filters, then 2x2 downsampling.
    x = layers.Conv2D(32, (3, 3), padding='same', activation='relu')(inputs)
    x = layers.Conv2D(32, (3, 3), padding='same', activation='relu')(x)
    x = layers.Conv2D(32, (3, 3), padding='same', activation='relu')(x)
    x = layers.MaxPooling2D(pool_size=(2, 2))(x)
    # Block 2: two 3x3 convs at 64 filters.
    x = layers.Conv2D(64, (3, 3), padding='same', activation='relu')(x)
    x = layers.Conv2D(64, (3, 3), padding='same', activation='relu')(x)
    x = layers.MaxPooling2D(pool_size=(2, 2))(x)
    # Block 3: two 3x3 convs at 128 filters.
    x = layers.Conv2D(128, (3, 3), padding='same', activation='relu')(x)
    x = layers.Conv2D(128, (3, 3), padding='same', activation='relu')(x)
    x = layers.MaxPooling2D(pool_size=(2, 2))(x)
    # Global average pooling avoids a large Flatten->Dense parameter count.
    x = layers.GlobalAveragePooling2D()(x)
    x = layers.Dense(1024, activation='relu')(x)
    x = layers.Dropout(0.2)(x)  # drop 20% of activations during training
    outputs = layers.Dense(num_classes)(x)  # logits; no activation on purpose
    return models.Model(inputs=inputs, outputs=outputs)
定义损失函数
def loss_fn(y_true, y_pred):
    """Sparse categorical cross-entropy on logits plus model regularization losses.

    Relies on a module-level ``model`` being bound to the built Keras model
    before the first call (``train`` does this via ``global model``).

    NOTE(review): Keras ``Model.fit`` already adds ``model.losses`` to the
    compiled loss automatically, so summing them here may double-count
    regularization — confirm before relying on the absolute loss value.

    Args:
        y_true: Integer class labels.
        y_pred: Raw logits from the network.

    Returns:
        Scalar total loss: data term + sum of the model's regularization losses.
    """
    sce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    data_loss = sce(y_true, y_pred)
    reg_loss = tf.reduce_sum(model.losses)
    return data_loss + reg_loss
定义优化器
def get_optimizer(batch_size):
    """Create an Adam optimizer with a smoothly decaying learning rate.

    The learning rate starts at 0.01 and decays by a factor of 0.95 every
    epoch's worth of steps (50000 training images / batch_size), with
    continuous (non-staircase) decay between steps.

    Args:
        batch_size: Training batch size, used to convert "one epoch" into
            an optimizer step count for the decay schedule.

    Returns:
        A tf.keras.optimizers.Adam instance driving the schedule.
    """
    steps_per_epoch = 50000 // batch_size
    schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=0.01,
        decay_steps=steps_per_epoch,
        decay_rate=0.95,
        staircase=False,
    )
    return tf.keras.optimizers.Adam(learning_rate=schedule)
训练函数
def train():
    """Train the CIFAR-10 model with TensorBoard logging and checkpointing.

    Side effects: creates ``logdirs-resnet`` and ``model-resnet`` directories,
    writes TensorBoard event files and ``.keras`` checkpoints, and rebinds the
    module-level name ``model`` to the built model instance.
    """
    batch_size = 64
    log_dir = 'logdirs-resnet'
    model_dir = 'model-resnet'
    # exist_ok avoids the check-then-create race of exists()+makedirs().
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(model_dir, exist_ok=True)

    # Data: readcifar10.read(batch_size, train_flag, test_flag) — presumably
    # returns a tf.data.Dataset; verify against readcifar10's definition.
    tr_dataset = readcifar10.read(batch_size, 0, 1)
    te_dataset = readcifar10.read(batch_size, 1, 0)

    # Rebind the module-level name so loss_fn can reach the model instance.
    # WARNING: this shadows the model() builder function — after this line the
    # builder is no longer callable. It works here only because we build once.
    global model
    model = model()
    model.compile(optimizer=get_optimizer(batch_size),
                  loss=loss_fn,
                  metrics=['accuracy'])

    # Callbacks: TensorBoard histograms every epoch; keep only the checkpoint
    # with the best (lowest) validation loss.
    tensorboard_callback = callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
    checkpoint_callback = callbacks.ModelCheckpoint(
        filepath=os.path.join(model_dir, "model.{epoch:02d}-{val_loss:.2f}.keras"),
        save_best_only=True,
        monitor='val_loss',
        mode='min',
    )

    model.fit(tr_dataset,
              epochs=100,
              validation_data=te_dataset,
              callbacks=[tensorboard_callback, checkpoint_callback])


if __name__ == '__main__':
    train()
1回答
-
会写代码的好厨师
2024-12-25
可以从这两个角度分析下：1. 是不是换成 1.13 版本就可以；2. 看下是不是 CUDA 没有启动成功，这种一般是 TensorFlow 版本和 CUDA 版本没对齐，或者是 CUDA 版本不支持你的机器。
00
相似问题