TensorFlow 2.x 版训练脚本运行时间很长,能帮忙看看哪里有问题吗?

来源:5-5 Train部分代码编写

weixin_慕用9360114

2024-12-18

import tensorflow as tf
from tensorflow.python import keras
from keras import layers, models, optimizers, callbacks
import readcifar10
import os

# 定义模型

def model(input_shape=(32, 32, 3), num_classes=10):
    """Build a small VGG-style CNN classifier with the Keras functional API.

    The network has three convolutional stages (3 x 32, 2 x 64 and 2 x 128
    filters; 3x3 kernels, 'same' padding, ReLU), each followed by 2x2 max
    pooling, then global average pooling, a 1024-unit ReLU layer with
    dropout, and a final linear layer.

    Args:
        input_shape: Shape of a single input sample, without the batch axis.
        num_classes: Number of units in the output layer.

    Returns:
        An uncompiled Keras ``Model``. The output layer has no activation,
        so the model emits raw logits.
    """
    # (filters, number of stacked conv layers) for each stage.
    stages = ((32, 3), (64, 2), (128, 2))

    inputs = tf.keras.Input(shape=input_shape)
    x = inputs
    for filters, depth in stages:
        for _ in range(depth):
            x = layers.Conv2D(filters, (3, 3), padding='same', activation='relu')(x)
        x = layers.MaxPooling2D(pool_size=(2, 2))(x)

    # NOTE: the author left an extra 256-filter conv layer disabled here:
    # x = layers.Conv2D(256, (3, 3), padding='same', activation='relu')(x)
    x = layers.GlobalAveragePooling2D()(x)
    x = layers.Dense(1024, activation='relu')(x)
    x = layers.Dropout(0.2)(x)  # dropout rate 0.2
    outputs = layers.Dense(num_classes)(x)

    # Use a distinct local name so the function's own name is not shadowed.
    net = models.Model(inputs=inputs, outputs=outputs)
    return net

# 定义损失函数

def loss_fn(y_true, y_pred):
    """Return the sparse categorical cross-entropy of logits against labels.

    Regularization penalties are deliberately not added here. Keras already
    sums ``model.losses`` into the training objective inside ``fit()``, so
    adding ``tf.reduce_sum(model.losses)`` in the loss function would count
    any regularizer twice. It would also tie this function to a module-level
    global named ``model``.

    Args:
        y_true: Integer class labels.
        y_pred: Raw, un-normalized logits (``from_logits=True``).

    Returns:
        The cross-entropy loss tensor.
    """
    cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    return cross_entropy(y_true, y_pred)

# 定义优化器

def get_optimizer(batch_size):
    """Create an Adam optimizer with an exponentially decaying learning rate.

    The learning rate starts at 0.01 and is multiplied by 0.95 every
    ``50000 // batch_size`` optimizer steps, decaying continuously
    (``staircase=False``).

    Args:
        batch_size: Number of samples per training step.

    Returns:
        A ``tf.keras.optimizers.Adam`` instance driven by the schedule.
    """
    # Presumably 50000 is the CIFAR-10 training-set size, making this one
    # epoch's worth of steps -- TODO confirm against the data loader.
    steps_per_decay = 50000 // batch_size
    schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=0.01,
        decay_steps=steps_per_decay,
        decay_rate=0.95,
        staircase=False,
    )
    return tf.keras.optimizers.Adam(learning_rate=schedule)

# 训练函数

def train():
    """Build, compile and fit the classifier, logging to TensorBoard.

    Creates the log and checkpoint directories if needed, loads the training
    and evaluation datasets through ``readcifar10.read``, trains for 100
    epochs with a batch size of 64, and saves a checkpoint whenever the
    validation loss improves.
    """
    # Rebinding the module-level name `model` keeps the built network
    # reachable to code that reads it as a global. NOTE: this replaces the
    # builder function, so train() can only be called once per process.
    global model

    batch_size = 64
    log_dir = 'logdirs-resnet'
    model_dir = 'model-resnet'

    # exist_ok avoids the check-then-create race of os.path.exists().
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(model_dir, exist_ok=True)

    # Data loading. The meaning of the last two arguments is defined by
    # readcifar10.read; judging by the variable names the first call yields
    # the training set and the second the test set -- TODO confirm.
    tr_dataset = readcifar10.read(batch_size, 0, 1)
    te_dataset = readcifar10.read(batch_size, 1, 0)

    # Model definition
    model = model()
    model.compile(optimizer=get_optimizer(batch_size),
                  loss=loss_fn,
                  metrics=['accuracy'])

    # Callbacks. histogram_freq=1 writes weight histograms every epoch,
    # which adds some per-epoch overhead.
    tensorboard_callback = callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
    checkpoint_callback = callbacks.ModelCheckpoint(
        filepath=os.path.join(model_dir, "model.{epoch:02d}-{val_loss:.2f}.keras"),
        save_best_only=True,
        monitor='val_loss',
        mode='min'
    )

    # Train the model
    model.fit(tr_dataset,
              epochs=100,
              validation_data=te_dataset,
              callbacks=[tensorboard_callback, checkpoint_callback])

# Run training only when this file is executed as a script, not on import.
if __name__ == '__main__':
    train()

写回答

1回答

会写代码的好厨师

2024-12-25

可以从这两个角度分析一下:1. 换成 1.13 版本后是否就正常;2. 看下是不是 CUDA 没有启动成功——这种情况一般是 TensorFlow 版本和 CUDA 版本没对齐,或者是 CUDA 版本不支持你的机器。

0
0

Python3+TensorFlow打造人脸识别智能小程序

理论与实战项目双管齐下,让AI技术真正落地应用,适合毕设展示。

1086 学习 · 538 问题

查看课程