The Effects of the Learning Rate on Model Performance

5 min readSep 20, 2020

Parameter and Hyperparameter

Parameter จะเป็นตัวแปร (Variable) ที่อยู่ภายใน Model

Hyperparameter จะเป็นตัวแปรภายนอก Model เช่น Learning Rate, Dropout Rate, Batch Size ฯลฯ ซึ่งไม่ได้เกิดจากการประมาณค่าโดยตรงจาก Dataset

Learning Rate

Learning Rate คือ Hyperparameter ที่สำคัญที่มีหน้าที่ในการปรับขนาดของ Error ในแต่ละครั้งของการปรับปรุง Weight และ Bias โดยถ้าค่า Learning Rate น้อย ค่า Weight ก็จะเปลี่ยนแปลงน้อย Model จะไม่ค่อยเปลี่ยนแปลง แต่ถ้า Learning Rate มาก ค่า Weight ก็จะเปลี่ยนแปลงมากเช่นกัน Model ก็จะมีการเปลี่ยนแปลงมากขึ้น ดังนั้น Learning Rate จึงมีผลกระทบกับประสิทธิภาพของ Model มากที่สุด

บทความนี้จะได้ทำความเข้าใจผลลัพธ์ที่เกิดจากการปรับเปลี่ยนค่า Learning Rate ด้วยวิธีการต่างๆ

Impact of Learning Rate

Import Library ที่ต้องการจะใช้

from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Flatten
from tensorflow.keras.datasets import mnist
from sklearn.datasets import make_blobs
from matplotlib import pyplot
from numpy import where
from sklearn.model_selection import train_test_split
import pandas as pd
import plotly.express as px
import tensorflow as tf
import numpy as np

โหลด Data MNIST เข้ามาและแบ่ง Dataset เป็น 2 ส่วน สำหรับการ Train และ Test

# Load the MNIST train/test split through the Keras dataset helper.
(X_train, y_train), (X_test, y_test) = mnist.load_data()
# Collapse every image into a single row vector, keeping one row per sample.
X_train = X_train.reshape(X_train.shape[0], -1)
X_test = X_test.reshape(X_test.shape[0], -1)
# Notebook-style expression: displays the four array shapes as a sanity check.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

นำ Dataset ส่วนที่ Train มาแปลงเป็น DataFrame โดยเปลี่ยนชนิดข้อมูลใน Column “class” เป็น String

# Build one DataFrame holding the training features side by side with a
# "class" label column, then store the labels as strings.
X_train_pd = pd.DataFrame(X_train)
y_train_pd = pd.DataFrame({'class': y_train})
df = pd.concat((X_train_pd, y_train_pd), axis='columns')
df = df.astype({'class': str})

เราจะทำการเข้ารหัสผลเฉลย แบบ One-Hot Encoder เมื่อ Model มีการทำนาย ว่าเป็น Class ไหนก็จะส่งค่าความมั่นใจ (Confidence) กลับมา

# One-hot encode the label arrays of both splits with Keras' to_categorical.
y_train, y_test = (to_categorical(labels) for labels in (y_train, y_test))

นิยาม Model, Compile Model และ Plot Accuracy

def fit_model(trainX, trainy, testX, testy, lrate):
    """Train a small dense classifier with plain SGD and plot its accuracy.

    The network is Flatten -> Dense(50, relu) -> Dense(10, softmax). It is
    trained for 200 epochs and the per-epoch training and validation accuracy
    are drawn with ``pyplot.plot`` (the caller selects the subplot beforehand).

    Args:
        trainX: Training inputs, one sample per row along the first axis.
        trainy: Training targets in the form expected by the
            categorical cross-entropy loss (10 outputs).
        testX: Validation inputs, shaped like ``trainX``.
        testy: Validation targets, shaped like ``trainy``.
        lrate: Learning rate handed to the SGD optimizer.

    Returns:
        None. The function only draws onto the active matplotlib figure.
    """
    model = Sequential()
    # Take the per-sample input shape from the data instead of hard-coding
    # (28, 28): the arrays passed in by this file are already flattened to
    # 784 columns, which would not match a fixed (28, 28) declaration.
    model.add(Flatten(input_shape=np.shape(trainX)[1:]))
    model.add(Dense(units=50, activation='relu', kernel_initializer='he_uniform'))
    model.add(Dense(units=10, activation='softmax'))

    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    opt = SGD(learning_rate=lrate)
    model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
    history = model.fit(trainX, trainy, validation_data=(testX, testy), epochs=200, verbose=0)
    pyplot.plot(history.history['accuracy'], label='train')
    pyplot.plot(history.history['val_accuracy'], label='test')
    # Negative pad pulls the title down inside the axes area.
    pyplot.title('lr=' + str(lrate), pad=-35)
# Sweep the learning rate over eight powers of ten, one subplot per value
# arranged on a 4-row by 2-column grid.
learning_rates = [1E-0, 1E-1, 1E-2, 1E-3, 1E-4, 1E-5, 1E-6, 1E-7]
for i, rate in enumerate(learning_rates):
    pyplot.subplot(4, 2, i + 1)
    fit_model(X_train, y_train, X_test, y_test, rate)
pyplot.tight_layout()
pyplot.savefig('lr1.jpeg', dpi=500)
pyplot.show()

Momentum(β)

เป็นเทคนิคในการลดการแกว่งของ Learning Curves พร้อมกับเร่งอัตราการเรียนรู้ของ Model ให้เร็วขึ้น โดยใช้ Velocity (ความเร็ว) ของรอบก่อนหน้า และ Error ในรอบปัจจุบันเพื่อปรับปรุง Weight และ Bias ด้วยค่าน้ำหนักตามที่กำหนด ซึ่งโดยปกติจะมีการให้น้ำหนัก Velocity ในรอบก่อนหน้ามากกว่า Error ในรอบปัจจุบัน

def fit_model(trainX, trainy, testX, testy, momentum):
    """Train the dense classifier with SGD + momentum and plot its accuracy.

    The network is Flatten -> Dense(50, relu) -> Dense(10, softmax), trained
    for 200 epochs with a fixed learning rate of 0.01. Per-epoch training and
    validation accuracy are drawn with ``pyplot.plot`` (the caller selects the
    subplot beforehand).

    Args:
        trainX: Training inputs, one sample per row along the first axis.
        trainy: Training targets in the form expected by the
            categorical cross-entropy loss (10 outputs).
        testX: Validation inputs, shaped like ``trainX``.
        testy: Validation targets, shaped like ``trainy``.
        momentum: Momentum coefficient handed to the SGD optimizer.

    Returns:
        None. The function only draws onto the active matplotlib figure.
    """
    model = Sequential()
    # Take the per-sample input shape from the data instead of hard-coding
    # (28, 28): the arrays passed in by this file are already flattened to
    # 784 columns, which would not match a fixed (28, 28) declaration.
    model.add(Flatten(input_shape=np.shape(trainX)[1:]))
    model.add(Dense(units=50, activation='relu', kernel_initializer='he_uniform'))
    model.add(Dense(units=10, activation='softmax'))
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    opt = SGD(learning_rate=0.01, momentum=momentum)
    model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
    history = model.fit(trainX, trainy, validation_data=(testX, testy), epochs=200, verbose=0)
    pyplot.plot(history.history['accuracy'], label='train')
    pyplot.plot(history.history['val_accuracy'], label='test')
    # Negative pad pulls the title down inside the axes area.
    pyplot.title('momentum=' + str(momentum), pad=-80)
 
# Compare four momentum settings, one subplot each on a 2x2 grid.
momentums = [0.0, 0.5, 0.9, 0.99]
for i, beta in enumerate(momentums):
    pyplot.subplot(2, 2, i + 1)
    fit_model(X_train, y_train, X_test, y_test, beta)
pyplot.tight_layout()
pyplot.savefig('momentum.jpeg', dpi=300)
pyplot.show()

Learning Rate Decay

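คือ การค่อย ๆ ลด Learning Rate ระหว่างการ Train ในอัตราที่เหมาะสม (สำหรับ argument decay ของ Keras ค่า Learning Rate จะถูกคำนวณใหม่ทุกครั้งที่มีการปรับปรุง Weight ไม่ใช่เพียงครั้งเดียวต่อ Epoch)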

def decay_lrate(initial_lrate, decay, iteration):
    """Return the inverse-time-decayed learning rate at a given iteration.

    Computes ``initial_lrate * 1 / (1 + decay * iteration)``.

    Args:
        initial_lrate: Learning rate at iteration 0.
        decay: Decay coefficient; 0 leaves the rate unchanged.
        iteration: Number of updates performed so far.

    Returns:
        The scaled learning rate.
    """
    shrink_factor = 1.0 / (1.0 + decay * iteration)
    return initial_lrate * shrink_factor
 
# Plot how the learning rate shrinks over 200 updates for several decay
# coefficients, starting from the same initial rate.
decays = [1E-1, 1E-2, 1E-3, 1E-4]
lrate = 0.01
n_updates = 200
for decay in decays:
    lrates = []
    for step in range(n_updates):
        lrates.append(decay_lrate(lrate, decay, step))
    pyplot.plot(lrates, label=str(decay))
pyplot.legend()
pyplot.savefig('decay.jpeg', dpi=300)
pyplot.show()

นิยาม Model, กำหนด lr = 0.01, Compile Model และ Plot Accuracy

def fit_model(trainX, trainy, testX, testy, decay):
    """Train the dense classifier with a decaying SGD learning rate and plot accuracy.

    The network is Flatten -> Dense(50, relu) -> Dense(10, softmax), trained
    for 200 epochs. The learning rate starts at 0.01 and follows
    ``0.01 / (1 + decay * step)``. Per-epoch training and validation accuracy
    are drawn with ``pyplot.plot`` (the caller selects the subplot beforehand).

    Args:
        trainX: Training inputs, one sample per row along the first axis.
        trainy: Training targets in the form expected by the
            categorical cross-entropy loss (10 outputs).
        testX: Validation inputs, shaped like ``trainX``.
        testy: Validation targets, shaped like ``trainy``.
        decay: Inverse-time decay coefficient.

    Returns:
        None. The function only draws onto the active matplotlib figure.
    """
    model = Sequential()
    # Take the per-sample input shape from the data instead of hard-coding
    # (28, 28): the arrays passed in by this file are already flattened to
    # 784 columns, which would not match a fixed (28, 28) declaration.
    model.add(Flatten(input_shape=np.shape(trainX)[1:]))
    model.add(Dense(units=50, activation='relu', kernel_initializer='he_uniform'))
    model.add(Dense(units=10, activation='softmax'))

    # The optimizer-level `decay` keyword is not accepted by newer Keras
    # optimizers; an InverseTimeDecay schedule with decay_steps=1 expresses
    # the same formula, lr = 0.01 / (1 + decay * step).
    schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
        initial_learning_rate=0.01, decay_steps=1, decay_rate=decay)
    opt = SGD(learning_rate=schedule)
    model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

    history = model.fit(trainX, trainy, validation_data=(testX, testy), epochs=200, verbose=0)
    pyplot.plot(history.history['accuracy'], label='train')
    pyplot.plot(history.history['val_accuracy'], label='test')
    # Negative pad pulls the title down inside the axes area.
    pyplot.title('decay=' + str(decay), pad=-80)


# Compare four decay coefficients, one subplot each on a 2x2 grid.
decay_rates = [1E-1, 1E-2, 1E-3, 1E-4]
for i, decay_rate in enumerate(decay_rates):
    pyplot.subplot(220 + (i + 1))
    fit_model(X_train, y_train, X_test, y_test, decay_rate)
pyplot.legend()
pyplot.savefig('decay2.jpeg', dpi=300)
pyplot.show()

Drop Learning Rate on Plateau

เมื่อ Loss Value ไม่ลดลงเป็นระยะเวลาหนึ่ง เรียกสถานการณ์นี้ว่า “การเจอที่ราบสูง (Plateau)” เราอาจจะใช้เทคนิคการปรับลดค่า Learning Rate ด้วยการคูณกับค่า factor โดยจำนวน Epoch ที่ยอมรอให้ Loss ไม่ดีขึ้นก่อนที่จะปรับลด Learning Rate เรียกว่า “patience”

from tensorflow.keras.callbacks import Callback
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras import backend

นิยาม Model

class LearningRateMonitor(Callback):
    """Keras callback that records the optimizer's learning rate after each epoch.

    Once training has run, ``lrates`` holds one float per completed epoch.
    """

    def on_train_begin(self, logs=None):
        """Reset the recorded history at the start of a training run."""
        # `logs` defaults to None rather than a shared mutable dict.
        self.lrates = []

    def on_epoch_end(self, epoch, logs=None):
        """Append the optimizer's current learning rate to ``lrates``."""
        # `learning_rate` is the canonical attribute; `lr` is only an alias.
        lrate = float(backend.get_value(self.model.optimizer.learning_rate))
        self.lrates.append(lrate)
 
def fit_model(trainX, trainy, testX, testy, patience):
    """Train the dense classifier with ReduceLROnPlateau and return its histories.

    The network is Flatten -> Dense(50, relu) -> Dense(10, softmax), trained
    for 200 epochs with SGD starting at a learning rate of 0.01. The
    ReduceLROnPlateau callback monitors ``val_loss`` and multiplies the
    learning rate by 0.1 after ``patience`` epochs without improvement.

    Args:
        trainX: Training inputs, one sample per row along the first axis.
        trainy: Training targets in the form expected by the
            categorical cross-entropy loss (10 outputs).
        testX: Validation inputs, shaped like ``trainX``.
        testy: Validation targets, shaped like ``trainy``.
        patience: ``patience`` argument forwarded to ReduceLROnPlateau.

    Returns:
        A tuple ``(lrates, loss, accuracy)`` of per-epoch lists: the learning
        rate recorded by LearningRateMonitor, the training loss and the
        training accuracy.
    """
    model = Sequential()
    # Take the per-sample input shape from the data instead of hard-coding
    # (28, 28): the arrays passed in by this file are already flattened to
    # 784 columns, which would not match a fixed (28, 28) declaration.
    model.add(Flatten(input_shape=np.shape(trainX)[1:]))
    model.add(Dense(units=50, activation='relu', kernel_initializer='he_uniform'))
    model.add(Dense(units=10, activation='softmax'))
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    opt = SGD(learning_rate=0.01)
    model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
    rlrp = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=patience, min_delta=1E-7)
    lrm = LearningRateMonitor()
    history = model.fit(trainX, trainy, validation_data=(testX, testy), epochs=200, verbose=0,
                        callbacks=[rlrp, lrm])
    return lrm.lrates, history.history['loss'], history.history['accuracy']


def line_plots(patiences, series, st):
    """Plot one series per patience value on a 2x2 grid and save the figure.

    Args:
        patiences: Patience values, used for the subplot titles.
        series: Sequence of per-epoch value lists, parallel to ``patiences``.
        st: Short name of the plotted quantity (e.g. 'lr', 'loss', 'acc').
            It labels the curves and is part of the output file name.

    Returns:
        None. Writes ``patience_<st>.jpeg`` and shows the figure.
    """
    for i, (patience, values) in enumerate(zip(patiences, series)):
        pyplot.subplot(220 + (i + 1))
        # Label the curve so the legend() call below has an entry to show.
        pyplot.plot(values, label=st)
        pyplot.title('patience=' + str(patience), pad=-80)
    pyplot.legend()
    # Include `st` in the file name; a fixed name made successive calls
    # overwrite each other's output.
    pyplot.savefig(f'patience_{st}.jpeg', dpi=300)
    pyplot.show()
 
# Train once per patience value and collect the learning-rate, loss and
# accuracy histories, then draw one figure per quantity.
patiences = [2, 5, 10, 15]
lr_list, loss_list, acc_list = [], [], []
for patience_value in patiences:
    lr, loss, acc = fit_model(X_train, y_train, X_test, y_test, patience_value)
    lr_list += [lr]
    loss_list += [loss]
    acc_list += [acc]
for collected, tag in ((lr_list, 'lr'), (loss_list, 'loss'), (acc_list, 'acc')):
    line_plots(patiences, collected, tag)

Adaptive Learning Rates Gradient Descent

เปรียบเทียบ Adaptive Learning Rate Algorithm ได้แก่ AdaGrad (Adaptive Gradient Algorithm), RMSprop (Root Mean Square Propagation) และ Adam (Adaptive Moment Estimation) กับ Optimizer พื้นฐาน (SGD Optimizer)

def fit_model(trainX, trainy, testX, testy, optimizer):
    """Train the dense classifier with the given optimizer and plot its accuracy.

    The network is Flatten -> Dense(50, relu) -> Dense(10, softmax), trained
    for 200 epochs. Per-epoch training and validation accuracy are drawn with
    ``pyplot.plot`` (the caller selects the subplot beforehand).

    Args:
        trainX: Training inputs, one sample per row along the first axis.
        trainy: Training targets in the form expected by the
            categorical cross-entropy loss (10 outputs).
        testX: Validation inputs, shaped like ``trainX``.
        testy: Validation targets, shaped like ``trainy``.
        optimizer: Optimizer passed straight to ``model.compile``; the driver
            below passes Keras optimizer names such as 'sgd' or 'adam'.

    Returns:
        None. The function only draws onto the active matplotlib figure.
    """
    model = Sequential()
    # Take the per-sample input shape from the data instead of hard-coding
    # (28, 28): the arrays passed in by this file are already flattened to
    # 784 columns, which would not match a fixed (28, 28) declaration.
    model.add(Flatten(input_shape=np.shape(trainX)[1:]))
    model.add(Dense(units=50, activation='relu', kernel_initializer='he_uniform'))
    model.add(Dense(units=10, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    history = model.fit(trainX, trainy, validation_data=(testX, testy), epochs=200, verbose=0)
    pyplot.plot(history.history['accuracy'], label='train')
    pyplot.plot(history.history['val_accuracy'], label='test')
    # Formatting (rather than '+') keeps the title working for non-string optimizers.
    pyplot.title(f'opt={optimizer}', pad=-80)


# Compare plain SGD with three adaptive optimizers, one subplot each on a 2x2 grid.
optimizer_names = ['sgd', 'rmsprop', 'adagrad', 'adam']
for i, optimizer_name in enumerate(optimizer_names):
    pyplot.subplot(220 + (i + 1))
    fit_model(X_train, y_train, X_test, y_test, optimizer_name)
pyplot.legend()
pyplot.savefig('adaptive.jpeg', dpi=300)
pyplot.show()

สำหรับการปรับค่า Learning Rate ด้วยวิธีการต่างๆ หากผิดพลาดประการใด ขออภัยมา ณ ที่นี้ ขอบคุณครับ

The Effects of the Learning Rate on Model Performance

Sign up to discover human stories that deepen your understanding of the world.

Free

Membership

Written by Phromma P

No responses yet