Model Optimization Study Notes: Comparing Gradient Descent Algorithms

import math

import numpy as np
import matplotlib.pyplot as plt

from opt_utils import *
# Standard (batch) gradient descent: every parameter takes a plain step
# against its gradient.
def update_parameters_with_gd(parameters, grads, learning_rate):
    L = len(parameters) // 2  # number of layers in the network
    for l in range(1, L + 1):
        parameters[f"W{l}"] = parameters[f"W{l}"] - learning_rate * grads[f"dW{l}"]
        parameters[f"b{l}"] = parameters[f"b{l}"] - learning_rate * grads[f"db{l}"]
    return parameters
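For reference, the update applied above is the plain gradient step for every layer $l$:

$$W^{[l]} := W^{[l]} - \alpha\, dW^{[l]}, \qquad b^{[l]} := b^{[l]} - \alpha\, db^{[l]}$$

where $\alpha$ is the learning rate.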
# Mini-batch gradient descent: shuffle the training set, then split it into
# batches of mini_batch_size (plus one smaller remainder batch if needed).
def random_mini_batches(X, Y, mini_batch_size=64, seed=0):
    np.random.seed(seed)
    m = X.shape[1]  # number of training examples
    mini_batches = []

    # Shuffle X and Y with the same permutation so labels stay aligned
    permutation = list(np.random.permutation(m))
    shuffled_X = X[:, permutation]
    shuffled_Y = Y[:, permutation].reshape((1, m))

    # Partition into complete mini-batches
    num_complete_minibatches = math.floor(m / mini_batch_size)
    for k in range(0, num_complete_minibatches):
        mini_batch_X = shuffled_X[:, k * mini_batch_size:(k + 1) * mini_batch_size]
        mini_batch_Y = shuffled_Y[:, k * mini_batch_size:(k + 1) * mini_batch_size]
        mini_batches.append((mini_batch_X, mini_batch_Y))

    # Handle the last, smaller batch when m is not a multiple of mini_batch_size
    if m % mini_batch_size != 0:
        mini_batch_X = shuffled_X[:, num_complete_minibatches * mini_batch_size:]
        mini_batch_Y = shuffled_Y[:, num_complete_minibatches * mini_batch_size:]
        mini_batches.append((mini_batch_X, mini_batch_Y))

    return mini_batches
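A quick sanity check of the batch split (a minimal sketch; X_demo and Y_demo are made-up arrays using the (features, m) layout this note uses everywhere):

X_demo = np.random.randn(2, 148)                     # 148 examples with 2 features each
Y_demo = (np.random.rand(1, 148) > 0.5).astype(int)  # binary labels, shape (1, 148)
batches = random_mini_batches(X_demo, Y_demo, mini_batch_size=64)
print([mb_X.shape[1] for mb_X, mb_Y in batches])     # [64, 64, 20]: two full batches plus the remainder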
# Momentum gradient descent
def initialize_velocity(parameters):
    L = len(parameters) // 2
    v = {}  # velocity: exponentially weighted average of past gradients
    for l in range(1, L + 1):
        v[f"dW{l}"] = np.zeros_like(parameters[f"W{l}"])
        v[f"db{l}"] = np.zeros_like(parameters[f"b{l}"])
    return v


def update_parameters_with_momentum(parameters, grads, v, beta, learning_rate):
    L = len(parameters) // 2
    for l in range(1, L + 1):
        # Update the moving average of the gradients
        v[f"dW{l}"] = beta * v[f"dW{l}"] + (1 - beta) * grads[f"dW{l}"]
        v[f"db{l}"] = beta * v[f"db{l}"] + (1 - beta) * grads[f"db{l}"]
        # Step along the smoothed gradient rather than the raw one
        parameters[f"W{l}"] = parameters[f"W{l}"] - learning_rate * v[f"dW{l}"]
        parameters[f"b{l}"] = parameters[f"b{l}"] - learning_rate * v[f"db{l}"]
    return parameters, v
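The momentum update keeps an exponentially weighted average of past gradients and steps along it:

$$v_{dW^{[l]}} = \beta\, v_{dW^{[l]}} + (1 - \beta)\, dW^{[l]}, \qquad W^{[l]} := W^{[l]} - \alpha\, v_{dW^{[l]}}$$

With $\beta = 0.9$ this averages over roughly the last $1/(1-\beta) = 10$ gradients, which damps oscillations across the steep directions of the cost surface.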
# Adam gradient descent
def initialize_adam(parameters):
    L = len(parameters) // 2
    v = {}  # first-moment (momentum-style) estimates
    s = {}  # second-moment (RMSProp-style) estimates
    for l in range(1, L + 1):
        v[f"dW{l}"] = np.zeros_like(parameters[f"W{l}"])
        v[f"db{l}"] = np.zeros_like(parameters[f"b{l}"])
        s[f"dW{l}"] = np.zeros_like(parameters[f"W{l}"])
        s[f"db{l}"] = np.zeros_like(parameters[f"b{l}"])
    return v, s


def update_parameters_with_adam(parameters, grads, v, s, t, learning_rate=0.01,
                                beta1=0.9, beta2=0.999, epsilon=1e-8):
    L = len(parameters) // 2
    v_corrected = {}
    s_corrected = {}
    for l in range(1, L + 1):
        # Moving average of the gradients (first moment)
        v[f"dW{l}"] = beta1 * v[f"dW{l}"] + (1 - beta1) * grads[f"dW{l}"]
        v[f"db{l}"] = beta1 * v[f"db{l}"] + (1 - beta1) * grads[f"db{l}"]
        # Bias correction, which matters most for small t
        v_corrected[f"dW{l}"] = v[f"dW{l}"] / (1 - np.power(beta1, t))
        v_corrected[f"db{l}"] = v[f"db{l}"] / (1 - np.power(beta1, t))
        # Moving average of the squared gradients (second moment)
        s[f"dW{l}"] = beta2 * s[f"dW{l}"] + (1 - beta2) * np.power(grads[f"dW{l}"], 2)
        s[f"db{l}"] = beta2 * s[f"db{l}"] + (1 - beta2) * np.power(grads[f"db{l}"], 2)
        s_corrected[f"dW{l}"] = s[f"dW{l}"] / (1 - np.power(beta2, t))
        s_corrected[f"db{l}"] = s[f"db{l}"] / (1 - np.power(beta2, t))
        # Update; note epsilon belongs outside the square root, as in the Adam paper
        parameters[f"W{l}"] = parameters[f"W{l}"] - learning_rate * v_corrected[f"dW{l}"] / (np.sqrt(s_corrected[f"dW{l}"]) + epsilon)
        parameters[f"b{l}"] = parameters[f"b{l}"] - learning_rate * v_corrected[f"db{l}"] / (np.sqrt(s_corrected[f"db{l}"]) + epsilon)
    return parameters, v, s
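update_parameters_with_adam combines both moment estimates with bias correction; per layer $l$ and step $t$:

$$v = \beta_1 v + (1-\beta_1)\,dW, \qquad s = \beta_2 s + (1-\beta_2)\,(dW)^2$$

$$\hat{v} = \frac{v}{1-\beta_1^t}, \qquad \hat{s} = \frac{s}{1-\beta_2^t}, \qquad W := W - \alpha\,\frac{\hat{v}}{\sqrt{\hat{s}} + \varepsilon}$$

The bias correction compensates for the zero initialization of $v$ and $s$, which would otherwise shrink the earliest updates; $\varepsilon$ only guards against division by zero.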
def model(X, Y, layers_dims, optimizer, learning_rate=0.0007, mini_batch_size=64,
          beta=0.9, beta1=0.9, beta2=0.999, epsilon=1e-8, num_epochs=10000, print_cost=True):
    costs = []
    t = 0  # Adam step counter, used for bias correction
    seed = 10
    parameters = initialize_parameters(layers_dims)

    if optimizer == "gd":
        pass  # plain gradient descent needs no extra state
    elif optimizer == "momentum":
        v = initialize_velocity(parameters)
    elif optimizer == "adam":
        v, s = initialize_adam(parameters)

    # One epoch is one full pass over the dataset; each epoch contains many mini-batches.
    for i in range(num_epochs):
        # Change the seed so each epoch reshuffles differently
        seed = seed + 1
        minibatches = random_mini_batches(X, Y, mini_batch_size, seed)

        for minibatch in minibatches:
            (minibatch_X, minibatch_Y) = minibatch
            a3, caches = forward_propagation(minibatch_X, parameters)
            cost = compute_cost(a3, minibatch_Y)
            grads = backward_propagation(minibatch_X, minibatch_Y, caches)

            if optimizer == "gd":
                parameters = update_parameters_with_gd(parameters, grads, learning_rate)
            elif optimizer == "momentum":
                parameters, v = update_parameters_with_momentum(parameters, grads, v, beta, learning_rate)
            elif optimizer == "adam":
                t = t + 1
                parameters, v, s = update_parameters_with_adam(parameters, grads, v, s, t,
                                                               learning_rate, beta1, beta2, epsilon)

        if print_cost and i % 1000 == 0:
            print("Cost after epoch %i: %f" % (i, cost))
        if print_cost and i % 100 == 0:
            costs.append(cost)

    return parameters, costs


def training1(train_X, train_Y, layers_dims):
    # Mini-batch with standard gradient descent
    parameters, costs = model(train_X, train_Y, layers_dims, optimizer="gd")
    p = predict_dec(parameters, train_X)
    print("Accuracy: " + str(np.mean((p[0, :] == train_Y[0, :]))))
    plt.subplot(131)
    plt.plot(costs)
    plt.ylabel('cost')
    plt.xlabel('epochs (per 100)')
    plt.title("Learning rate = " + str(0.0007))
    # plot_decision_boundary(lambda x: predict_dec(parameters, x.T), train_X, train_Y.ravel())


def training2(train_X, train_Y, layers_dims):
    # Mini-batch with momentum
    parameters, costs = model(train_X, train_Y, layers_dims, optimizer="momentum")
    p = predict_dec(parameters, train_X)
    print("Accuracy: " + str(np.mean((p[0, :] == train_Y[0, :]))))
    plt.subplot(132)
    plt.plot(costs)
    plt.ylabel('cost')
    plt.xlabel('epochs (per 100)')
    # plot_decision_boundary(lambda x: predict_dec(parameters, x.T), train_X, train_Y.ravel())


def training3(train_X, train_Y, layers_dims):
    # Mini-batch with Adam
    parameters, costs = model(train_X, train_Y, layers_dims, optimizer="adam")
    p = predict_dec(parameters, train_X)
    print("Accuracy: " + str(np.mean((p[0, :] == train_Y[0, :]))))
    plt.subplot(133)
    plt.plot(costs)
    plt.ylabel('cost')
    plt.xlabel('epochs (per 100)')
    # plot_decision_boundary(lambda x: predict_dec(parameters, x.T), train_X, train_Y.ravel())


if __name__ == "__main__":
    train_X, train_Y = load_dataset()
    layers_dims = [train_X.shape[0], 5, 2, 1]
    print("mini batch with gd:")
    training1(train_X, train_Y, layers_dims)
    print("mini batch with momentum:")
    training2(train_X, train_Y, layers_dims)
    print("mini batch with adam:")
    training3(train_X, train_Y, layers_dims)
    plt.show()

opt_utils.py (helpers imported by the script above):

import numpy as np
import matplotlib.pyplot as plt
import h5py
import scipy.io
import sklearn
import sklearn.datasets


def sigmoid(x):
    """
    Compute the sigmoid of x

    Arguments:
    x -- A scalar or numpy array of any size.

    Return:
    s -- sigmoid(x)
    """
    s = 1 / (1 + np.exp(-x))
    return s


def relu(x):
    """
    Compute the relu of x

    Arguments:
    x -- A scalar or numpy array of any size.

    Return:
    s -- relu(x)
    """
    s = np.maximum(0, x)
    return s


def load_params_and_grads(seed=1):
    np.random.seed(seed)
    W1 = np.random.randn(2, 3)
    b1 = np.random.randn(2, 1)
    W2 = np.random.randn(3, 3)
    b2 = np.random.randn(3, 1)
    dW1 = np.random.randn(2, 3)
    db1 = np.random.randn(2, 1)
    dW2 = np.random.randn(3, 3)
    db2 = np.random.randn(3, 1)
    return W1, b1, W2, b2, dW1, db1, dW2, db2


def initialize_parameters(layer_dims):
    """
    Arguments:
    layer_dims -- python array (list) containing the dimensions of each layer in our network

    Returns:
    parameters -- python dictionary containing the parameters "W1", "b1", ..., "WL", "bL":
                    Wl -- weight matrix of shape (layer_dims[l], layer_dims[l-1])
                    bl -- bias vector of shape (layer_dims[l], 1)
    """
    np.random.seed(3)
    parameters = {}
    L = len(layer_dims)  # number of layers in the network

    for l in range(1, L):
        # He initialization, scaled for ReLU layers
        parameters['W' + str(l)] = np.random.randn(layer_dims[l], layer_dims[l - 1]) * np.sqrt(2 / layer_dims[l - 1])
        parameters['b' + str(l)] = np.zeros((layer_dims[l], 1))
        assert parameters['W' + str(l)].shape == (layer_dims[l], layer_dims[l - 1])
        assert parameters['b' + str(l)].shape == (layer_dims[l], 1)

    return parameters


def compute_cost(a3, Y):
    """
    Implement the cross-entropy cost function

    Arguments:
    a3 -- post-activation, output of forward propagation
    Y -- "true" labels vector, same shape as a3

    Returns:
    cost - value of the cost function
    """
    m = Y.shape[1]
    logprobs = np.multiply(-np.log(a3), Y) + np.multiply(-np.log(1 - a3), 1 - Y)
    cost = 1. / m * np.sum(logprobs)
    return cost
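compute_cost above is the standard binary cross-entropy averaged over the m examples:

$$J = \frac{1}{m}\sum_{i=1}^{m}\Big(-y^{(i)}\log a^{(i)} - \big(1-y^{(i)}\big)\log\big(1-a^{(i)}\big)\Big)$$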
def forward_propagation(X, parameters):
    """
    Implements forward propagation: LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID

    Arguments:
    X -- input dataset, of shape (input size, number of examples)
    parameters -- python dictionary containing the parameters "W1", "b1", ..., "W3", "b3"

    Returns:
    a3 -- sigmoid output of the last layer
    cache -- intermediate values needed by backward_propagation
    """
    # retrieve parameters
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    W3 = parameters["W3"]
    b3 = parameters["b3"]

    # LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID
    z1 = np.dot(W1, X) + b1
    a1 = relu(z1)
    z2 = np.dot(W2, a1) + b2
    a2 = relu(z2)
    z3 = np.dot(W3, a2) + b3
    a3 = sigmoid(z3)

    cache = (z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3)
    return a3, cache


def backward_propagation(X, Y, cache):
    """
    Implement the backward propagation for the three-layer network above.

    Arguments:
    X -- input dataset, of shape (input size, number of examples)
    Y -- true "label" vector
    cache -- cache output from forward_propagation()

    Returns:
    gradients -- A dictionary with the gradients with respect to each parameter,
                 activation and pre-activation variables
    """
    m = X.shape[1]
    (z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3) = cache

    dz3 = 1. / m * (a3 - Y)
    dW3 = np.dot(dz3, a2.T)
    db3 = np.sum(dz3, axis=1, keepdims=True)

    da2 = np.dot(W3.T, dz3)
    dz2 = np.multiply(da2, np.int64(a2 > 0))  # ReLU derivative
    dW2 = np.dot(dz2, a1.T)
    db2 = np.sum(dz2, axis=1, keepdims=True)

    da1 = np.dot(W2.T, dz2)
    dz1 = np.multiply(da1, np.int64(a1 > 0))
    dW1 = np.dot(dz1, X.T)
    db1 = np.sum(dz1, axis=1, keepdims=True)

    gradients = {"dz3": dz3, "dW3": dW3, "db3": db3,
                 "da2": da2, "dz2": dz2, "dW2": dW2, "db2": db2,
                 "da1": da1, "dz1": dz1, "dW1": dW1, "db1": db1}
    return gradients


def predict(X, y, parameters):
    """
    This function is used to predict the results of an n-layer neural network.

    Arguments:
    X -- data set of examples you would like to label
    parameters -- parameters of the trained model

    Returns:
    p -- predictions for the given dataset X
    """
    m = X.shape[1]
    p = np.zeros((1, m), dtype=int)  # np.int is deprecated; use the builtin int

    # Forward propagation
    a3, caches = forward_propagation(X, parameters)

    # convert probabilities to 0/1 predictions
    for i in range(0, a3.shape[1]):
        if a3[0, i] > 0.5:
            p[0, i] = 1
        else:
            p[0, i] = 0

    print("Accuracy: " + str(np.mean((p[0, :] == y[0, :]))))
    return p


def load_2D_dataset():
    data = scipy.io.loadmat('datasets/data.mat')
    train_X = data['X'].T
    train_Y = data['y'].T
    test_X = data['Xval'].T
    test_Y = data['yval'].T
    plt.scatter(train_X[0, :], train_X[1, :], c=train_Y.ravel(), s=40, cmap=plt.cm.Spectral)
    return train_X, train_Y, test_X, test_Y


def plot_decision_boundary(model, X, y):
    # Set min and max values and give it some padding
    x_min, x_max = X[0, :].min() - 1, X[0, :].max() + 1
    y_min, y_max = X[1, :].min() - 1, X[1, :].max() + 1
    h = 0.01
    # Generate a grid of points with distance h between them
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    # Predict the function value for the whole grid
    Z = model(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    # Plot the contour and training examples
    plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
    plt.ylabel('x2')
    plt.xlabel('x1')
    plt.scatter(X[0, :], X[1, :], c=y, cmap=plt.cm.Spectral)
    plt.show()


def predict_dec(parameters, X):
    """
    Used for plotting decision boundary.

    Arguments:
    parameters -- python dictionary containing the trained parameters
    X -- input data of shape (input size, number of examples)

    Returns:
    predictions -- vector of predictions of our model (red: 0 / blue: 1)
    """
    # Predict using forward propagation and a classification threshold of 0.5
    a3, cache = forward_propagation(X, parameters)
    predictions = (a3 > 0.5)
    return predictions


def load_dataset():
    np.random.seed(3)
    train_X, train_Y = sklearn.datasets.make_moons(n_samples=300, noise=.2)
    # Visualize the data
    # plt.scatter(train_X[:, 0], train_X[:, 1], c=train_Y, s=40, cmap=plt.cm.Spectral)
    train_X = train_X.T
    train_Y = train_Y.reshape((1, train_Y.shape[0]))
    return train_X, train_Y
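A small shape check for initialize_parameters (a minimal sketch using the same layers_dims as the main script):

params = initialize_parameters([2, 5, 2, 1])
for name in sorted(params):
    print(name, params[name].shape)
# W1 (5, 2), W2 (2, 5), W3 (1, 2), b1 (5, 1), b2 (2, 1), b3 (1, 1)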
Output:

mini batch with gd:
Cost after epoch 0: 0.690736
Cost after epoch 1000: 0.685273
Cost after epoch 2000: 0.647072
Cost after epoch 3000: 0.619525
Cost after epoch 4000: 0.576584
Cost after epoch 5000: 0.607243
Cost after epoch 6000: 0.529403
Cost after epoch 7000: 0.460768
Cost after epoch 8000: 0.465586
Cost after epoch 9000: 0.464518
Accuracy: 0.7966666666666666
mini batch with momentum:
Cost after epoch 0: 0.690741
Cost after epoch 1000: 0.685341
Cost after epoch 2000: 0.647145
Cost after epoch 3000: 0.619594
Cost after epoch 4000: 0.576665
Cost after epoch 5000: 0.607324
Cost after epoch 6000: 0.529476
Cost after epoch 7000: 0.460936
Cost after epoch 8000: 0.465780
Cost after epoch 9000: 0.464740
Accuracy: 0.7966666666666666
mini batch with adam:
Cost after epoch 0: 0.690552
Cost after epoch 1000: 0.185501
Cost after epoch 2000: 0.150830
Cost after epoch 3000: 0.074454
Cost after epoch 4000: 0.125959
Cost after epoch 5000: 0.104344
Cost after epoch 6000: 0.100676
Cost after epoch 7000: 0.031652
Cost after epoch 8000: 0.111973
Cost after epoch 9000: 0.197940
Accuracy: 0.94

(Figure: cost versus epochs (per 100) for mini-batch gradient descent, momentum, and Adam, plotted side by side at learning rate 0.0007.)
