Pytorch 源码学习 - 个人主页

0.说明

我将解读pytorch的部分代码，主要是torch文件夹下的内容，做一些源码注释、解释和使用说明。说明的层次主要停留在python阶段，对于中层的C++只做部分解释，底层的C不解释。主要目的是通读该文档后可以熟练地使用pytorch进行开发任务，把想要的模型从脑袋里搬出来，把已经抽象成数学公式的问题搬到计算机上，实现对模型的任意改进。我会从源码的每个常用函数开始解读，把细节拆分为函数和类。针对torch文件夹下的所有文件会依次分析，其中具有引用关系的会先分析关系再进行分析，每个文件中只考虑关键的函数和类，同时会进行演示使用。这其中需要用到很多python的知识，涉及面相对较广，需要有一定的python基础。

1.`__config__.py`

这个文件主要是查看一些安装信息和配置信息包含了两个函数

def show():
    """Return whatever ``torch._C._show_config()`` reports for this build."""
    return torch._C._show_config()  # describes the system environment PyTorch was installed/built with

def parallel_info():
    """Return whatever ``torch._C._parallel_info()`` reports for this build."""
    return torch._C._parallel_info()  # thread-count / parallel-backend settings of the environment

import torch
torch.__config__.show()
torch.__config__.parallel_info()

'ATen/Parallel:\n\tat::get_num_threads() : 28\n\tat::get_num_interop_threads() : 28\nOpenMP 201511 (a.k.a. OpenMP 4.5)\n\tomp_get_max_threads() : 28\nIntel(R) Math Kernel Library Version 2019.0.4 Product Build 20190411 for Intel(R) 64 architecture applications\n\tmkl_get_max_threads() : 28\nIntel(R) MKL-DNN v0.20.5 (Git Hash 0125f28c61c1f822fd48570b4c1066f96fcb9b2e)\nstd::thread::hardware_concurrency() : 56\nEnvironment variables:\n\tOMP_NUM_THREADS : [not set]\n\tMKL_NUM_THREADS : [not set]\nATen parallel backend: OpenMP\n'

2.`__future__.py`

用来设置模型转换时的行为。模型转换主要包括设备（是在GPU还是CPU上）和数据类型（包括单精度浮点和双精度浮点等）。这里我们需要明确的是，数据和模型必须是一致的才能运行，需要同时考虑设备和数据类型。

`module.cuda()` / `.cpu()`  设备之间转化
`module.float()` / `.double()` / `.half()`  数据类型的转化
`module.to()` / `.type()`  同上，多了to没有任何变化
`module._apply(fn)`   #模型的数据类型转化,可以传入一个我们自定义的函数对模型进行转化

_overwrite_module_params_on_conversion = False

def set_overwrite_module_params_on_conversion(value):
    """Store ``value`` in the module-level ``_overwrite_module_params_on_conversion`` flag."""
    # ``global`` is needed so the assignment rebinds the module-level name
    # instead of creating a local variable.
    global _overwrite_module_params_on_conversion
    _overwrite_module_params_on_conversion = value

def get_overwrite_module_params_on_conversion():
    """Return the current value of the module-level ``_overwrite_module_params_on_conversion`` flag."""
    return _overwrite_module_params_on_conversion

import torch.nn as nn
class test(nn.Module): #首先构建一个模型
    def __init__(self):
        super(test,self).__init__()
        self.Li=nn.Linear(1,1)
    def forward(self,x):
        return self.Li(x)
mode=test()  #
mode.cpu() #需要具有GPU的设备上才可以使用cuda()
a=torch.rand(1,1).double()  #默认状态下我们生成的数据类型主要是float类型
mode.double()(a)

tensor([[0.5623]], dtype=torch.float64, grad_fn=<AddmmBackward>)

3.`__init__.py`

这个文件是python包的基础文件，主要定义了一些默认调用的快捷方法。使用系统级别的工具时会用到这个文件，torch基础性的函数和方法也会用到。它使用了三个对系统进行操作的库：os、sys、platform。os主要用来完成操作系统相关的工作，sys主要用来操作编译/运行环境，platform用来对系统平台环境进行操作。它们主要用来完成操作系统和环境变量的配置。这其中主要是对windows的操作，可见windows确实是个麻烦的系统。

def typename(o):
    """Return a dotted type name for ``o``.

    Tensors report ``o.type()``. Anything else is described as
    ``<module>.<name>``: the module prefix is dropped for builtins and for
    objects without a usable ``__module__``; the name is the first available of
    ``__qualname__``, ``__name__`` and the name of the object's class.
    """
    if isinstance(o, torch.Tensor):
        return o.type()

    # Module prefix (empty for builtins / missing / None modules).
    owner = getattr(o, '__module__', None)
    if owner is not None and owner != 'builtins' and owner != '__builtin__':
        prefix = owner + '.'
    else:
        prefix = ''

    # Prefer the object's own name; fall back to the name of its class.
    for attr in ('__qualname__', '__name__'):
        if hasattr(o, attr):
            label = getattr(o, attr)
            break
    else:
        label = o.__class__.__name__

    return prefix + label

def is_tensor(obj):  # True when obj is a torch.Tensor instance
    """Return ``isinstance(obj, torch.Tensor)``."""

    return isinstance(obj, torch.Tensor)

import torch
a=torch.nn.Linear(1,2)
b=torch.rand(1)
torch.typename(a)
torch.typename(b)
torch.is_tensor(a)
torch.is_tensor(b)

True

def is_storage(obj):  # True when obj's exact type is listed in _storage_classes
    """Return whether ``type(obj)`` is a member of the module-level ``_storage_classes`` set."""

    # Uses type(obj), not isinstance, so only exact type matches count.
    return type(obj) in _storage_classes

_storage_classes = {
    DoubleStorage, FloatStorage, LongStorage, IntStorage, ShortStorage,
    CharStorage, ByteStorage, HalfStorage, BoolStorage, QUInt8Storage, QInt8Storage,
    QInt32Storage
}

4.`_six.py`

主要是定义python2和python3的读写操作，和一些不同版本的适配操作

5.functional.py

这是个非常重要的包，包含了几乎所有的数学运算，这个包是对底层数学方法的封装。后面的模型计算主要是调用这个包中的方法进行计算，我会详细介绍这个包中的函数；方法的c++实现基本不介绍，知道怎么用就行，反正我也不懂。

1.广播

def broadcast_tensors(*tensors):
    """Broadcast the given tensors against one another.

    Broadcasting is what lets tensors of different shapes take part in the same
    element-wise computation even though the operation is not defined for
    mismatched shapes in plain mathematics.

    Args:
        *tensors: the tensors to broadcast.

    Returns:
        The result of ``torch._C._VariableFunctions.broadcast_tensors``: one
        output per input, in the same order.
    """
    return torch._C._VariableFunctions.broadcast_tensors(tensors)

a=torch.rand(2,3)
b=torch.rand(1)
torch.broadcast_tensors(b,a)  #可以实现将两个数据变化为可进行操作的程度。

(tensor([[0.9848, 0.9848, 0.9848],
         [0.9848, 0.9848, 0.9848]]), tensor([[0.1015, 0.0311, 0.2963],
         [0.4183, 0.8421, 0.1063]]))

2.切块

def split(tensor, split_size_or_sections, dim=0):
    """Split ``tensor`` into pieces along dimension ``dim``.

    Thin wrapper that forwards to ``tensor.split``.

    Args:
        tensor: the tensor to split.
        split_size_or_sections: forwarded unchanged to ``tensor.split``.
        dim: dimension along which to split (default 0).

    Returns:
        Whatever ``tensor.split(split_size_or_sections, dim)`` returns.
    """
    return tensor.split(split_size_or_sections, dim)

a=torch.linspace(1,12,12).view(2,6)
torch.split(a,2,-1)

(tensor([[1., 2.],
         [7., 8.]]), tensor([[ 3.,  4.],
         [ 9., 10.]]), tensor([[ 5.,  6.],
         [11., 12.]]))

3.LU分解

将一个矩阵分解为一个下三角阵L和一个上三角阵U的乘积。说说如何进行矩阵LU分解：对一个可分解的矩阵A做初等行变换，把A化为上三角矩阵U，所用行变换的逆记录在下三角矩阵L中。例如 A=[[1,2,-1],[3,1,0],[-1,-1,-1]]：将A的第一行乘-3加给第二行，记为E(12(-3))；依次经过E(12(-3))、E(13(1))、E(23(1/5))后变化为 U=[[1,2,-1],[0,-5,3],[0,0,-7/5]]，其中的变化可以表示为 L=[[1,0,0],[3,1,0],[-1,-1/5,1]]，于是 A=L*U。

def lu_unpack(LU_data, LU_pivots, unpack_data=True, unpack_pivots=True):
    """Unpack packed LU data and pivots into ``(P, L, U)``.

    Args:
        LU_data: packed LU factorization; may carry leading batch dimensions.
        LU_pivots: pivot indices that go with ``LU_data``.
        unpack_data: when False, ``L`` and ``U`` are returned as ``None``.
        unpack_pivots: when False, ``P`` is returned as ``None``.

    Returns:
        Tuple ``(P, L, U)``.
    """
  
    sz = LU_data.size(-1)

    if unpack_data:
        # U is the upper triangle; L is the lower triangle with its diagonal
        # overwritten by ones.
        U = LU_data.triu()
        L = LU_data.tril()
        L.diagonal(dim1=-2, dim2=-1).fill_(1)
    else:
        L = U = None

    if unpack_pivots:
        # Shift pivots down by one (presumably 1-based -> 0-based; confirm).
        LU_pivots_zero_idx = LU_pivots - 1
        if LU_data.dim() > 2:
            # Batched case: start from an identity per matrix. expand_as implies
            # the last two dims of LU_data are sz x sz -- TODO confirm.
            P = torch.eye(sz, device=LU_data.device, dtype=LU_data.dtype).expand_as(LU_data).clone()
            # Visit every index combination of the leading (batch) dimensions.
            for idx in product(*map(lambda x: list(range(x)), LU_data.shape[:-2])):
                final_order = list(range(sz))
                # Replay the recorded swaps in order to get the column order.
                for k, j in enumerate(LU_pivots_zero_idx[idx]):
                    final_order[k], final_order[j] = final_order[j], final_order[k]
                P[idx] = P[idx].index_select(1, torch.as_tensor(final_order, device=LU_pivots.device))
        else:
            # Single-matrix case: same swap replay applied to one identity matrix.
            P = torch.eye(sz, device=LU_data.device, dtype=LU_data.dtype)
            final_order = list(range(sz))
            for k, j, in enumerate(LU_pivots_zero_idx):
                final_order[k], final_order[j] = final_order[j], final_order[k]
            P = P.index_select(1, torch.as_tensor(final_order, device=LU_pivots.device))
    else:
        P = None

    return P, L, U 

A = torch.randn(2, 3, 3)
A_LU, pivots = A.lu()
P, A_L, A_U = torch.lu_unpack(A_LU, pivots)
# Exact `==` on floats: some entries can compare False (see the printed result)
# even when the reconstruction is right, presumably from floating-point rounding
# in the factor/re-multiply round trip. A tolerance-based check such as
# torch.allclose is the more meaningful test here.
A==torch.bmm(P, torch.bmm(A_L, A_U))

tensor([[[ True,  True,  True],
         [ True, False,  True],
         [ True,  True,  True]],

        [[ True,  True,  True],
         [False,  True,  True],
         [False,  True,  True]]])

4.爱因斯坦求和约定

可以理解为将字符串表达式实例化进行计算

def einsum(equation, *operands):
    """Evaluate the Einstein-summation expression ``equation`` over ``operands``.

    Operands may be passed as separate positional arguments or, for backward
    compatibility, as a single list/tuple.
    """
    # Legacy calling convention: einsum(equation, [a, b, ...]).
    legacy_call = len(operands) == 1 and isinstance(operands[0], (list, tuple))
    if legacy_call:
        operands = operands[0]
    return torch._C._VariableFunctions.einsum(equation, operands)

a=torch.rand(3)
b=torch.rand(4)
torch.einsum('i,j->ij',a,b)

tensor([[0.1073, 0.1324, 0.1057, 0.0287],
        [0.2942, 0.3630, 0.2897, 0.0786],
        [0.3002, 0.3704, 0.2956, 0.0802]])

5.判断是否是有限数据

def isfinite(tensor):
    """Return an element-wise mask that is true where ``tensor`` is finite.

    Raises:
        TypeError: if ``tensor`` is not a ``torch.Tensor``.
    """
    if isinstance(tensor, torch.Tensor):
        if tensor.is_floating_point():
            # NaN is the only value that differs from itself; +/-inf is caught
            # by comparing the magnitude with inf.
            not_nan = tensor == tensor
            not_inf = tensor.abs() != inf
            return not_nan & not_inf
        # nan and inf only exist for floating-point values, so every element of
        # a non-floating tensor is treated as finite (all-ones uint8 mask).
        return torch.ones_like(tensor, dtype=torch.uint8)
    raise TypeError("The argument is not a tensor: {}".format(repr(tensor)))

torch.isfinite(a)

tensor([True, True, True])

6.网格生成

我们可以使用这个东西来生成参数网格空间，可以找到所有的参数对数据

def meshgrid(*tensors, **kwargs):
    """Build coordinate grids from the given 1-D tensors.

    Args:
        *tensors: the input tensors, as separate arguments or (legacy form) as
            a single list/tuple.
        **kwargs: not supported; present only so a clear error can be raised.

    Returns:
        The result of ``torch._C._VariableFunctions.meshgrid``: one grid per
        input tensor.

    Raises:
        TypeError: if any keyword argument is supplied.
    """
    if kwargs:  # no keyword arguments are accepted
        raise TypeError("meshgrid() got an unexpected keyword argument '%s'" % (list(kwargs)[0],))
    if len(tensors) == 1 and isinstance(tensors[0], (list, tuple)):
        # the old interface of passing the operands as one list argument
        tensors = tensors[0]
    return torch._C._VariableFunctions.meshgrid(tensors)

a=torch.tensor([1,2,3,7])
b=torch.tensor([4,5,6])
torch.meshgrid(a,b)

(tensor([[1, 1, 1],
         [2, 2, 2],
         [3, 3, 3],
         [7, 7, 7]]), tensor([[4, 5, 6],
         [4, 5, 6],
         [4, 5, 6],
         [4, 5, 6]]))

7.短时傅立叶变化

def stft(input, n_fft, hop_length=None, win_length=None, window=None,
         center=True, pad_mode='reflect', normalized=False, onesided=True):
    """Short-time Fourier transform of ``input``.

    When ``center`` is true the signal is first padded by ``n_fft // 2`` samples
    on both sides using ``pad_mode``; the remaining arguments are forwarded to
    ``torch._C._VariableFunctions.stft``.
    """
  
    if center:
        signal_dim = input.dim()
        # View the signal as 3-D by prepending singleton dims before padding
        # (presumably because F.pad needs that layout for this pad mode -- confirm).
        extended_shape = [1] * (3 - signal_dim) + list(input.size())
        pad = int(n_fft // 2)
        input = F.pad(input.view(extended_shape), (pad, pad), pad_mode)
        # Drop the helper dims again, keeping the original number of dimensions.
        input = input.view(input.shape[-signal_dim:])
    return torch._C._VariableFunctions.stft(input, n_fft, hop_length, win_length, window, normalized, onesided)

import numpy as np

def calc_stft(signal, sample_rate=16000, frame_size=0.025, frame_stride=0.01, winfunc=np.hamming, NFFT=512):
    """Return the power spectrum of ``signal`` computed frame by frame.

    The signal is zero-padded at the end, cut into overlapping frames of
    ``frame_size`` seconds taken every ``frame_stride`` seconds, windowed with
    ``winfunc`` and transformed with an ``NFFT``-point real FFT.

    Returns:
        Array of shape ``(num_frames, NFFT // 2 + 1)`` with the power spectrum.
    """
    # Frame geometry expressed in samples.
    samples_per_frame = int(round(frame_size * sample_rate))
    hop = int(round(frame_stride * sample_rate))
    n_samples = len(signal)
    n_frames = 1 + int(np.ceil(float(np.abs(n_samples - samples_per_frame)) / hop))

    # Zero-pad the tail so every frame has the same number of samples and no
    # original sample is truncated.
    padded_len = n_frames * hop + samples_per_frame
    padded = np.append(signal, np.zeros((padded_len - n_samples)))

    # Row i of `positions` lists the sample positions that make up frame i.
    within_frame = np.arange(0, samples_per_frame)
    frame_starts = np.arange(0, n_frames * hop, hop)
    positions = within_frame[np.newaxis, :] + frame_starts[:, np.newaxis]
    framed = padded[positions.astype(np.int32, copy=False)]

    # Apply the window to every frame.
    framed *= winfunc(samples_per_frame)

    # Magnitude of the real FFT, then power spectrum.
    magnitude = np.absolute(np.fft.rfft(framed, NFFT))
    return (1.0 / NFFT) * ((magnitude) ** 2)

import scipy.io.wavfile
import matplotlib.pyplot as plt

# Read wav file
# "OSR_us_000_0010_8k.wav" is downloaded from http://www.voiptroubleshooter.com/open_speech/american.html
sample_rate, signal = scipy.io.wavfile.read("OSR_us_000_0010_8k.wav")
# Get speech data in the first 2 seconds
signal = signal[0:int(2. * sample_rate)]

# Calculate the short time fourier transform
pow_spec = calc_stft(signal, sample_rate)

plt.imshow(pow_spec)
plt.tight_layout()
plt.show()

png

7.独特性的判断

def unique(input, sorted=True, return_inverse=False, return_counts=False, dim=None):
    """Return the unique elements of ``input``.

    Optionally also returns the inverse indices and/or the per-element counts.
    With ``dim`` set, uniqueness is computed along that dimension.

    Returns:
        ``output`` alone, or a tuple starting with ``output`` followed by the
        inverse indices and/or counts that were requested (in that order).
    """
    options = dict(sorted=sorted, return_inverse=return_inverse, return_counts=return_counts)
    if dim is None:
        output, inverse_indices, counts = torch._unique2(input, **options)
    else:
        output, inverse_indices, counts = torch._C._VariableFunctions.unique_dim(
            input, dim, **options)

    # Hand back only what the caller asked for.
    picked = [output]
    if return_inverse:
        picked.append(inverse_indices)
    if return_counts:
        picked.append(counts)
    if len(picked) == 1:
        return output
    return tuple(picked)

import torch
a=torch.tensor([1,2,4,2,3,1,3,4,2])
torch.functional.unique(a,return_counts=True,return_inverse=True)

(tensor([1, 2, 3, 4]),
 tensor([0, 1, 3, 1, 2, 0, 2, 3, 1]),
 tensor([2, 3, 2, 2]))

8.去掉连续的相同元素

def unique_consecutive(input, return_inverse=False, return_counts=False, dim=None):
    """Collapse every run of consecutive equal elements down to one element.

    Returns:
        ``output`` alone, or a tuple starting with ``output`` followed by the
        inverse indices and/or counts that were requested (in that order).
    """
    output, inverse_indices, counts = torch._C._VariableFunctions.unique_consecutive(
        input, return_inverse=return_inverse, return_counts=return_counts, dim=dim)

    result = (output,)
    if return_inverse:
        result += (inverse_indices,)
    if return_counts:
        result += (counts,)
    return result if len(result) > 1 else output

import torch
a=torch.tensor([1,2,3,2,1,2,1,1,1,1,2,3])
torch.functional.unique_consecutive(a,return_inverse=True)

(tensor([1, 2, 3, 2, 1, 2, 1, 2, 3]),
 tensor([0, 1, 2, 3, 4, 5, 6, 6, 6, 6, 7, 8]))

9. 多维矩阵相乘

def tensordot(a, b, dims=2):
    """Contract tensors ``a`` and ``b`` over the dimensions described by ``dims``.

    Args:
        a: left tensor.
        b: right tensor.
        dims: either a number ``d`` (contract the last ``d`` dimensions of ``a``
            with the first ``d`` dimensions of ``b``), or a pair
            ``(dims_a, dims_b)`` naming the dimensions to contract explicitly.
    """
    explicit_axes = isinstance(dims, (list, tuple)) or \
        (isinstance(dims, torch.Tensor) and dims.numel() > 1)
    if explicit_axes:
        dims_a, dims_b = dims
    else:
        # A one-element tensor is accepted as the contraction count too.
        count = dims.item() if isinstance(dims, torch.Tensor) else dims
        dims_a = list(range(-count, 0))
        dims_b = list(range(count))
    return torch._C._VariableFunctions.tensordot(a, b, dims_a, dims_b)

计算公式 $r_{i_0,...,i_{m-d}, i_d,...,i_n} = \sum_{k_0,...,k_{d-1}} a_{i_0,...,i_{m-d},k_0,...,k_{d-1}} \times b_{k_0,...,k_{d-1}, i_d,...,i_n}.$

#具体操作例子1：
a=torch.ones(2,2)
b=torch.ones(2,3)*3
torch.functional.tensordot(a,b,1) #结果就是

tensor([[6., 6., 6.],
        [6., 6., 6.]])

a = torch.arange(60.).reshape(3, 4, 5)
b = torch.arange(24.).reshape(4, 3, 2)
torch.tensordot(a, b, dims=([1, 0], [0, 1]))  #计算过程中取出来的维度必须是相同的

tensor([[4400., 4730.],
        [4532., 4874.],
        [4664., 5018.],
        [4796., 5162.],
        [4928., 5306.]])

sum(sum(a[:,:,0].T*b[:,:,0]))   #具体的每个计算就是这样的

tensor(4400.)

10.笛卡尔积

#笛卡尔积是对集和的一种运算
def cartesian_prod(*tensors):
    """Return the Cartesian product of the given tensors.

    Args:
        *tensors: the input tensors whose elements are combined.

    Returns:
        The result of ``torch._C._VariableFunctions.cartesian_prod``; each row
        is one combination of elements, one taken from every input.
    """
    # The namespace is `_VariableFunctions` (single leading underscore); the
    # double-underscore spelling does not exist and raised AttributeError.
    return torch._C._VariableFunctions.cartesian_prod(tensors)

a=torch.linspace(1,3,3)
b=torch.linspace(4,6,3)
torch.functional.cartesian_prod(a,b)  #可以快速构建锚点矩阵

tensor([[1., 4.],
        [1., 5.],
        [1., 6.],
        [2., 4.],
        [2., 5.],
        [2., 6.],
        [3., 4.],
        [3., 5.],
        [3., 6.]])

11.范数

#范数就是求数据模长的开放就是范数
def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None):
    """Return a norm of ``input``.

    Args:
        input: the tensor to measure.
        p: ``"fro"`` (Frobenius), ``"nuc"`` (nuclear) or any value accepted as
            the order of a p-norm by ``torch._C._VariableFunctions.norm``.
        dim: dimension(s) to reduce over; ``None`` reduces over all of them.
        keepdim: forwarded to the underlying norm function.
        out: optional output tensor.
        dtype: optional result dtype; only supported for p-norms.

    Returns:
        The tensor produced by the selected underlying norm function.

    Raises:
        ValueError: if ``dtype`` is given together with ``p="fro"`` or
            ``p="nuc"``.
    """
    ndim = input.dim()

    # catch default case
    if dim is None and out is None and dtype is None:
        if p == "fro":
            return torch._C._VariableFunctions.frobenius_norm(input)
        elif p != "nuc":
            return torch._C._VariableFunctions.norm(input, p)

    if p == "fro":
        if dtype is not None:
            raise ValueError("dtype argument is not supported in frobenius norm")
        if dim is None:
            dim = tuple(range(ndim))
        if out is None:
            return torch._C._VariableFunctions.frobenius_norm(input, dim, keepdim=keepdim)
        return torch._C._VariableFunctions.frobenius_norm(input, dim, keepdim=keepdim, out=out)
    elif p == "nuc":
        if dtype is not None:
            raise ValueError("dtype argument is not supported in nuclear norm")
        if dim is None:
            if out is None:
                return torch._C._VariableFunctions.nuclear_norm(input, keepdim=keepdim)
            return torch._C._VariableFunctions.nuclear_norm(input, keepdim=keepdim, out=out)
        return torch._C._VariableFunctions.nuclear_norm(input, dim, keepdim=keepdim, out=out)
    else:
        if dim is None:
            dim = tuple(range(ndim))
        # Only pass `out` / `dtype` through when the caller supplied them.
        if out is None and dtype is None:
            return torch._C._VariableFunctions.norm(input, p, dim, keepdim=keepdim)
        elif out is None:
            return torch._C._VariableFunctions.norm(input, p, dim, keepdim=keepdim, dtype=dtype)
        elif dtype is None:
            return torch._C._VariableFunctions.norm(input, p, dim, keepdim=keepdim, out=out)
    # Reached only for a p-norm with both `out` and `dtype` supplied.
    return torch._C._VariableFunctions.norm(input, p, dim, keepdim=keepdim, dtype=dtype, out=out)

a=torch.tensor([1.,2.])
torch.functional.norm(a)

tensor(2.2361)

torch.sqrt(torch.tensor(5.))

tensor(2.2361)

12.矩阵链式乘法

def chain_matmul(*matrices):  # chained matrix multiplication
    """Multiply the given matrices together via ``torch._C._VariableFunctions.chain_matmul``."""

    return torch._C._VariableFunctions.chain_matmul(matrices)

a=torch.rand(2,2)
b=torch.rand(2,3)
c=torch.rand(3,4)
torch.functional.chain_matmul(a,b,c)

tensor([[0.0407, 0.0513, 0.1567, 0.1216],
        [0.2096, 0.2257, 0.5004, 0.3506]])

6.quasirandom.py

生成准随机数。当我们需要在高维空间生成随机数时可以使用这个模块，其中生成的是Sobol序列，这个序列可以均匀地在0,1空间中生成随机数。具体的操作原理是这样的：假如我们要生成一个随机数，首先确定一个进制2，数字8对应的随机数是多少呢？首先将8变成2进制数也就是1000，然后乘一个矩阵C（这里我们假设矩阵C是单位阵），仍得到1000；再从小数点处做镜像得到0.0001，转化为10进制为0.0625，这个数就是所求的随机数。
其中的class SobolEngine(object)就是做这个工作的

import torch
test=torch.quasirandom.SobolEngine(3) #我们可以在三维空间中生成均匀铺满空间的数据
a=test.draw(100)
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

#定义坐标轴
fig = plt.figure()
ax1 = plt.axes(projection='3d')
ax1.scatter3D(a[:,0],a[:,1],a[:,2], cmap='Blues')  #绘制散点图
plt.show()

png

a=torch.rand(100)
b=torch.rand(100)
c=torch.rand(100)
a.shape
fig = plt.figure()
ax1 = plt.axes(projection='3d')
ax1.scatter3D(a,b,c, cmap='Blues')  #绘制散点图
plt.show()

png

7.random.py

主要用来设定随机数生成的种子,我们生成每一批随机数都是一样的

def manual_seed(seed):  # set the seed used for random number generation
    r"""Sets the seed for generating random numbers. Returns a
    `torch.Generator` object.

    Args:
        seed (int): The desired seed.
    """
    seed = int(seed)
    # Imported here rather than at module level (reason not visible in this excerpt).
    import torch.cuda

    # Seed the CUDA generators as well, unless torch.cuda flags a bad fork state.
    if not torch.cuda._in_bad_fork:
        torch.cuda.manual_seed_all(seed)

    return default_generator.manual_seed(seed)

a=torch.random.manual_seed(3)  #执行多次任然是这个结果
print(torch.rand(1))
torch.rand(1)

tensor([0.0043])

tensor([0.1056])

8.serialization.py

这个文件中最主要的两个函数一个是保存一个是导入，可以保存模型和导入模型等

1.保存模型

def save(obj, f, pickle_module=pickle, pickle_protocol=DEFAULT_PROTOCOL):
    """Serialize ``obj`` to ``f``.

    Args:
        obj: the object to save.
        f: destination, passed to ``_with_file_like`` and opened in ``"wb"`` mode.
        pickle_module: module used for pickling.
        pickle_protocol: pickle protocol passed through to ``_save``.
    """
# Arguments in order: the object to save, the destination, the pickling
# module, and the pickle protocol.

    return _with_file_like(f, "wb", lambda f: _save(obj, f, pickle_module, pickle_protocol))

def _with_file_like(f, mode, body):  #保存路劲,模式"wr" 使用的方法_save

    new_fd = False
    if isinstance(f, str) or \
            (sys.version_info[0] == 2 and isinstance(f, unicode)) or \
            (sys.version_info[0] == 3 and isinstance(f, pathlib.Path)):
        new_fd = True
        f = open(f, mode)
    try:
        return body(f)
    finally:
        if new_fd:
            f.close()

def _save(obj, f, pickle_module, pickle_protocol):
    """Write ``obj`` to the open binary file ``f``.

    Layout written, in order: magic number, protocol version, system info, the
    pickled object (storages and ``nn.Module`` classes replaced by persistent
    ids), the sorted list of storage keys, then the raw data of each storage.
    """
    if sys.version_info[0] == 2:  # Python 2 only
        import StringIO
        if isinstance(f, StringIO.StringIO):  # Python 2 StringIO targets are rejected
            msg = ('torch.save received unsupported StringIO.StringIO file object, whose '
                   'write method does not return the number of bytes written. '
                   'Please use something like io.BytesIO for torch.save instead.')
            raise RuntimeError(msg)

    import torch.nn as nn
    # Module classes already emitted, and storages collected while pickling.
    serialized_container_types = {}
    serialized_storages = {}

    def persistent_id(obj):
        # FIXME: the docs say that persistent_id should only return a string
        # but torch store returns tuples. This works only in the binary protocol
        # see
        # https://docs.python.org/2/library/pickle.html#pickling-and-unpickling-external-objects
        # https://github.com/python/cpython/blob/master/Lib/pickle.py#L527-L537
        if isinstance(obj, type) and issubclass(obj, nn.Module):  # nn.Module subclasses (the classes themselves) get special handling
            if obj in serialized_container_types:
                return None
            serialized_container_types[obj] = True
            source_file = source = None
            try:
                source_file = inspect.getsourcefile(obj)
                source = inspect.getsource(obj)  # inspect collects information about Python objects;
                            # here it fetches the class's source file and source text
            except Exception:  # saving the source is optional, so we can ignore any errors
                warnings.warn("Couldn't retrieve source code for container of "
                              "type " + obj.__name__ + ". It won't be checked "
                              "for correctness upon loading.")
            return ('module', obj, source_file, source)
        elif torch.is_storage(obj):
            storage_type = normalize_storage_type(type(obj))
            # Offset is always 0, but we keep it for backwards compatibility
            # with the old serialization format (which supported storage views)
            offset = 0
            obj_key = str(obj._cdata)
            location = location_tag(obj)
            # Remember the storage so its raw data can be written after the pickle.
            serialized_storages[obj_key] = obj
            # Compares the attribute with itself, so this is False for any
            # ordinary value and view_metadata ends up None.
            is_view = obj._cdata != obj._cdata
            if is_view:
                view_metadata = (str(obj._cdata), offset, obj.size())
            else:
                view_metadata = None

            return ('storage',
                    storage_type,
                    obj_key,
                    location,
                    obj.size(),
                    view_metadata)
        # Everything else is pickled normally.
        return None

    sys_info = dict(
        protocol_version=PROTOCOL_VERSION,  # 1001 according to the original note
        little_endian=sys.byteorder == 'little',
        type_sizes=dict(
            short=SHORT_SIZE,
            int=INT_SIZE,
            long=LONG_SIZE,
        ),
    )

    pickle_module.dump(MAGIC_NUMBER, f, protocol=pickle_protocol)
    pickle_module.dump(PROTOCOL_VERSION, f, protocol=pickle_protocol)
    pickle_module.dump(sys_info, f, protocol=pickle_protocol)
    pickler = pickle_module.Pickler(f, protocol=pickle_protocol)
    pickler.persistent_id = persistent_id  # install the custom persistent-id hook
    pickler.dump(obj)  # pickle the object itself

    serialized_storage_keys = sorted(serialized_storages.keys())
    pickle_module.dump(serialized_storage_keys, f, protocol=pickle_protocol)
    f.flush()  # push buffered bytes out before the storages write to the file
    for key in serialized_storage_keys:
        serialized_storages[key]._write_file(f, _should_read_directly(f))

#再torch的__init__.py文件中已经导入serialization.py所以 我们可以直接使用torch.save
a=torch.rand(10)
torch.serialization.save(a,'./test_save_tensor.pt')
class mode(torch.nn.Module):
    def __init__(self):
        super(mode,self).__init__()
        self.li=torch.nn.Linear(1,1)
    def forward(self,x):
        return self.li(x)
test_save_mode=mode()
torch.serialization.save(test_save_mode,'./test_save_mode.pt')

/opt/anaconda3/lib/python3.6/site-packages/torch/serialization.py:292: UserWarning: Couldn't retrieve source code for container of type mode. It won't be checked for correctness upon loading.
  "type " + obj.__name__ + ". It won't be checked "
/opt/anaconda3/lib/python3.6/site-packages/torch/serialization.py:292: UserWarning: Couldn't retrieve source code for container of type Linear. It won't be checked for correctness upon loading.
  "type " + obj.__name__ + ". It won't be checked "

2.载入模型

同样使用pickle载入模型

def load(f, map_location=None, pickle_module=pickle, **pickle_load_args):
    """Load an object previously written by ``save``.

    Args:
        f: a path (``str``, Python 2 ``unicode`` or ``pathlib.Path``) or an
            already-open file-like object.
        map_location: forwarded to ``_load``.
        pickle_module: module used for unpickling.
        **pickle_load_args: forwarded to ``_load``.

    A file opened here is closed before returning; a file object supplied by
    the caller is left open.
    """
    opened_here = True
    if isinstance(f, str) or \
            (sys.version_info[0] == 2 and isinstance(f, unicode)):
        f = open(f, 'rb')
    elif sys.version_info[0] == 3 and isinstance(f, pathlib.Path):
        f = f.open('rb')
    else:
        opened_here = False

    try:
        return _load(f, map_location, pickle_module, **pickle_load_args)
    finally:
        if opened_here:
            f.close()

a=torch.load('./test_save_tensor.pt')
print(a)
test_load_mode=torch.load('./test_save_mode.pt')
x=torch.rand(10,1)
test_load_mode(x)

tensor([0.3686, 0.0043, 0.2659, 0.5096, 0.5824, 0.4283, 0.4433, 0.1747, 0.7562,
        0.3795])





tensor([[-0.6606],
        [-0.8127],
        [-0.2881],
        [-0.2881],
        [-0.2741],
        [-0.6020],
        [-0.8728],
        [-0.2843],
        [-0.7187],
        [-0.2810]], grad_fn=<AddmmBackward>)

9.tensor.py

这个文件主要是定义Tensor类型，它是一个类，这个类中包含了几乎所有functional.py中的方法，我们可以对Tensor量直接调用其中的方法。这是个父类，将会被后面的tensor类所继承，后续的所有tensor方法都会继承。同时Tensor类继承了torch._C._TensorBase类，这个类是c++类

10.autograd.grad_mode.py

这个文件主要是设定函数的可导性

class no_grad(object):
    """Context manager / decorator that disables gradient calculation.

    As a context manager (``with no_grad(): ...``) it switches gradient
    tracking off on entry and restores the previous setting on exit. As a
    decorator (``@no_grad()``) it runs the wrapped function inside that same
    context.
    """

    def __enter__(self):
        # Remember the current mode so __exit__ can restore it.
        self.prev = torch.is_grad_enabled()
        # Same public API that __exit__ uses, rather than the private torch._C hook.
        torch.set_grad_enabled(False)

    def __exit__(self, *args):
        torch.set_grad_enabled(self.prev)
        # Returning False lets an exception raised inside the block propagate.
        return False

    def __call__(self, func):
        @functools.wraps(func)
        def decorate_no_grad(*args, **kwargs):
            with self:
                return func(*args, **kwargs)
        return decorate_no_grad

x=torch.rand(1,requires_grad=True)  
y=x*2
y.backward() # y is a scalar here, so backward() needs no argument; for a non-scalar output a gradient argument must be passed (the Jacobian gets contracted with it)
print(y.requires_grad)
print(x.grad)
# To switch gradient tracking off, do this:

no_grad=torch.autograd.grad_mode.no_grad() # instantiate the context manager
# no_grad=torch.no_grad() works as well
with no_grad:   # operations inside this block are not tracked for gradients
    z=x**2
    print(z.requires_grad)

True
tensor([2.])
False

11.backends.cudnn.`__init__.py`

首先说一下CUDA，CUDA是GPU的计算框架，我们可以在这个框架下对GPU进行操作，实现某些任务，cudnn是CUDA上一种深度神经网络的工具。其中有个函数查看cudnn是否可用

def is_available():
    """Return the ``torch._C.has_cudnn`` flag, i.e. whether cuDNN is available."""
    return torch._C.has_cudnn

torch.backends.cudnn.is_available()

False

12.backends.cudnn.`rnn.py`

这个文件主要是配置RNN的dropout设置，和内部模式

import torch.cuda
import torch.backends.cudnn as cudnn

def get_cudnn_mode(mode):
    """Translate an RNN mode name into the matching ``cudnn`` constant.

    Raises:
        Exception: if ``mode`` is not one of the known mode names.
    """
    # (mode name, attribute on `cudnn`); the trailing numbers are the constant
    # values noted by the original author.
    known_modes = (
        ('RNN_RELU', 'CUDNN_RNN_RELU'),  # 0
        ('RNN_TANH', 'CUDNN_RNN_TANH'),  # 1
        ('LSTM', 'CUDNN_LSTM'),          # 2
        ('GRU', 'CUDNN_GRU'),            # 3
    )
    for label, constant_name in known_modes:
        if mode == label:
            return getattr(cudnn, constant_name)
    raise Exception("Unknown mode: {}".format(mode))


# NB: We don't actually need this class anymore (in fact, we could serialize the
# dropout state for even better reproducibility), but it is kept for backwards
# compatibility for old models.
class Unserializable(object):
    """Wrapper whose payload is deliberately dropped when pickled.

    Pickling stores only a placeholder string; after unpickling, ``get()``
    returns ``None``.
    """

    def __init__(self, inner):
        self.inner = inner

    def get(self):
        """Return the wrapped object (``None`` after an unpickle)."""
        return self.inner

    def __getstate__(self):
        # Note: can't return {}, because python2 won't call __setstate__
        # if the value evaluates to False
        return "<unserializable>"

    def __setstate__(self, state):
        # The pickled state is ignored; the payload is simply reset.
        self.inner = None


def init_dropout_state(dropout, train, dropout_seed, dropout_state):
    """Return the dropout state for the current CUDA device, creating it if needed.

    ``dropout_state`` is used as a cache keyed by ``'desc_<device index>'``;
    its values are ``Unserializable`` wrappers.
    """
    dropout_desc_name = 'desc_' + str(torch.cuda.current_device())
    # Dropout is applied only when training; otherwise probability 0.
    dropout_p = dropout if train else 0
    # (Re)build the entry when it is missing or its payload is None.
    if (dropout_desc_name not in dropout_state) or (dropout_state[dropout_desc_name].get() is None):
        if dropout_p == 0:
            # No dropout: cache an empty wrapper.
            dropout_state[dropout_desc_name] = Unserializable(None)
        else:
            dropout_state[dropout_desc_name] = Unserializable(torch._cudnn_init_dropout_state(
                dropout_p,
                train,
                dropout_seed,
                self_ty=torch.uint8,
                device=torch.device('cuda')))
    dropout_ts = dropout_state[dropout_desc_name].get()
    return dropout_ts

13.backends.mkl.`__init__.py`

这个库是Intel提供的用于数学计算的库，非常快，主要是用来做矩阵计算

def is_available():
    r"""Returns whether PyTorch is built with MKL support."""
    return torch._C.has_mkl

14.contrib

这个文件主要是用来可视化模型的，使用的tensorflow中的东西

15.cuda

这个文件主要是用来对GPU进行操作的。包括资源回收，gpu通信等，还有随机数的生成.

16.distributed.`__init__.py`

这个主要是用来做分布式操作

def is_available():  # whether the distributed package can be used
    """Return True when ``torch._C`` exposes ``_c10d_init``."""
    return hasattr(torch._C, "_c10d_init")

17.distributed.launch.py

这个主要是实现分布式的操作，我也不懂涉及到机器间的通信和数据同步问题使用NCCL实现。以后看

18. distributions

这个是所有数据分布的实现，包括密度函数和采样函数等等，这个详细分析一下。这节主要介绍各个分布的数学原理和应用场景。注意：通过sample()采样得到的结果是不能计算导数的（采样过程在no_grad下进行）。

1.伯努利分布

伯努利分布定义就是只有0和1取到1时的概率为p 取到0的概率为1-p 。这是一个类，该类包括均值方法，方差方法,熵方法，

from numbers import Number
import torch
from torch.distributions import constraints
from torch.distributions.exp_family import ExponentialFamily
from torch.distributions.utils import broadcast_all, probs_to_logits, logits_to_probs, lazy_property
from torch.nn.functional import binary_cross_entropy_with_logits
class Bernoulli(ExponentialFamily):
    """Bernoulli distribution parameterized by ``probs`` or ``logits``.

    Exactly one of the two parameters must be given; the other is derived
    lazily on first access.
    """

    arg_constraints = {'probs': constraints.unit_interval,
                       'logits': constraints.real}
    support = constraints.boolean
    has_enumerate_support = True
    _mean_carrier_measure = 0

    def __init__(self, probs=None, logits=None, validate_args=None):
        # Exactly one of probs / logits must be supplied.
        if (probs is None) == (logits is None):
            raise ValueError("Either `probs` or `logits` must be specified, but not both.")
        if probs is not None:
            is_scalar = isinstance(probs, Number)
            self.probs, = broadcast_all(probs)
        else:
            is_scalar = isinstance(logits, Number)
            self.logits, = broadcast_all(logits)
        # _param is whichever parameter the caller actually provided.
        self._param = self.probs if probs is not None else self.logits
        if is_scalar:
            # A plain Python number gives an empty batch shape.
            batch_shape = torch.Size()
        else:
            batch_shape = self._param.size()
        super(Bernoulli, self).__init__(batch_shape, validate_args=validate_args)

    def expand(self, batch_shape, _instance=None):
        """Return a new instance whose parameters are expanded to ``batch_shape``."""
        new = self._get_checked_instance(Bernoulli, _instance)
        batch_shape = torch.Size(batch_shape)
        # Expand only the parameters that already exist on this instance.
        if 'probs' in self.__dict__:
            new.probs = self.probs.expand(batch_shape)
            new._param = new.probs
        if 'logits' in self.__dict__:
            new.logits = self.logits.expand(batch_shape)
            new._param = new.logits
        super(Bernoulli, new).__init__(batch_shape, validate_args=False)
        new._validate_args = self._validate_args
        return new

    def _new(self, *args, **kwargs):
        return self._param.new(*args, **kwargs)

    @property
    def mean(self):  # mean of the distribution
        return self.probs

    @property
    def variance(self):  # variance of the distribution: p * (1 - p)
        return self.probs * (1 - self.probs)

    @lazy_property
    def logits(self):
        # Derived from probs when the instance was built from probs.
        return probs_to_logits(self.probs, is_binary=True)

    @lazy_property
    def probs(self):
        # Derived from logits when the instance was built from logits.
        return logits_to_probs(self.logits, is_binary=True)

    @property
    def param_shape(self):
        return self._param.size()

    def sample(self, sample_shape=torch.Size()):
        """Draw samples of shape ``sample_shape`` + batch shape, without gradient tracking."""
        shape = self._extended_shape(sample_shape)
        with torch.no_grad():
            return torch.bernoulli(self.probs.expand(shape))

    def log_prob(self, value):
        """Return the log-probability of ``value``, computed as the negative binary cross-entropy with logits."""
        if self._validate_args:
            self._validate_sample(value)
        logits, value = broadcast_all(self.logits, value)
        return -binary_cross_entropy_with_logits(logits, value, reduction='none')

    def entropy(self):  # entropy of the distribution
        # Binary cross-entropy of the distribution against its own probabilities.
        return binary_cross_entropy_with_logits(self.logits, self.probs, reduction='none')

    def enumerate_support(self, expand=True):
        """Return the support values ``0`` and ``1``, shaped to sit in front of the batch shape."""
        values = torch.arange(2, dtype=self._param.dtype, device=self._param.device)
        values = values.view((-1,) + (1,) * len(self._batch_shape))
        if expand:
            values = values.expand((-1,) + self._batch_shape)
        return values

    @property
    def _natural_params(self):
        return (torch.log(self.probs / (1 - self.probs)), )

    def _log_normalizer(self, x):
        return torch.log(1 + torch.exp(x))

伯努利分布的分布函数为： $P(X=1)=p; P(X=0)=1-p$ 这个函数用在dropout时限制神经元开关

b=torch.distributions.bernoulli.Bernoulli(0.5)
b.sample([10,10])  # draw a 10x10 matrix of Bernoulli samples
b.mean  # mean
b.variance  # variance
b.entropy()  # entropy; entropy() is not decorated with @property in the class, so the parentheses are required

tensor(0.6931)

熵值表述数据的混乱程度，越混乱熵值越大。定义概率的熵需要知道样本值和对应样本的概率值
$H(U)=-\sum_{i=1}^{n}p_i\log(p_i)$ 其中 $p_i$ 表示第i个样本值出现的概率，伯努利分布只有两个样本值所以概率只有两个.

2.Beta分布

Beta分布可以理解为概率的概率分布，其样本值在0，1之间，由两个参数控制。公式非常复杂我们可以理解为某种概率的先验分布。它同样包含均值方差和熵值
它由两个参数决定a和b。 a和b可以表示成我们的观测结果的次数。a表示成功次数b表示失败次数。

b=torch.distributions.beta.Beta(0.2,0.4)
b=torch.distributions.Beta(4,0.4)
#参数一定的情况下会获得在这个参数下的样本值
b.sample([2,3])

tensor([[0.9294, 0.9856, 0.9387],
        [0.9661, 0.5526, 0.9785]])

3.二项分布

二项分布就是n重独立伯努利试验。表示重复n次试验发生k次的概率。不能求熵。密度函数： $P(K=k)=C(n,k)*p^k*(1-p)^{(n-k)}$

# Binomial(n, p): n trials, each succeeding with probability p; a sample is the number of successes.
# n and p are broadcast against each other (in the second example n=100 is broadcast to both probabilities).
b=torch.distributions.Binomial(2,0.5)
b.sample()
print(b.mean)  # mean
b=torch.distributions.Binomial(torch.tensor([100]),torch.tensor([0.2,0.4]))  
b.sample() # 100 trials each: success counts for probability 0.2 (first entry) and 0.4 (second entry)

tensor(1.)

tensor([29., 39.])

4.柯西分布

直接看公式吧，概率密度函数 $f(X;x,y)=\frac{1}{\pi}*\frac{y}{(X-x)^2+y^2}$ 其中的控制变量是x和y 所以需要两个输入

c=torch.distributions.Cauchy(0,1) #当x，y取0，1时以概率密度获得某个值
c.sample() #获得一个X值
c.entropy()# 熵值

tensor(2.5310)

5.卡方分布

卡方分布是由n个正态分布的平方叠加得到。n表示卡方分布的自由度。

import torch
c=torch.distributions.Chi2(4)  # 4 is the number of degrees of freedom n
c.mean  # the mean equals n
c.variance  # the variance equals 2*n

tensor(8.)

6.狄里克里分布

与beta分布类似只不过是多元参数。依然表示的是概率的概率

d=torch.distributions.Dirichlet(torch.tensor([0.5,0.5]))
d.sample()  #返回一个类似概率的值

tensor([0.5724, 0.4276])

7.指数分布

指数分布表示一种随时间变化的分布。表示某个量在某个时间间隔中发生的概率，预测机器故障率可以使用指数分布。指数分布的密度公式为： $f(y)=\lambda e^{-\lambda y},\ y\ge 0$，对应的分布函数为 $P(Y\le y)=1-e^{-\lambda y}$

e=torch.distributions.Exponential(2) #当λ取2时y的可能值
e.sample()

tensor(0.0071)

8.几何分布

几何分布就是伯努利试验中首次成功发生在第n次的分布（torch中的采样值是首次成功之前失败的次数，所以可能取到0）

# With success probability 0.5 per Bernoulli trial, sample when the first positive outcome occurs
# NOTE(review): the recorded sample below is 0, so the count appears to start at 0 - confirm against the Geometric docs
g=torch.distributions.geometric.Geometric(0.5)
g.sample()

tensor(0.)

9.泊松分布

一段时间内事件的发生次数，服从泊松分布。
密度函数一个过程中发生k次事件的概率
$P(X=k)=\frac{\lambda^k}{k!}*\exp^{-\lambda}$

#在给定λ的前提下一般会发生几次 。我们在使用泊松分布产生泊松随机数的时候使用，
#比如假设有10场球赛每场球赛的进去数可以用这个分布生成,
p=torch.distributions.Poisson(5)  
p.sample()
p.mean

tensor(5.)

10.均匀分布

给定分布的范围我们可以轻松获得一个均匀随机数

u=torch.distributions.Uniform(0,100)
u.sample()
u.entropy()

tensor(4.6052)

11.正态分布

需要给定均值和标准差（第二个参数scale是标准差，不是方差）

n=torch.distributions.Normal(10,1)
n.sample()

tensor(10.4415)

19.jit

这个文件主要是对模型进行编译处理。jit（just-in-time）意思是即时编译，也就是只在运行时编译代码

20.nn

这个是最重要的文件

1.参数模块

这个模块相当于定义一种数据类型Parameter，这种数据默认需要计算梯度（requires_grad=True），在计算过程中相当于节点，一般用在初始化模型阶段。

import torch
from collections import OrderedDict


class Parameter(torch.Tensor):
    """Tensor subclass whose instances require gradients by default.

    Args:
        data: tensor to wrap; an empty tensor is used when omitted.
        requires_grad: whether gradients are tracked for the parameter.
    """

    def __new__(cls, data=None, requires_grad=True):
        # Fall back to an empty tensor when no data is supplied.
        payload = torch.Tensor() if data is None else data
        return torch.Tensor._make_subclass(cls, payload, requires_grad)

    def __deepcopy__(self, memo):
        # memo is keyed by object id so that a shared parameter is copied only once.
        key = id(self)
        if key not in memo:
            memo[key] = type(self)(
                self.data.clone(memory_format=torch.preserve_format),
                self.requires_grad,
            )
        return memo[key]

    def __repr__(self):
        tensor_repr = super(Parameter, self).__repr__()
        return 'Parameter containing:\n' + tensor_repr

    def __reduce_ex__(self, proto):
        # See Note [Don't serialize hooks]
        rebuild_args = (self.data, self.requires_grad, OrderedDict())
        return (torch._utils._rebuild_parameter, rebuild_args)

import torch
class mode(torch.nn.Module):
    """Single linear layer (2 -> 1) whose weight and bias are overwritten with ones."""

    def __init__(self):
        super().__init__()
        # Model parameters are manipulated through the layer attribute.
        self.L1 = torch.nn.Linear(2, 1)
        ones_w = torch.ones_like(self.L1.weight.data)
        ones_b = torch.ones_like(self.L1.bias.data)
        self.L1.weight = torch.nn.Parameter(ones_w)
        self.L1.bias = torch.nn.Parameter(ones_b)

    def forward(self, x):
        return self.L1(x)


a = torch.rand(1, 2)
mo = mode()
out = mo(a)
out

tensor([[1.7614]], grad_fn=<AddmmBackward>)

2.函数模块

这个主要是一些常用的计算卷积梯度，损失函数等

1.分数阶最大池化

这个主要是针对输出形状不能被整数倍切分的池化。传统的池化通常是把尺寸缩小为整数分之一（例如缩小一半），输入尺寸必须满足整除的要求；分数阶池化则没有这个限制。我们只需要知道输入数据和输出数据的大小，同时设定池化核

# 2D fractional max pooling
import torch
a=torch.rand(2,16,23,27) 
out=torch.nn.functional.fractional_max_pool2d_with_indices(a,3,output_size=(10,11))
out[0].shape # out is a tuple: the pooled data and the position indices
# 3D fractional max pooling
a=torch.rand(2,2,16,27,24)
out=torch.nn.functional.fractional_max_pool3d_with_indices(a,3,output_size=(3,11,16))
out[0].shape

torch.Size([2, 2, 3, 11, 16])

2.池化下采样

包含最大池化和均值池化。我们需要明确池化核、步长和padding。nn模块中的池化对象就是通过调用这些方法实现池化的，具体会在后面介绍池化对象的操作

import torch
#一维池化
a=torch.rand(2,5,56) #2个数据每个数据点是一个5维向量，这个数据长读为56
out=torch.nn.functional.max_pool1d(a,3,1,1)
out.shape
#二维池化
a=torch.rand(2,32,16) #2个数据每个数据的大小是32*16
out=torch.nn.functional.max_pool2d(a,(3,2),(1,2),(1,0)) #我们通过卷积核步长padding可以计算出输出的大小
out.shape
#三维池化
a=torch.rand(2,32,34,34) #2个数据每个数据大小是32*34*34
out=torch.nn.functional.max_pool3d(a,(3,2,2),(1,2,2),(1,0,0)) #我们通过卷积核步长padding可以计算出输出的大小
out.shape

torch.Size([2, 32, 17, 17])

3.power-average pooling

这个池化讲的是先将数据进行求p次方再均值池化

def lp_pool2d(input, norm_type, kernel_size, stride=None, ceil_mode=False):
    """Power-average pooling: raise to ``norm_type``, average-pool, undo the power.

    The averaged values are multiplied by the window size (``kw * kh``) to
    turn the mean back into a sum before the ``1 / norm_type`` root is taken.
    """
    kw, kh = utils._pair(kernel_size)
    powered = input.pow(norm_type)
    if stride is None:
        out = avg_pool2d(powered, kernel_size, padding=0, ceil_mode=ceil_mode)
    else:
        out = avg_pool2d(powered, kernel_size, stride, 0, ceil_mode)

    window = kw * kh
    signed = torch.sign(out) * relu(torch.abs(out))
    return signed.mul(window).pow(1. / norm_type)

a=torch.linspace(1,16,16).view(1,4,4)
out=torch.nn.functional.lp_pool2d(a,2,2,2)
out

tensor([[[ 8.1240, 11.7473],
         [23.3666, 27.3130]]])

5.自适应池化

可以自动匹配输出的大小

import torch
a=torch.rand(1,2,4,4)
out=torch.nn.functional.adaptive_avg_pool2d(a,[2,3]) #必须定义输出大小
out

tensor([[[[0.6362, 0.5823, 0.5884],
          [0.4093, 0.2965, 0.3800]],

         [[0.4667, 0.5804, 0.5920],
          [0.5652, 0.7996, 0.6868]]]])

6.dropout

目的是随机使输入变为0 ，使用伯努利分布以概率p将数据变为0。p是变为0的概率。这个过程中其他不为0的数据也会改变，改变的目的是保证分布的可靠性。改变的方法是原始数据乘$\frac{1}{1-p}$

import torch
a=torch.rand(5,5)
out=torch.nn.functional.dropout(a,0.2)
print(a)
out

tensor([[0.2352, 0.1480, 0.6943, 0.7124, 0.2272],
        [0.3813, 0.1807, 0.6844, 0.3258, 0.9784],
        [0.4323, 0.7016, 0.2289, 0.6965, 0.4892],
        [0.4658, 0.0834, 0.5206, 0.2358, 0.2623],
        [0.1379, 0.4536, 0.4010, 0.0882, 0.4479]])





tensor([[0.2941, 0.1851, 0.8679, 0.8905, 0.2840],
        [0.4767, 0.2259, 0.8555, 0.4072, 1.2230],
        [0.5404, 0.8770, 0.2861, 0.0000, 0.6115],
        [0.5823, 0.1043, 0.6507, 0.2947, 0.3279],
        [0.1724, 0.5671, 0.5012, 0.1102, 0.5598]])

out=torch.nn.functional.alpha_dropout(a,0.8,training=True) #对于0均值单位方差的数据这个方法可以保证数据的分布不会改变

a=torch.rand(5,5)
drop=torch.nn.AlphaDropout(0.2) #这个类中会调用torch.nn.functional.alpha_dropout这个方法
drop(a)

tensor([[ 0.6493,  0.5251,  0.8388,  1.0482,  0.6299],
        [ 1.1602,  1.0415,  0.3951, -1.2362,  0.3714],
        [ 0.4936,  0.4722, -1.2362,  0.7264,  0.9774],
        [-1.2362,  0.6666,  0.4255,  1.0837,  0.4742],
        [-1.2362,  0.3625,  0.8958,  0.4409,  0.3659]])

a=torch.rand(3,5,5)
torch.nn.functional.dropout3d(a,0.5)  #以伯努利分布概率为p的方法随机是通道上数据变为0

tensor([[[0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1649, 0.9383, 1.6088, 0.4794, 0.0733],
         [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [1.8181, 1.9892, 0.4018, 1.6937, 1.8324]],

        [[0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.7860, 0.2763, 1.6730, 1.5036, 0.8810],
         [0.7210, 0.1779, 0.0377, 0.8876, 0.1604],
         [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.5841, 0.0303, 0.9604, 1.2321, 1.3056]],

        [[1.5632, 1.0990, 0.6627, 1.2321, 1.7645],
         [0.4348, 0.5744, 1.3716, 0.5088, 1.3694],
         [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.6206, 1.1528, 1.6294, 0.8692, 0.0500],
         [1.1383, 0.5124, 0.4202, 0.8069, 0.9987]]])

7.根据阈值过滤数据

对于一个数据，我们通过设定阈值和替代值来过滤

import torch
a=torch.rand(4,4)-0.5
torch.nn.functional.threshold(a,0,10)  #将数据，以阈值0过滤，小于0的用10补全

tensor([[10.0000,  0.1467, 10.0000, 10.0000],
        [10.0000,  0.1929,  0.4668,  0.1881],
        [ 0.3089, 10.0000,  0.4236, 10.0000],
        [10.0000, 10.0000, 10.0000, 10.0000]])

\[y = \begin{cases} x, &\text{ if } x > \text{threshold} \\ \text{value}, &\text{ otherwise } \end{cases}\]

8.ReLU

这里有个操作是in-place操作。默认不是in-place

import torch
a=torch.rand(2,2)-0.5
torch.nn.functional.relu(a)

tensor([[0.0000, 0.0000],
        [0.0000, 0.2612]])

\[\text{ReLU}(x)= \max(0, x)\]

9.glu

门控函数，这个函数对输出做处理首先在某个维度上将数据切分成两份。a和b然后进行计算操作符表示对应位置相乘 $\text{GLU}(a, b) = a \otimes \sigma(b)$ $\sigma(b)=\frac{1}{1+e^{-b}}$

#默认维度为-1
import torch
a=torch.linspace(1,4,4).view(2,2)
print(torch.nn.functional.glu(a))
a[:,0].mul(1/(1+torch.exp(-a[:,1])))

tensor([[0.8808],
        [2.9460]])

tensor([0.8808, 2.9460])

10.softmax

这里我们需要注意的是维度，在哪个维度上进行计算需要明确 $\text{Softmax}(x_{i}) = \frac{\exp(x_i)}{\sum_j \exp(x_j)}$

import torch
a=torch.linspace(1,4,4).view(2,2)
print(torch.nn.functional.softmax(a,1))#1表示对列进行计算，计算结果在行上加和为1

tensor([[0.2689, 0.7311],
        [0.2689, 0.7311]])

11.log_softmax

这个会作为最后的损失计算。只softmax的基础上求log值

import torch
a=torch.linspace(1,4,4).view(2,2)
print(torch.nn.functional.log_softmax(a,1))#1表示对列进行计算，计算结果在行上加和为1
torch.log(torch.nn.functional.softmax(a))

tensor([[-1.3133, -0.3133],
        [-1.3133, -0.3133]])

<ipython-input-22-94706291faa6>:4: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.
  torch.log(torch.nn.functional.softmax(a))

tensor([[-1.3133, -0.3133],
        [-1.3133, -0.3133]])

12.tanh

常用在图像生成的最后一层激活上。我们不需要考虑维度，因为每个元素都会单独计算 $\tanh(x) = \frac{\exp(x) - \exp(-x)}{\exp(x) + \exp(-x)}$

import torch
a=torch.linspace(1,4,4).view(2,2)
print(torch.nn.functional.tanh(a))

tensor([[0.7616, 0.9640],
        [0.9951, 0.9993]])

13.sigmoid

没什么好说的 $\text{Sigmoid}(x) = \frac{1}{1 + \exp(-x)}$

import torch
a=torch.linspace(1,4,4).view(2,2)
print(torch.nn.functional.sigmoid(a)) 
# torch.sigmoid() can usually be used instead

tensor([[0.7311, 0.8808],
        [0.9526, 0.9820]])

14.linear

线性模块也就是神经网络的计算过程。需要输入、权重和偏置。需要注意的是输入与W的转置相乘 $y = xW^T + b$

import torch
a=torch.linspace(1,4,4).view(2,2)
print(torch.nn.functional.linear(a,a)) 

tensor([[ 5., 11.],
        [11., 25.]])

15.bilinear

双线性变换。需要两个输入、一个权值和一个偏置 $y = x_1^T W x_2 + b$

import torch
a=torch.linspace(1,4,4).view(2,2)
w=torch.rand(10,2,2) #W需要设定我们想要的输出维度
print(torch.nn.functional.bilinear(a,a,w)) 

tensor([[ 1.8529,  5.7235,  5.2965,  4.9167,  5.2348,  2.2636,  3.0703,  4.3109,
          2.2417,  4.9606],
        [11.5514, 30.9755, 30.2340, 24.8662, 25.6934, 11.2365, 18.4914, 24.9759,
         11.7829, 30.5966]])

16.embedding

词嵌入，这看上去高大上的东西是什么呢？简单说就是一个通过索引查表的映射矩阵。首先需要一个词库，假设这个词库包含10个词，对词库排序后每个词对应一个序号。有一句话用到了其中的5个词，用它们的序号构成一个向量。此时我们想将每个词映射成三维向量需要做什么呢？首先初始化一个10*3的矩阵，然后根据词的索引去这个矩阵中取对应位置的行向量，就完成了映射

import torch
w=torch.linspace(1,30,30).view(10,3)
a=torch.tensor([1,3,2,5,3])
torch.nn.functional.embedding(a,w)  #这就是这句话的映射向量。在模型中这个w需要训练才会起到相应作用

tensor([[ 4.,  5.,  6.],
        [10., 11., 12.],
        [ 7.,  8.,  9.],
        [16., 17., 18.],
        [10., 11., 12.]])

17.embedding_bag

import torch
w=torch.linspace(1,30,30).view(10,3)
a=torch.tensor([1,3,2,5,3])
offsets = torch.tensor([0,2,3])
torch.nn.functional.embedding_bag(a,w,offsets) 

tensor([[ 7.,  8.,  9.],
        [ 7.,  8.,  9.],
        [13., 14., 15.]])

18.batch_norm

通常情况下我们会对数据的第2个维度进行标准化

# batch_norm normalizes over dimension 1 (channels), so running_mean / running_var need one
# entry per channel; the input therefore has 3 channels to match the 3-element statistics.
a=torch.rand(2,3,4)
torch.nn.functional.batch_norm(a,torch.tensor([1.,1.,1]),torch.tensor([1.,1.,1]))

tensor([[[-0.8834, -0.7371, -0.1310, -0.1510],
         [-0.1882, -0.8981, -0.3473, -0.8744],
         [-0.2580, -0.6774, -0.6342, -0.9110]],

        [[-0.1642, -0.1350, -0.2202, -0.1952],
         [-0.0282, -0.0168, -0.1113, -0.6562],
         [-0.3089, -0.7730, -0.3884, -0.3499]]])

19.ctc损失

这个损失特点很强。首先说它是干嘛的：用来衡量两个序列的相似度。首先RNN的输出个数要大于实际序列的长度，通过转化RNN输出可以得到一个目标序列，也就是多对一的问题：有一个目的地，但是有多条路线，而且这些路线都正确，每条路线的概率之和就是获得这个结果的概率。我们需要最大化这个概率，这就是ctc损失

# Image data is laid out as (N, C, H, W): batch size, channels, height, width
# Sequence data is laid out as (T, N, C): sequence length, batch size, vocabulary size
log_probs = torch.randn(50, 16, 20).log_softmax(2)  # RNN output; T=50 must cover input_lengths below
targets = torch.randint(1, 20, (16, 30), dtype=torch.long)  # padded targets, at most 30 labels each; index 0 is left for the blank
input_lengths = torch.full((16,), 50, dtype=torch.long)
target_lengths = torch.randint(10,30,(16,), dtype=torch.long)
loss =torch.nn.functional.ctc_loss(log_probs, targets, input_lengths, target_lengths)

20.负对数损失

对于分类任务我们可以使用这个损失，其实就是输出概率的最大似然， $\ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad l_n = - w_{y_n} x_{n,y_n}, \quad w_{c} = \text{weight}[c] \cdot \mathbb{1}\{c \not= \text{ignore\_index}\},$ $\ell(x, y) = \begin{cases} \sum_{n=1}^N \frac{1}{\sum_{n=1}^N w_{y_n}} l_n, & \text{if reduction} = \text{'mean';}\\ \sum_{n=1}^N l_n, & \text{if reduction} = \text{'sum'.} \end{cases}$

import torch
a=torch.rand(4,5)#有四个样本。有5种类别
b=torch.randint(0,4,(4,)) #每个样本的类别是索引
torch.nn.functional.nll_loss(torch.nn.functional.log_softmax(a,1),b)

tensor(1.5282)

21.KL散度

衡量两个分布的相似程度， $l(x,y) = L = \{ l_1,\dots,l_N \}, \quad l_n = y_n \cdot \left( \log y_n - x_n \right)$

a=torch.rand(2,3)
b=torch.rand(2,3)
# kl_div takes log-probabilities as its first argument and probabilities as its second.
# Passing raw random values (as this cell originally did) is what yields the negative
# "divergence" in the recorded output below; a true KL divergence is never negative.
kl=torch.nn.functional.kl_div(torch.log_softmax(a,1),torch.softmax(b,1),reduction='batchmean')
kl

tensor(-0.5370)

22.交叉熵

和我们理解的不太一样。数学上我们需要再乘一个类别值。第二个公式表示有权重的计算，权重用于类别不均衡的情况。在代码计算过程中也没有遵循这个计算过程，而是先对输入做log_softmax，再返回负对数损失(nll_loss)的值

def cross_entropy(input, target, weight=None, size_average=None, ignore_index=-100,
                  reduce=None, reduction='mean'):
    """Cross entropy computed as ``nll_loss`` applied to ``log_softmax(input, 1)``.

    When either legacy flag (``size_average`` / ``reduce``) is given, it is
    translated into the ``reduction`` string first.
    """
    legacy_args_given = not (size_average is None and reduce is None)
    if legacy_args_given:
        reduction = _Reduction.legacy_get_string(size_average, reduce)
    log_probs = log_softmax(input, 1)
    return nll_loss(log_probs, target, weight, None, ignore_index, None, reduction)

$\text{loss}(x, class) = -\log\left(\frac{\exp(x[class])}{\sum_j \exp(x[j])}\right) = -x[class] + \log\left(\sum_j \exp(x[j])\right)$ $\text{loss}(x, class) = weight[class] \left(-x[class] + \log\left(\sum_j \exp(x[j])\right)\right)$

import torch
a=torch.rand(4,5) # raw scores for 4 samples over 5 classes
b=torch.randint(0,5,(4,)) # class index of each sample, drawn from all 5 classes
# cross_entropy applies log_softmax itself, so it is given the raw scores;
# applying softmax beforehand would normalize twice.
loss=torch.nn.functional.cross_entropy(a,b)
loss

tensor(1.5893)

23.二分类交叉熵

二分类交叉熵可以定义成 sigmoid输出的交叉熵 $\ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad l_n = - w_n \left[ y_n \cdot \log x_n + (1 - y_n) \cdot \log (1 - x_n) \right],$

a=torch.rand(100,1) #一个输出的二分类神经网络，有100个数据
b=torch.rand(100,1)
torch.nn.functional.binary_cross_entropy(a,b)

tensor(1.0008)

24.L1损失

也叫绝对值损失，多用在回归任务中。我们也可以把分类任务的目标数据索引化后作为回归目标来使用

a=torch.rand(4,4)
b=torch.randint(0,3,(4,4))
torch.nn.functional.l1_loss(a,b)

tensor(0.9287)

25.L2损失平方差损失

a=torch.rand(4,4)
b=torch.randint(0,3,(4,4))
torch.nn.functional.mse_loss(a,b)

tensor(1.4617)

26.还有一些多标签的损失一个结果中的输出有几个都属对的

27.插值

所谓插值就是采样，会改变数据的大小，当数据大小变化的时候我们以何种方式补全数据就是插值方法。有几点需要注意：首先不在通道维度和batch-size维度上采样，所以当输入是4维的时候size只需要设置2个值

import torch
a=torch.rand(4,4,4,4) 
torch.nn.functional.interpolate(a,size=(2,1),mode='nearest').shape

torch.Size([4, 4, 2, 1])

3.模型类模块

这个模块是个符合面向对象编程的模块，将很多方法抽象成类。我们在构建模型的时候继承基础的类就可以了。假如我们有个网络架构，这个架构已经想好，何时用卷积、何时求损失等等，我们只需要继承基础类，在这些类的基础上进行扩展即可。这个要分析的模块是torch的灵魂，我们构建模型主要是通过这个模块来实现。其中定义了很多基础类，包括卷积、激活、损失等等。一个模型的整个过程都可以在这个包中找到

1.激活函数

这个模块下主要是一些激活函数类，这些类的实现主要是调用nn.functional下的方法。同时我们需要首先继承module基础类，这个基础类主要是存一些带有参数的激活函数的参数。同时设定一些可操作方法，在这个基础类种我们需要复写初始化函数并继承基础类的初始化，还需要复写forward函数。也就是前向传播

1.阈值激活

这个函数我们需要设定阈值和一个value值 $y = \begin{cases} x, &\text{ if } x > \text{threshold} \\ \text{value}, &\text{ otherwise } \end{cases}$

import torch.nn as nn
import torch
at=nn.Threshold(0.1,1)
a=torch.rand(2,3)-0.5
at(a) #当值小于0.1的时候我们使用1代替

tensor([[1.0000, 0.4016, 0.3524],
        [1.0000, 0.4279, 1.0000]])

2.ReLu激活

当x取值小于0的时候使用0代替当x大于0的时候返回原值 $\text{ReLU}(x)= \max(0, x)$

import torch.nn as nn
import torch
at=nn.ReLU()
a=torch.rand(2,3)-0.5
at(a) #当值小于0的时候我们使用0代替

tensor([[0.1606, 0.0000, 0.0000],
        [0.3244, 0.3049, 0.3659]])

3.Sigmoid

我们可以认为大写的都是类小写的都是方法，类通过方法实现相应的功能。在构建模型输出的时候将输出数据通过这个函数。这些方法都要继承 module父类，主要原因是module类可以将函数加在某个节点上。节点的输出结果会包含这个函数的导数信息。 $\text{Sigmoid}(x) = \frac{1}{1 + \exp(-x)}$

import torch.nn as nn
import torch
at=nn.Sigmoid()
a=torch.rand(2,3)-0.5
at(a) 

tensor([[0.4678, 0.5581, 0.3934],
        [0.6086, 0.5408, 0.4683]])

4.Tanh

$\text{Tanh}(x) = \tanh(x) = \frac{e^x - e^{-x}} {e^x + e^{-x}}$

import torch.nn as nn
import torch
at=nn.Tanh()
a=torch.rand(2,3)-0.5
at(a) 

tensor([[ 0.2292, -0.3429, -0.2950],
        [ 0.3734, -0.1245,  0.0764]])

5.SELU

重点说一下这个函数，这个函数具有非常好的性质，不会影响输入数据的分布，尤其是正态分布。这个激活非常适合全连接网络同时它有两个参数 $\text{SELU}(x) = \text{scale} * (\max(0,x) + \min(0, \alpha * (\exp(x) - 1)))$ $\alpha = 1.6732632423543772848170429916717$ $\text{scale} = 1.0507009873554804934193349852946$

import torch.nn as nn
import torch
at=nn.SELU()
a=torch.randn(2,3)
at(a).mean() #在小数据量上有影响

tensor(0.6070)

6.LeakyReLU

这个也比较常用，他需要一个参数通常这个参数我们传0.2 $\text{LeakyRELU}(x) = \begin{cases} x, & \text{ if } x \geq 0 \\ \text{negative\_slope} \times x, & \text{ otherwise } \end{cases}$

import torch.nn as nn
import torch
at=nn.LeakyReLU(0.2)
a=torch.randn(2,3)
at(a) 

tensor([[-0.1619, -0.2191,  1.3814],
        [-0.0629,  1.4432,  0.7876]])

7.Softmax

比较常用。还有几个不常用的就不介绍了，不常用的都是为发论文而产生的，在实际应用中并没有比这几个好多少 $\text{Softmax}(x_{i}) = \frac{\exp(x_i)}{\sum_j \exp(x_j)}$

import torch.nn as nn
import torch
at=nn.Softmax(1) #这里我们需要确定一个维度，也就是说在哪个维度上进行计算
a=torch.randn(2,3)
at(a) 

tensor([[0.0496, 0.9057, 0.0447],
        [0.7429, 0.0896, 0.1675]])

2.标准化

标准化就是对数据减均值除标准差，这是理论上的标准化。在实际中一个批次数据过来后，我们首先计算样本的均值和方差，然后得到变换后的数据。同时为了保证数据不会出现过大的偏移，我们还需要再乘一个v再加一个m，此时再将数据输入到下一单元。这是实际上标准化的操作。v（weight）和m（bias）是带初始值、需要进行训练的参数（从下面的输出可以看到bias初始为0）。

#一维标准化，针对序列
import torch.nn as nn
import torch
at=nn.BatchNorm1d(5) #这里我们需要确定一个个数表示标准化的数据的个数
a=torch.linspace(1,10,10).view(2,5)
at(a) 
at.weight #待训练值
at.bias

Parameter containing:
tensor([0., 0., 0., 0., 0.], requires_grad=True)

#二维标准化针对图像
 #3表示三通道一张图片每个通道都 有个均值和方差
at=nn.BatchNorm2d(3) #这里我们需要确定一个个数表示标准化的数据的个数
a=torch.linspace(1,300,300).view(1,3,10,10)
at(a) 
at.weight #待训练值
at.bias

Parameter containing:
tensor([0., 0., 0.], requires_grad=True)

3.模型容器

所谓模型容器，就是我们可以将模型存放再一个容器中，这个容器具有模型不具备的方法和操作

1.Sequential

这个容器包含的模型具有顺序性，也就是说我们在运行这个容器的时候，容器中的内容会顺序执行。这个类包含有__setitem__()和__getitem__()方法，所以我们可以用列表的方式进行访问

import torch
import torch.nn as nn
c_mode=nn.Sequential(nn.Linear(3,4),nn.Linear(4,1))
a=torch.rand(2,3) #两个数据 每个数据大小是3
c_mode(a) #我们可以看到数据从容器中的模型是依次通过。

tensor([[ 0.0203],
        [-0.1002]], grad_fn=<AddmmBackward>)

c_mode[-1]=nn.Linear(4,1)

2.ModuleList

与Sequential相似只不过这个是不能执行的他的性质和列表一样，与Sequential不同的是不能直接添加模型

import torch
import torch.nn as nn
m_mode=nn.ModuleList()
m_mode.append(nn.Linear(3,4)) # elements are added by hand; the container can be treated like a list
m_mode.append(nn.Linear(4,1))
a=torch.rand(2,3) # two samples, each of size 3
m_mode[0](a) # index into the list and call only the first sub-module on the data

tensor([[-0.2279, -0.2001,  0.6203, -0.0337],
        [-0.2635, -0.2784,  0.6427, -0.0402]], grad_fn=<AddmmBackward>)

4.卷积

这里主要有两种操作，1是卷积2是反卷积

1.卷积

我就不明说了主要说说他的使用规则
我们定义数据的方式是(N，C，H，W) 分别是样本个数，样本通道数，样本的高和宽。这个方式与RNN定义数据的方式有些不同。RNN默认会将样本个数放在第二个维度上

# 1D convolution
import torch
import torch.nn as nn
conv1=nn.Conv1d(2,16,2,2,0) # 2 input channels, 16 output channels, kernel size 2, stride 2, padding 0
a=torch.rand(2,2,16) # 2 samples, each with 2 channels and length 16
conv1(a).shape

torch.Size([2, 16, 8])

在卷积中我们有个分组的概念，分组卷积可以减少参数量。假如我们有一个样本，大小为64*128*128，也就是说有64个通道。此时我们想生成128通道的数据。我们需要做的是创建128个卷积核，每个卷积核大小为64*3*3，所以参数量为 128*64*3*3。当我们对数据进行分组后就不一样了：假如分两组，每组输入的大小为32*128*128，每组生成64个通道，合并后得到128通道，所以每组的卷积核是64*32*3*3，总参数为2*64*32*3*3，相比原始的方法参数量减少了一半

## 二维卷积
##我们输入的卷积核是个int值其实在后面会变成一个元组，在样本大小的维度上进行计算
import torch
import torch.nn as nn
conv1=nn.Conv2d(3,64,3,1,1) #输入通道是3输出通道64，卷积核大小是3，步长是1，
a=torch.rand(1,3,8,8) #1个样本每个样本有3个通道，每个样本大小为8*8
conv1(a).shape   

torch.Size([1, 64, 8, 8])

## 三维卷积
##实际涨都会变成2维卷积
import torch
import torch.nn as nn
conv1=nn.Conv3d(3,64,3,1,1) #输入通道是3输出通道64，卷积核大小是3，步长是1，
a=torch.rand(1,3,5,8,8) #1个样本每个样本有3个通道，通道的深度为5每个样本大小为8*8
conv1(a).shape 

torch.Size([1, 64, 5, 8, 8])

2.反卷积

所谓反卷积就是小变大的过程，具体实现是首先对特征图进行插值变化，生成新图，然后对卷积核参数变化。最后再进行卷积计算。说一下输出的大小计算方法
$H_{out} = (H_{in} - 1) \times \text{stride}[0] - 2 \times \text{padding}[0] + \text{dilation}[0] \times (\text{kernel\_size}[0] - 1) + \text{output\_padding}[0] + 1$

\[W_{out} = (W_{in} - 1) \times \text{stride}[1] - 2 \times \text{padding}[1] + \text{dilation}[1] \times (\text{kernel\_size}[1] - 1) + \text{output\_padding}[1] + 1\]

## 2D transposed convolution
import torch
import torch.nn as nn  # the alias is required: the code below refers to nn.ConvTranspose2d
t_conv2=nn.ConvTranspose2d(64,3,3,2,1)  # 64 input channels, 3 output channels, kernel size 3, stride 2, padding 1
a=torch.rand(1,64,5,5)
t_conv2(a).shape  # per the formula above: (5-1)*2 - 2*1 + 1*(3-1) + 0 + 1 = 9

torch.Size([1, 3, 9, 9])

\[H_{out} = (H_{in} - 1) \times \text{stride}[0] - 2 \times \text{padding}[0]\]

5.距离

计算向量的距离。有一点需要注意：我们计算的是向量之间的距离，有两种方法。向量距离（p范数距离），其中x为两个向量对应位置的差值 $\Vert x \Vert _p = \left( \sum_{i=1}^n \vert x_i \vert ^ p \right) ^ {1/p}.$ 余弦相似度 $\text{similarity} = \dfrac{x_1 \cdot x_2}{\max(\Vert x_1 \Vert _2 \cdot \Vert x_2 \Vert _2, \epsilon)}$

a=torch.rand(2,10) #两条数据，每条数据长10
b=torch.rand(2,10)
d=nn.PairwiseDistance(1)
d(a,b)

tensor([2.4561, 3.3524])

a=torch.rand(2,10) #两条数据，每条数据长10
b=torch.rand(2,10)
d=nn.CosineSimilarity(1)#这里的参数是维度。考虑维度
d(a,b)

tensor([0.7062, 0.7854])

6.Dropout

前面已经详细分析过，这里需要注意：当模型包含dropout的时候，我们在测试阶段需要将模型设为测试模式（eval），此时dropout失去作用

a=torch.rand(2,2)
drop=nn.Dropout(0.5)
drop(a)

tensor([[0.0000, 0.4365],
        [0.0000, 0.0000]])

7.Flatten

也就是拉平，这里可以拉平在内存空间中连续的数据，然后同样生成连续的内存空间中的一维数据。这里连续很重要，在很多模型中只接收在内存中连续的数据，有些操作会把数据变得不连续比如转置，维度变化等

a=torch.rand(2,3,4)#在内从中是横向排列的，

c=a.permute(1,0,2).contiguous() #交换某个维度维度,会导致内存不连续 加上contiguous()可以使空间连续
f=nn.Flatten(0) #这里的参数0表示拉平
f(c)

tensor([0.6422, 0.9726, 0.2726, 0.2533, 0.9132, 0.2305, 0.7615, 0.1440, 0.5451,
        0.4460, 0.4498, 0.2538, 0.0158, 0.1864, 0.1156, 0.8761, 0.8167, 0.9152,
        0.2738, 0.1140, 0.1241, 0.4790, 0.4622, 0.8811])

8.Linear

线性模块。也就是神经元模块

import torch
import torch.nn as nn
L=nn.Linear(2,3)#输入两个输出3个
L.weight

Parameter containing:
tensor([[ 0.5858,  0.3569],
        [ 0.5500, -0.1393],
        [-0.6247, -0.1912]], requires_grad=True)

9.损失计算

主要介绍几个常用的损失函数的使用原理、应用场景和注意事项

1.L1损失

这是个没有参数的损失，会让权值变得稀疏，也就是说有些权值会变成0 $\ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad l_n = \left| x_n - y_n \right|,$

# 两个矩阵大小一样 维度一样，或者可以广播成一样维度
import torch
import torch.nn as nn
a= torch.linspace(1,4,4).view(2,2)
b=torch.linspace(2,4,2).view(2,1)
Loss_f=nn.L1Loss()
Loss_f(a,b)

tensor(0.5000)

2.负对数损失

用在分类上。假如我们有3个类别，也就是三个输出，目标值是其中一个类别的索引。输入需要是对数概率（例如log_softmax的输出） $\ell(x, y) = \begin{cases} \sum_{n=1}^N \frac{1}{\sum_{n=1}^N w_{y_n}} l_n, & \text{if reduction} = \text{'mean';}\\ \sum_{n=1}^N l_n, & \text{if reduction} = \text{'sum'.} \end{cases}$

a=torch.rand(1,3)
# NLLLoss expects log-probabilities. Feeding plain softmax probabilities (as this cell
# originally did) returns -p, which is why the recorded loss below is negative.
a=torch.log_softmax(a,-1)

b=torch.tensor([0])
NL_f=nn.NLLLoss()
loss=NL_f(a,b)
loss

tensor(-0.4717)

tensor([[0.4717, 0.2668, 0.2615]])

3.平方差损失

这里注意的是在实现上没有乘1/2 $\ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad l_n = \left( x_n - y_n \right)^2,$

#对应位置元素计算
a=torch.rand(2,3)
b=torch.rand(2,3)
Mse=nn.MSELoss()
Mse(a,b)

tensor(0.2526)

#等价于这个
sum(sum((a-b)*(a-b)))/6

tensor(0.2526)

4.二分类交叉熵损失

两个类别。输出表示概率所以要求目标值是浮点型数据 W表示每种样本的分布情况 $\ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad l_n = - w_n \left[ y_n \cdot \log x_n + (1 - y_n) \cdot \log (1 - x_n) \right]$

#对应位置的元素进行计算。
import torch
import torch.nn as nn
a=torch.rand(2,3) #表示样本的概率
b=torch.rand(2,3)
Loss_f=nn.BCELoss()
Loss_f(a,b)

tensor(0.7587)

-sum(sum(torch.log(a)*b+(1-b)*torch.log(1-a)))/6 

tensor(0.7587)

5.交叉熵损失

在分类任务中多个类别同时每个类别会有不同的占比可以使用这个函数。
输入需要满足索引对应。加入有3个输出，我们的标签就是这三个的位置索引,在代码上其实并不是按公式来计算交叉熵的。而是又加了一个log_softmax $\text{loss}(x, class) = -\log\left(\frac{\exp(x[class])}{\sum_j \exp(x[j])}\right) = -x[class] + \log\left(\sum_j \exp(x[j])\right)$

\[\text{loss}(x, class) = weight[class] \left(-x[class] + \log\left(\sum_j \exp(x[j])\right)\right)\]

#对应位置的元素进行计算。

import torch
import torch.nn as nn
a=torch.rand(2,3) #表示两个数据。每种数据可能是三个类别
b=torch.randint(0,3,(2,)) #这里需要注意是一维的。
Loss_f=nn.CrossEntropyLoss()
Loss_f(a,b)

tensor(0.9481)

6.CTC损失

是一种多对一的损失计算。我输出有很多，但是经过一种固定的变化会变化成一个我们想要的输出。把所有的输出概率加起来就是我们需要计算梯度的量。

T = 50 #输出序列的长度，也就是步长。
C = 20 #类别是词库大小包括空格
N = 16 #一共又多少个样本
S = 30 #目标序列的最大长度
S_min = 10 #目标序列的最小长度
#如何理解input呢，T表示了又多少个RNN的块，N表示一次多少样本，C表示一个RNN快输出是多大也就是词库多大
input = torch.randn(T, N, C).log_softmax(2).detach().requires_grad_() 
#目标值就是字典的索引，N表示有多少样本，S表示一个目标值的最大长度，我们知道对于句子
#目标句子有的长 5个字有的长7个字，我们的限制条件是不能超过最大长度，同时我们需要将
#5个字或者7个子都padding成最大长度
target = torch.randint(low=1, high=C, size=(N, S), dtype=torch.long)
#输入的长度表述 有N个样本每个样本的长度是T
input_lengths = torch.full(size=(N,), fill_value=T, dtype=torch.long)
#目标的长度表示，有N个样本，每个样本根据实际情况确定长度是多少，为了截断被padding的目标序列
target_lengths = torch.randint(low=S_min, high=S, size=(N,), dtype=torch.long)
ctc=nn.CTCLoss()
ctc(input,target,input_lengths,target_lengths)

tensor(6.2789, grad_fn=<MeanBackward0>)

10.循环神经网络

1.RNN

主要说一下RNN的使用规则、注意事项和它的变形。由于RNN的结构，我们在计算过程中为了保证可以实现批量计算，需要将样本个数的维度向后移动。在RNN中有步长的概念，要区分开步长和批次，避开卷积的影响。先简单说一下RNN，所谓RNN就是有前后关系的模型。
$h_t = \text{tanh}(W_{ih} x_t + b_{ih} + W_{hh} h_{(t-1)} + b_{hh})$ $y_t=W_{yh}*h_t$ 权值共享体现在每个步长的数据都对应相同的$W_{ih},W_{hh},W_{yh}$ 也就是说有多少个RNN块都没关系，都对应相同的权值。

import torch.nn as nn
import torch
a=torch.rand(2,1,10)#这里是一个样本，每个样本的长度是2，一个步长对象的特征10维数据，比如一句话，有2个字每个字用向量表示维一个10维向量
Rnn=nn.RNN(10,3,1)
Rnn(a)#返回值有两个第一个是输出第二个是中间量

(tensor([[[-0.8076,  0.7770,  0.8194]],
 
         [[-0.8560,  0.9246,  0.7603]]], grad_fn=<StackBackward>),
 tensor([[[-0.8560,  0.9246,  0.7603]]], grad_fn=<StackBackward>))

#来模拟一下计算过程
wh=Rnn.weight_hh_l0
wi=Rnn.weight_ih_l0
bi=Rnn.bias_ih_l0
bh=Rnn.bias_hh_l0
# 可以明确的是该RNN模型输出大小是2个步长，每个步长输出大小是3维

ht=torch.tanh(torch.matmul(a[0],wi.t())+bi+bh) #第一步输出，也是ht 默认第一步没有h0 所以用0代替h0
torch.tanh(torch.matmul(a[1],wi.t())+bi+torch.matmul(ht,wh.t())+bh)#第二步的输出可以看到同上一样

tensor([[-0.2034, -0.7251,  0.2122]], grad_fn=<TanhBackward>)

#对于多层的RNN模拟一下
 #对于两层的RNN权值共享只在横向上进行纵向不共享。所以参数会翻倍
a=torch.rand(2,1,10) #有一句话，这句话有两个字，每个字用10维向量表示
Rnn=nn.RNN(10,3,3) 
Rnn(a)

(tensor([[[-0.7234, -0.3822,  0.2843]],
 
         [[-0.4397, -0.0940,  0.7035]]], grad_fn=<StackBackward>),
 tensor([[[ 0.4449,  0.5662,  0.2119]],
 
         [[ 0.6159,  0.1632,  0.6264]],
 
         [[-0.4397, -0.0940,  0.7035]]], grad_fn=<StackBackward>))

Rnn.weight_hh_l2 #之所以是3*3是因为 为了匹配输出要固定好每层的输出最后的输出与前一层的输出相同

Parameter containing:
tensor([[-0.3290, -0.3179, -0.1498],
        [-0.3062, -0.2480, -0.1324],
        [-0.4643, -0.2216,  0.1468]], requires_grad=True)

2.LSTM

与RNN差不多，只不过计算过程不太一样，会多一个控制量。同时还增加了一个双向的概念 $\begin{array}{ll} \\ i_t = \sigma(W_{ii} x_t + b_{ii} + W_{hi} h_{(t-1)} + b_{hi}) \\ f_t = \sigma(W_{if} x_t + b_{if} + W_{hf} h_{(t-1)} + b_{hf}) \\ g_t = \tanh(W_{ig} x_t + b_{ig} + W_{hg} h_{(t-1)} + b_{hg}) \\ o_t = \sigma(W_{io} x_t + b_{io} + W_{ho} h_{(t-1)} + b_{ho}) \\ c_t = f_t * c_{(t-1)} + i_t * g_t \\ h_t = o_t * \tanh(c_t) \\ \end{array}$

import torch.nn as nn
import torch
a=torch.rand(2,1,10) #一句话每句话两个词一个字用10维向量表示
Lstm=nn.LSTM(10,3,1)
Lstm(a) #输出有三个第一个是我们需要的输出第二个是H第三个是C

(tensor([[[ 0.1166, -0.1578, -0.0591]],
 
         [[ 0.1272, -0.2776, -0.0414]]], grad_fn=<StackBackward>),
 (tensor([[[ 0.1272, -0.2776, -0.0414]]], grad_fn=<StackBackward>),
  tensor([[[ 0.5805, -0.6388, -0.2383]]], grad_fn=<StackBackward>)))

import torch.nn as nn
import torch
a=torch.rand(2,1,10) 
Lstm=nn.LSTM(10,3,1,bidirectional=True) #当设置成双向的时候输出会翻倍参数会翻倍 输出会自动整合双向输出
Lstm(a) #输出有三个第一个是我们需要的输出第二个是H第三个是C

(tensor([[[ 0.1165, -0.0157, -0.1615, -0.0108,  0.0372, -0.0604]],
 
         [[ 0.2226, -0.0470, -0.2941, -0.0484,  0.0340,  0.1166]]],
        grad_fn=<CatBackward>),
 (tensor([[[ 0.2226, -0.0470, -0.2941]],
  
          [[-0.0108,  0.0372, -0.0604]]], grad_fn=<StackBackward>),
  tensor([[[ 0.3341, -0.1073, -0.5567]],
  
          [[-0.0148,  0.1525, -0.1107]]], grad_fn=<StackBackward>)))

11.空间映射Embedding

1.Embedding

常用在稀疏数据处理上，one-hot映射或者索引映射。其实具体原理很简单，就是随机构建一个“词典大小 × 我们定义的长度”的矩阵，然后根据待映射数据的值去索引那个矩阵，用取出的向量来表示

#当训练发生时，这部分的数据才正真会产生作用
a=torch.randint(0,4,(2,3)) #解释一下 加入有两句话一句话有三个词用在词典中的位置索引表示，这个词典共有4个词
emb=nn.Embedding(4,3)
emb(a) #这样我就就将词转化为了可以输入到LSTM的数据，变成两句话。每句话是三个词，一个词由一个三维向量构成

tensor([[[ 0.8193, -0.6931,  0.4941],
         [ 0.3312,  1.3720, -0.8186],
         [ 0.8193, -0.6931,  0.4941]],

        [[ 0.0557,  0.3277, -0.6639],
         [ 0.3312,  1.3720, -0.8186],
         [-1.9667,  0.0058,  0.9042]]], grad_fn=<EmbeddingBackward>)

emb.weight

Parameter containing:
tensor([[ 0.0557,  0.3277, -0.6639],
        [-1.9667,  0.0058,  0.9042],
        [ 0.3312,  1.3720, -0.8186],
        [ 0.8193, -0.6931,  0.4941]], requires_grad=True)

#如果一句话有的是两个词有的是三个词我们需要将他们形状统一，也就是padding。如果一句话
#两个词每个词都是三维的会的带2*3矩阵，但是我们的步长要求是3所以需要padding 
a=torch.randint(0,4,(2,3))
emb=nn.Embedding(4,3,padding_idx=0) #加上这个参数可以设置经过emb的数据padding, 
emb(a)

tensor([[[ 1.6454,  1.1428, -1.0299],
         [ 1.4701, -0.0240,  0.4981],
         [ 1.6454,  1.1428, -1.0299]],

        [[ 0.0000,  0.0000,  0.0000],
         [ 0.8171,  0.0683, -0.1251],
         [ 0.0000,  0.0000,  0.0000]]], grad_fn=<EmbeddingBackward>)

#使用的时候就是用某个值填充长度。
b=torch.LongTensor([2,0,0])#这个表示一句话只有一个字但是我们需要的是3
emb(b)

tensor([[-0.0061, -1.8805,  0.3787],
        [ 0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000]], grad_fn=<EmbeddingBackward>)

2.EmbeddingBag

这个与之前的有类似的地方但是有进一步计算的过程

import torch.nn as nn
import torch
a=torch.LongTensor([[1,2,3],[1,2,3]]) #两句话一句话3个词，每个词都是所在词典位置索引
embd=nn.EmbeddingBag(5,3) #相当于Emdedding后再进行mean(dim=0) 默认是mean计算我们可以是设置模式mode为sum
embd(a)

tensor([[0.9114, 0.0372, 1.2134],
        [0.9114, 0.0372, 1.2134]], grad_fn=<EmbeddingBagBackward>)

#实际上就是这个操作
embd.weight[1:-1,:].mean(dim=0)

tensor([0.9114, 0.0372, 1.2134], grad_fn=<MeanBackward1>)

#当输入是一维数据时我们需要设定一个新参数。
a=torch.LongTensor([1,2,3,1,2,3]) #
embd=nn.EmbeddingBag(5,3) #
#这里的offsets矩阵必须是一个向量结果就是这个向量的长度，结果如何计算呢，第一个结果从待映射向量a的0位置开始
#到2位置之前也就是[1,2] 这段向量做embedding 然后求mean。然后第二个向量从2位置开始到最后所有[3,1,2,3]
#做embedding然后求mean
embd(a,offsets=torch.tensor([0,2]))

tensor([[-1.2640, -0.3483, -1.0288],
        [-1.0475, -0.7013, -0.5141]], grad_fn=<EmbeddingBagBackward>)

#对应上述第一个映射向量
(embd.weight[1,:]+embd.weight[2,:])/2

tensor([-1.2640, -0.3483, -1.0288], grad_fn=<DivBackward0>)

#对应上面第二个向量
(embd.weight[3,:]+embd.weight[1,:]+embd.weight[2,:]+embd.weight[3,:])/4

tensor([-1.0475, -0.7013, -0.5141], grad_fn=<DivBackward0>)

12.module 类

这个类是所有模型的父类，看看它都有哪些方法，同时可以看到所有继承这个父类的子类都有哪些功能。首先这个类继承了python的最基本类，这个类包含了一些常用的特殊方法，在一定条件下才可以使用。这个类主要是pytorch的最基础的模型父类，当需要构建一个模型的时候要继承这个类。其次我们在进行数学运算的时候也需要继承这个类。

#构建一个模型
import torch
import torch.nn as nn
class mode(nn.Module):
    """Small demo classifier: conv -> batch norm -> LeakyReLU -> dropout -> linear -> softmax."""
    def __init__(self):
        super(mode,self).__init__() # required: sets up the parent's internal storage dicts
        self.conv=nn.Conv2d(3,16,3,1,1) # convolution whose output keeps the input's spatial size
        self.Relu=nn.LeakyReLU(0.2) # activation
        self.Linear=nn.Linear(1600,2) # fully connected layer: 16 channels * 10 * 10 inputs, 2 classes
        self.Softmax=nn.Softmax(dim=1)  # class probabilities along dimension 1
        self.Batchnorma=nn.BatchNorm2d(16) # normalization over the 16 channels
        self.Dropout=nn.Dropout(0.5) # regularization
    def forward(self,x):
        x=self.Relu(self.Batchnorma(self.conv(x)))
        x=self.Dropout(x)
        # Flatten per sample using the actual batch size, so any batch size works (not only 2)
        x=x.view(x.size(0),-1)
        return self.Softmax(self.Linear(x))
mode_=mode()
a=torch.rand(2,3,10,10) # data for two samples
mode_(a)

tensor([[0.2754, 0.7246],
        [0.3598, 0.6402]], grad_fn=<SoftmaxBackward>)

上面模型可以看到pytorch构建模型的大概基础。这其中还有参数初始化和增加模块的操作，下面的函数会涉及到。

1.初始化函数`init()`

这个函数主要是存储参数变量，包括模型权重模型的初始化数据等等所以我们在构建自己的模型的时候必须继承这个类中的所有变量

#来看看这几个变量字典的存储内容
#构建一个模型
import torch
import torch.nn as nn
class mode(nn.Module):
    """Two stacked linear layers, used to inspect the storage dicts kept by nn.Module."""

    def __init__(self):
        # Required: the parent initializer creates the storage dicts printed below.
        super().__init__()
        self.L1 = nn.Linear(3, 3)
        self.L2 = nn.Linear(3, 1)

    def forward(self, x):
        hidden = self.L1(x)
        return self.L2(hidden)


mode_ = mode()
# One dict per line: parameters, buffers, the five hook dicts, and finally the sub-modules.
print(
    mode_._parameters,
    mode_._buffers,
    mode_._backward_hooks,
    mode_._forward_hooks,
    mode_._forward_pre_hooks,
    mode_._state_dict_hooks,
    mode_._load_state_dict_pre_hooks,
    mode_._modules,
    sep='\n',
)

OrderedDict()
OrderedDict()
OrderedDict()
OrderedDict()
OrderedDict()
OrderedDict()
OrderedDict()
OrderedDict([('L1', Linear(in_features=3, out_features=3, bias=True)), ('L2', Linear(in_features=3, out_features=1, bias=True))])

2.前向传播forward()

这个函数必须重写。在这里我们需要定义所构建模型的前向传播计算过程：算法如何工作是由这个函数定义的，当一条数据过来时对数据做什么操作都从这个函数开始。这个函数会被__call__()函数调用：当我们将数据传入模型的时候，首先会进入__call__()函数，再由它调用forward()函数，中间过程会经过两次钩子函数的调用。__call__()是python的特殊方法，它可以让对象像函数一样被直接调用，而不需要写出方法名，主要作用是让代码看起来更简洁。

## 演示一下forward 这个就是Module中forward的大概实现方法
class Module(object):
    """Toy stand-in showing how calling an instance is routed to forward()."""

    def __init__(self, w=1):
        self.w, self.b = w, 0

    def forward(self, x):
        scaled = x * self.w
        return scaled + self.b

    def __call__(self, x):
        # Calling the instance simply delegates to forward().
        return self.forward(x)


mode = Module(2)
mode(2)

3.缓冲区数据register_buffer()

这个函数主要是用来保存不需要被优化器更新的数据，我们可以在forward中对它进行更新。比如标准化的均值：标准化在计算的过程中首先需要计算数据的均值。当我们进行预测的时候仍然需要用到均值，但是此时的数据可能只有一个样本，所以需要使用训练过程中各批次数据均值的均值，这样才能得到测试数据的标准化均值。这个数据不能被优化器更新，所以要存在某个固定的缓存中。当我们load模型时这部分数据也会被load。

4.参数缓冲区register_parameter()

这个函数可以帮助我们将待训练的参数加入训练。这部分数据会加入到我们在模型初始过中构建的参数字典中，当模型更新的时候取数据会从这个字典中取

5.增加继承模型add_module()

这个函数可以让我自己随便增加模型。

#构建一个模型
import torch
import torch.nn as nn
class mode(nn.Module):
    """Two linear layers (10 -> 10 -> 1) used to demonstrate add_module()."""

    def __init__(self):
        # Required: the parent initializer creates the storage dicts.
        super().__init__()
        self.L1 = nn.Linear(10, 10)
        self.L2 = nn.Linear(10, 1)

    def forward(self, x):
        hidden = self.L1(x)
        return self.L2(hidden)


mode_ = mode()
a = torch.rand(2, 10)
mode_(a)
# Registers the new sub-module under the name "L3" in the ordered module dict.
mode_.add_module("L3", nn.Linear(10, 10))

mode_._modules

OrderedDict([('L1', Linear(in_features=10, out_features=10, bias=True)),
             ('L2', Linear(in_features=10, out_features=1, bias=True)),
             ('L3', Linear(in_features=10, out_features=10, bias=True))])

6.apply

这个是对我们构建的模型中所有子模型的操作。比如我们可以使用这个函数来批量更新模型的参数。它的工作原理是首先将模型的所有子模块遍历出来，然后对每个子模块应用我们的fn函数，最后对模型自身也应用一次fn。
它的实现是非常精妙的几行代码

    def apply(self, fn):
        """Call ``fn`` on every descendant module, then on ``self``; return ``self``."""
        children = self.children()
        for child in children:
            # The recursive call covers the child's whole subtree before fn reaches this module.
            child.apply(fn)
        fn(self)
        return self

import torch
import torch.nn as nn
class mode(nn.Module):
    """Two linear layers (10 -> 10 -> 1) used to demonstrate apply()."""

    def __init__(self):
        # Required: the parent initializer creates the storage dicts.
        super().__init__()
        self.L1 = nn.Linear(10, 10)
        self.L2 = nn.Linear(10, 1)

    def forward(self, x):
        hidden = self.L1(x)
        return self.L2(hidden)


mode_ = mode()


def fn(x):
    """Print the visited module; reset Linear layers to weight 1 and bias 0."""
    print(x, 12)
    if not isinstance(x, nn.Linear):
        return
    x.weight.data.fill_(1)
    x.bias.data.fill_(0)


mode_.apply(fn)
# All parameters are now 1 (weights) and 0 (biases); as the printout shows,
# fn ran on each sub-module and then on the model itself.
list(mode_.parameters())

Linear(in_features=10, out_features=10, bias=True) 12
Linear(in_features=10, out_features=1, bias=True) 12
mode(
  (L1): Linear(in_features=10, out_features=10, bias=True)
  (L2): Linear(in_features=10, out_features=1, bias=True)
) 12





[Parameter containing:
 tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]], requires_grad=True),
 Parameter containing:
 tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], requires_grad=True),
 Parameter containing:
 tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]], requires_grad=True),
 Parameter containing:
 tensor([0.], requires_grad=True)]

7.cuda()

这个函数就是将模型转化为GPU模型。它的实现方法就是调用_apply，传入的参数是设备名称。本质是将模型中的参数放到GPU上，而且是个inplace操作

import torch
import torch.nn as nn
class mode(nn.Module):
    """Two stacked linear layers: 10 -> 10 -> 1."""

    def __init__(self):
        # The parent constructor must run first: it sets up the dictionaries
        # in which sub-modules and parameters are stored.
        super().__init__()
        self.L1 = nn.Linear(10, 10)
        self.L2 = nn.Linear(10, 1)

    def forward(self, x):
        hidden = self.L1(x)
        return self.L2(hidden)


mode_ = mode()
mode_.cuda()
# The listed parameters now live on the GPU (device 0 in the original notes).
list(mode_.parameters())

8.cpu()

将模型放入cpu 中也就是将模型参数数据放到cpu中

9.type()

将模型子模型的参数全部变为某个类型

import torch
import torch.nn as nn
class mode(nn.Module):
    def __init__(self):
        super(mode,self).__init__() #这步必须有需要去加载父类中的变量存储字典
        self.L1=nn.Linear(10,10)
        self.L2=nn.Linear(10,1)
    def forward(self,x):
        return self.L2(self.L1(x))
mode_=mode()
mode_.type(torch.int8)
mode_.L1.weight

Parameter containing:
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=torch.int8, requires_grad=True)

10.float()

将模型中子模型参数变化为float类型。我们知道精度越高运行效率也就越低

11.to()

这个是一个大方法。可以做类型转化和设备迁移。和cuda() cpu()相似

import torch.nn as nn
# NOTE: relies on `torch` already being imported earlier in the session.
L = nn.Linear(10, 1)
L.to(0)  # send the module to CUDA device index 0
L.to(torch.float16)  # convert the parameters to 16-bit floats
L.to(device="cpu")  # move the module back to the CPU
L.to('cuda')  # move to the current CUDA device (GPU 0 in the original notes)

Linear(in_features=10, out_features=1, bias=True)

12.setattr() getattr()

有了这两个方法，我们就可以使用变量名访问模型中的子模型

import torch
import torch.nn as nn
class mode(nn.Module):
    """Two stacked linear layers: 10 -> 10 -> 1."""

    def __init__(self):
        # The parent constructor must run first: it sets up the dictionaries
        # in which sub-modules and parameters are stored.
        super().__init__()
        self.L1 = nn.Linear(10, 10)
        self.L2 = nn.Linear(10, 1)

    def forward(self, x):
        hidden = self.L1(x)
        return self.L2(hidden)


mode_ = mode()
# Sub-modules assigned in __init__ are reachable as ordinary attributes.
mode_.L1

Linear(in_features=10, out_features=10, bias=True)

13.模型参数导出state_dict()

这个函数可以返回模型的所有参数。我们在保存模型的时候使用这个可以只保存模型参数

import torch
import torch.nn as nn
class mode(nn.Module):
    """Two linear layers plus a BatchNorm1d layer that forward() never calls."""

    def __init__(self):
        # The parent constructor must run first: it sets up the dictionaries
        # in which sub-modules and parameters are stored.
        super().__init__()
        self.L1 = nn.Linear(10, 10)
        self.L2 = nn.Linear(10, 1)
        self.B = nn.BatchNorm1d(2)

    def forward(self, x):
        hidden = self.L1(x)
        return self.L2(hidden)


mode_ = mode()
# The returned dict holds weights and biases as well as the buffer entries.
mode_.state_dict()

14.导入参数load_state_dict()

这个函数可以导入我们的模型参数，对模型进行赋值，前提是我们首先需要构建一个模型 ,这里需要注意，我们保存的模型参数是个字典，有key和value所以我们在load的时候要保证key一样，value大小一样

import torch
import torch.nn as nn
class mode(nn.Module):
    """Two linear layers plus a BatchNorm1d layer that forward() never calls."""

    def __init__(self):
        # The parent constructor must run first: it sets up the dictionaries
        # in which sub-modules and parameters are stored.
        super().__init__()
        self.L1 = nn.Linear(10, 10)
        self.L2 = nn.Linear(10, 1)
        self.B = nn.BatchNorm1d(2)

    def forward(self, x):
        hidden = self.L1(x)
        return self.L2(hidden)


mode_ = mode()
# Round-trip: export the state dict and load it straight back into the same model.
data = mode_.state_dict()
mode_.load_state_dict(data)

<All keys matched successfully>

15.参数迭代器parameters()

可以返回一个参数生成器。这些参数会被作为待更新的参数传入优化器。

import torch
import torch.nn as nn
class mode(nn.Module):
    """Two linear layers plus a BatchNorm1d layer that forward() never calls."""

    def __init__(self):
        # The parent constructor must run first: it sets up the dictionaries
        # in which sub-modules and parameters are stored.
        super().__init__()
        self.L1 = nn.Linear(10, 10)
        self.L2 = nn.Linear(10, 1)
        self.B = nn.BatchNorm1d(2)

    def forward(self, x):
        hidden = self.L1(x)
        return self.L2(hidden)


mode_ = mode()
# Returns a lazy iterator over the parameters, not a list.
mode_.parameters()

16.运行模型转化。train() 和eval()

主要是用来切换我们的模型是训练还是测试，其实这两种操作后不会有明显的变化。变化主要体现在dropout和标准化上。当是训练模式的时候dropout正常工作，测试的时候不工作，当训练时在标准化上我们需要计算每批数据的均值，测试的时候我们使用训练集的平均均值

import torch
import torch.nn as nn
class mode(nn.Module):
    """Linear(10, 10) -> Dropout(0.5) -> Linear(10, 1)."""

    def __init__(self):
        # The parent constructor must run first: it sets up the dictionaries
        # in which sub-modules and parameters are stored.
        super().__init__()
        self.L1 = nn.Linear(10, 10)
        self.L2 = nn.Linear(10, 1)
        self.D = nn.Dropout(0.5)

    def forward(self, x):
        hidden = self.L1(x)
        dropped = self.D(hidden)
        return self.L2(dropped)


a = torch.rand(2, 10)
mode_ = mode()
mode_.train()
print(mode_(a))
mode_.eval()
# Same input, different result: compare this output with the training-mode one above.
print(mode_(a))

tensor([[ 0.1032],
        [-0.3140]], grad_fn=<AddmmBackward>)
tensor([[-0.1938],
        [-0.2405]], grad_fn=<AddmmBackward>)

17.梯度清空 zero_grad()

在更新参数之前不能让梯度累积，所以每次更新都需要清空上一次的梯度数据。这个清空操作和优化器的zero_grad()效果是一样的

import torch
import torch.nn as nn
class mode(nn.Module):
    """Linear(10, 10) -> Dropout(0.5) -> Linear(10, 1)."""

    def __init__(self):
        # The parent constructor must run first: it sets up the dictionaries
        # in which sub-modules and parameters are stored.
        super().__init__()
        self.L1 = nn.Linear(10, 10)
        self.L2 = nn.Linear(10, 1)
        self.D = nn.Dropout(0.5)

    def forward(self, x):
        hidden = self.L1(x)
        dropped = self.D(hidden)
        return self.L2(dropped)


a = torch.rand(2, 10)
mode_ = mode()
# Reset the gradients of every parameter held by the model.
mode_.zero_grad()

13.并行计算parallel

实现多GPU计算，可以加快训练速度、增大可用的batch-size。看一下它是如何实现的：这个类同样继承自Module，
下面来分析一下这个类的实现方法。

class DataParallel(Module):
    """Run a module on several GPUs by scattering each call's inputs across devices.

    Args:
        module: the module to parallelise.
        device_ids: CUDA devices to use; defaults to every device reported by
            ``torch.cuda.device_count()``.
        output_device: device the outputs are gathered on; defaults to
            ``device_ids[0]``.
        dim: stored as ``self.dim`` (presumably the axis that ``scatter`` and
            ``gather`` split/concatenate along; those helpers are not shown in
            this excerpt).
    """

    def __init__(self, module, device_ids=None, output_device=None, dim=0):
        super(DataParallel, self).__init__()

        # Without CUDA the wrapper degrades to a plain pass-through around `module`.
        if not torch.cuda.is_available():
            self.module = module
            self.device_ids = []
            return

        if device_ids is None:
            # Default to every visible GPU.
            device_ids = list(range(torch.cuda.device_count()))
        if output_device is None:
            output_device = device_ids[0]

        self.dim = dim
        self.module = module
        self.device_ids = [_get_device_index(x, True) for x in device_ids]
        self.output_device = _get_device_index(output_device, True)
        # The wrapped module must live on the first device; forward() checks this.
        self.src_device_obj = torch.device("cuda:{}".format(self.device_ids[0]))

        _check_balance(self.device_ids)

        if len(self.device_ids) == 1:
            self.module.cuda(device_ids[0])

    def forward(self, *inputs, **kwargs):
        """Scatter the inputs, run one replica per device, and gather the outputs.

        Raises:
            RuntimeError: if any parameter or buffer of the wrapped module is
                not on ``device_ids[0]``.
        """
        # No usable devices: call the wrapped module directly.
        if not self.device_ids:
            return self.module(*inputs, **kwargs)

        # Every parameter and buffer has to sit on the source device (device_ids[0]).
        for t in chain(self.module.parameters(), self.module.buffers()):
            if t.device != self.src_device_obj:
                raise RuntimeError("module must have its parameters and buffers "
                                   "on device {} (device_ids[0]) but found one of "
                                   "them on device: {}".format(self.src_device_obj, t.device))

        # Split the *inputs* across the devices (per the original notes, e.g.
        # 120 samples on 4 GPUs become 4 chunks of 30).
        inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids)
        if len(self.device_ids) == 1:
            return self.module(*inputs[0], **kwargs[0])
        # One copy of the module per input chunk.
        replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
        # Run every replica on its own chunk. According to the original notes
        # this is thread-based: each replica is started in its own thread with
        # its chunk as arguments and the results are collected once all finish
        # (parallel_apply itself is not shown here).
        outputs = self.parallel_apply(replicas, inputs, kwargs)
        # Collect the per-device results on the output device.
        return self.gather(outputs, self.output_device)

import torch
import torch.nn as nn
class mode(nn.Module):
    def __init__(self):
        super(mode,self).__init__() #这步必须有需要去加载父类中的变量存储字典
        self.L1=nn.Linear(10,10)
        self.L2=nn.Linear(10,1)
        self.D=nn.Dropout(0.5)
    def forward(self,x):
        return self.L2(self.D(self.L1(x)))
a=torch.rand(2,10)
mode_=mode()
mode_.cuda(0) #首先需要将模型装载到GPU0上
mode_p=nn.DataParallel(mode_,device_ids=[0])  #可以设置使用的GPU都是几号
mode_p(a.cuda(0))  #也需要将数据放到GPU0上，系统会自动对数据进行切分，分发然后回收

tensor([[0.1459],
        [0.1607]], device='cuda:0', grad_fn=<AddmmBackward>)

21.optim

这节主要分析一下所有的优化方法和实现过程以及使用方法。从最基本最高效也是最常用的方法。去年有论文已经证明了带有动量的SGD是目前最优秀的更新算法其他都是花里胡哨。
在公式中，v表示动量，也就是上一轮累积的动量结果（第一步时v0等于当前梯度值），g表示梯度，p表示待更新参数。可以看到动量项在更新参数时还会再乘上学习率： $v_{t+1} = \mu * v_{t} + g_{t+1} \\ p_{t+1} = p_{t} - lr * v_{t+1}$

优化器要做的事情是将梯度以一定方式回传也就是更新参数。我们需要做的就是设定好这个回传方式。看一下它的源码

#一个SGD优化器的对象。
import torch
from .optimizer import Optimizer, required
class SGD(Optimizer):
    """Stochastic gradient descent with optional momentum, dampening, weight decay and Nesterov momentum.

    Args:
        params: parameters (or parameter-group dicts) to optimise.
        lr: learning rate; must not be negative.
        momentum: momentum factor; must not be negative.
        dampening: dampening applied to the gradient when it is folded into
            the momentum buffer.
        weight_decay: L2 penalty coefficient; must not be negative.
        nesterov: enable Nesterov momentum (needs momentum > 0 and zero dampening).

    Raises:
        ValueError: if any hyper-parameter is outside its valid range.
    """

    def __init__(self, params, lr=required, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid  weight_decay value: {}".format(weight_decay))

        # Per-group defaults; a parameter group may override any of them.
        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        # The base class pairs every parameter group with its hyper-parameters,
        # which is what allows different layers to use different settings.
        super(SGD, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(SGD, self).__setstate__(state)
        # Restored groups may lack the 'nesterov' key; default it to False.
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        # One iteration per parameter group; a model trained with a single
        # set of hyper-parameters has exactly one group.
        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']

            for p in group['params']:
                # Parameters without a gradient (e.g. frozen ones) are skipped.
                if p.grad is None:
                    continue
                d_p = p.grad.data
                if weight_decay != 0:
                    # L2 penalty: d_p + weight_decay * p (out of place, so the
                    # stored gradient is left untouched).
                    d_p = d_p.add(p.data, alpha=weight_decay)
                if momentum != 0:
                    # Per-parameter state carries the momentum buffer between steps.
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        # First step: the buffer starts as a copy of the current gradient.
                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                    else:
                        buf = param_state['momentum_buffer']
                        # buf = momentum * buf + (1 - dampening) * d_p
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)

                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf

                # p = p - lr * d_p
                p.data.add_(d_p, alpha=-group['lr'])

        return loss

#来具体看看代码执行的中间结果。
import torch
import torch.nn as nn
class mode(nn.Module):
    """Two linear layers, 3 -> 2 -> 1, small enough to check the gradients by hand."""

    def __init__(self):
        # The parent constructor must run first: it sets up the dictionaries
        # in which sub-modules and parameters are stored.
        super().__init__()
        self.L1 = nn.Linear(3, 2)
        self.L2 = nn.Linear(2, 1)

    def forward(self, x):
        hidden = self.L1(x)
        return self.L2(hidden)


a = torch.rand(1, 3)
# The model is two fully connected layers:
#   y = w2 (w1 x + b1) + b2
#   dy/dw1 = w2 * x
#   dy/dw2 = w1 x + b1
mode_ = mode()

print("未更新前的参数:{}".format(list(mode_.parameters())))  # parameter values before the step
mode_.zero_grad()  # clear any existing gradients first
opt = torch.optim.SGD(mode_.parameters(), lr=2)
out = mode_(a)
# backward() is called without arguments, which works here because the output
# holds a single value; a multi-valued output would need explicit gradient weights.
out.backward()
tracked = opt.param_groups[0]['params']
for position in range(4):
    print("梯度值：{}".format(list(tracked[position].grad)))
opt.step()
# NOTE: the label below literally reads "parameters before update", but the
# values printed here are the parameters AFTER opt.step().
print("更新前的参数:{}".format(list(mode_.parameters())))

未更新前的参数:[Parameter containing:
tensor([[-0.4261,  0.5747,  0.5451],
        [-0.4789, -0.0994,  0.4064]], requires_grad=True), Parameter containing:
tensor([ 0.2287, -0.3438], requires_grad=True), Parameter containing:
tensor([[-0.1817, -0.5562]], requires_grad=True), Parameter containing:
tensor([0.5979], requires_grad=True)]
梯度值：[tensor([-0.0977, -0.1720, -0.0832]), tensor([-0.2989, -0.5264, -0.2548])]
梯度值：[tensor(-0.1817), tensor(-0.5562)]
梯度值：[tensor([ 0.7933, -0.5091])]
梯度值：[tensor(1.)]
更新前的参数:[Parameter containing:
tensor([[-0.2307,  0.9187,  0.7116],
        [ 0.1189,  0.9533,  0.9160]], requires_grad=True), Parameter containing:
tensor([0.5922, 0.7685], requires_grad=True), Parameter containing:
tensor([[-1.7684,  0.4620]], requires_grad=True), Parameter containing:
tensor([-1.4021], requires_grad=True)]

#上面的实现过程 更新后参数=原始参数-lr*梯度
#针对w1的更新
# Reproduce the SGD step for w1 by hand: new_w1 = old_w1 - lr * grad, with lr = 2
# and grad = w2^T @ a (w2 and old_w1 copied from the printout above).
w2_before = torch.tensor([[-0.1817, -0.5562]])
w1_before = torch.tensor([[-0.4261,  0.5747,  0.5451],
                          [-0.4789, -0.0994,  0.4064]])
w1_before - 2 * torch.mm(w2_before.t(), a)

tensor([[-0.2308,  0.9186,  0.7116],
        [ 0.1189,  0.9534,  0.9160]])

22.utils

这个模块主要是对数据进行操作：生成训练数据、将数据装入内存等等，其中会涉及很多关于内存管理和多线程的东西。
在处理数据的时候我们希望数据可以批量输入到模型中，这个批量操作可以交给它来完成，只要按格式对应写好就可以。
最重要的是data模块，这个模块主要是将数据封装成一个迭代器，让模型可以批量读取。这部分源码有很多需要去理解的地方：自带的collate_fn函数、随机打乱数据的算法、数据在内存中的操作

#看一下data包下的源代码
#这个类是将数据封装成一个对象，我们可以一个一个的取其中的元素，同时我们也可以为它增加数据。
class Dataset:
    """Abstract base for indexable datasets.

    Subclasses must override ``__getitem__``; ``+`` joins two datasets via
    ``ConcatDataset``.
    """

    def __getitem__(self, index):
        # Deliberately unimplemented: every concrete dataset supplies its own lookup.
        raise NotImplementedError

    def __add__(self, other):
        parts = [self, other]
        return ConcatDataset(parts)

#看一下DataLoader这个类是如何实现数据的封装。
#实例化这个类后得到的是一个可迭代对象，它实现了__iter__这个函数，所以训练时遍历它首先会执行__iter__。
#这部分源码初看是非常混乱的，调用关系很多，但最终都是以迭代器的方式每次返回一个batch的样本。这里首先要明确
#python中__iter__和__next__这两个函数的使用方法。
class DataLoader(object):
    """Wrap a dataset so it can be iterated batch by batch.

    The constructor validates its options and picks default samplers and a
    default collate function; ``__iter__`` returns a single-process or a
    multi-process iterator depending on ``num_workers``.
    """
    __initialized = False
    # Constructor arguments: the dataset object, the batch size (default 1) and whether to shuffle.
    # collate_fn decides how the fetched samples are assembled into the data handed to the model
    # (the original notes mention it is often customised for RNN inputs); write your own when the
    # defaults chosen below from _utils.collate do not fit.
    def __init__(self, dataset, batch_size=1, shuffle=False, sampler=None,
                 batch_sampler=None, num_workers=0, collate_fn=None,
                 pin_memory=False, drop_last=False, timeout=0,
                 worker_init_fn=None, multiprocessing_context=None):
        torch._C._log_api_usage_once("python.data_loader")

        if num_workers < 0: # number of loader workers; 0 disables multiprocessing (see the message below)
            raise ValueError('num_workers option should be non-negative; '
                             'use num_workers=0 to disable multiprocessing.')

        if timeout < 0:
            raise ValueError('timeout option should be non-negative')

        self.dataset = dataset
        self.num_workers = num_workers
        self.pin_memory = pin_memory  # pinned (page-locked) host memory flag; only stored here — presumably consumed by the iterator classes
        self.timeout = timeout
        self.worker_init_fn = worker_init_fn
        self.multiprocessing_context = multiprocessing_context

        # Arg-check dataset related before checking samplers because we want to
        # tell users that iterable-style datasets are incompatible with custom
        # samplers first, so that they don't learn that this combo doesn't work
        # after spending time fixing the custom sampler errors.
        if isinstance(dataset, IterableDataset):
            self._dataset_kind = _DatasetKind.Iterable

            if shuffle is not False:
                raise ValueError(
                    "DataLoader with IterableDataset: expected unspecified "
                    "shuffle option, but got shuffle={}".format(shuffle))
            elif sampler is not None:
                # See NOTE [ Custom Samplers and IterableDataset ]
                raise ValueError(
                    "DataLoader with IterableDataset: expected unspecified "
                    "sampler option, but got sampler={}".format(sampler))
            elif batch_sampler is not None:
                # See NOTE [ Custom Samplers and IterableDataset ]
                raise ValueError(
                    "DataLoader with IterableDataset: expected unspecified "
                    "batch_sampler option, but got batch_sampler={}".format(batch_sampler))
        else:
            self._dataset_kind = _DatasetKind.Map

        if sampler is not None and shuffle:
            raise ValueError('sampler option is mutually exclusive with '
                             'shuffle')

        if batch_sampler is not None:
            # auto_collation with custom batch_sampler
            if batch_size != 1 or shuffle or sampler is not None or drop_last:
                raise ValueError('batch_sampler option is mutually exclusive '
                                 'with batch_size, shuffle, sampler, and '
                                 'drop_last')
            batch_size = None
            drop_last = False
        elif batch_size is None:
            # no auto_collation
            if shuffle or drop_last:
                raise ValueError('batch_size=None option disables auto-batching '
                                 'and is mutually exclusive with '
                                 'shuffle, and drop_last')

        if sampler is None:  # give default samplers
            if self._dataset_kind == _DatasetKind.Iterable:
                # See NOTE [ Custom Samplers and IterableDataset ]
                sampler = _InfiniteConstantSampler()
            else:  # map-style
                if shuffle:
                    sampler = RandomSampler(dataset)
                else:
                    sampler = SequentialSampler(dataset)

        if batch_size is not None and batch_sampler is None:
            # auto_collation without custom batch_sampler
            batch_sampler = BatchSampler(sampler, batch_size, drop_last)

        self.batch_size = batch_size
        self.drop_last = drop_last
        self.sampler = sampler
        self.batch_sampler = batch_sampler

        if collate_fn is None:
            if self._auto_collation:
                collate_fn = _utils.collate.default_collate
            else:
                collate_fn = _utils.collate.default_convert

        self.collate_fn = collate_fn
        # From here on __setattr__ rejects changes to the core attributes.
        self.__initialized = True
        self._IterableDataset_len_called = None  # See NOTE [ IterableDataset and __len__ ]

    @property
    def multiprocessing_context(self):
        return self.__multiprocessing_context

    @multiprocessing_context.setter
    def multiprocessing_context(self, multiprocessing_context):
        # Accepts None, a start-method name, or a context object; anything
        # other than None is only valid when num_workers > 0.
        if multiprocessing_context is not None:
            if self.num_workers > 0:
                if not multiprocessing._supports_context:
                    raise ValueError('multiprocessing_context relies on Python >= 3.4, with '
                                     'support for different start methods')

                if isinstance(multiprocessing_context, string_classes):
                    valid_start_methods = multiprocessing.get_all_start_methods()
                    if multiprocessing_context not in valid_start_methods:
                        raise ValueError(
                            ('multiprocessing_context option '
                             'should specify a valid start method in {}, but got '
                             'multiprocessing_context={}').format(valid_start_methods, multiprocessing_context))
                    multiprocessing_context = multiprocessing.get_context(multiprocessing_context)

                if not isinstance(multiprocessing_context, python_multiprocessing.context.BaseContext):
                    raise ValueError(('multiprocessing_context option should be a valid context '
                                      'object or a string specifying the start method, but got '
                                      'multiprocessing_context={}').format(multiprocessing_context))
            else:
                raise ValueError(('multiprocessing_context can only be used with '
                                  'multi-process loading (num_workers > 0), but got '
                                  'num_workers={}').format(self.num_workers))

        self.__multiprocessing_context = multiprocessing_context

    def __setattr__(self, attr, val):
        # Once __init__ has finished, the attributes that define batching may not be reassigned.
        if self.__initialized and attr in ('batch_size', 'batch_sampler', 'sampler', 'drop_last', 'dataset'):
            raise ValueError('{} attribute should not be set after {} is '
                             'initialized'.format(attr, self.__class__.__name__))

        super(DataLoader, self).__setattr__(attr, val)

    def __iter__(self):  # defining __iter__ is what makes the loader iterable
        if self.num_workers == 0:
            return _SingleProcessDataLoaderIter(self)
        else:
            return _MultiProcessingDataLoaderIter(self)

    @property
    def _auto_collation(self):
        return self.batch_sampler is not None

    @property
    def _index_sampler(self):
        # The actual sampler used for generating indices for `_DatasetFetcher`
        # (see _utils/fetch.py) to read data at each time. This would be
        # `.batch_sampler` if in auto-collation mode, and `.sampler` otherwise.
        # We can't change `.sampler` and `.batch_sampler` attributes for BC
        # reasons.
        if self._auto_collation:
            return self.batch_sampler
        else:
            return self.sampler

    def __len__(self):
        if self._dataset_kind == _DatasetKind.Iterable:
            # NOTE [ IterableDataset and __len__ ]
            #
            # For `IterableDataset`, `__len__` could be inaccurate when one naively
            # does multi-processing data loading, since the samples will be duplicated.
            # However, no real use case should be actually using that behavior, so
            # it should count as a user error. We should generally trust user
            # code to do the proper thing (e.g., configure each replica differently
            # in `__iter__`), and give us the correct `__len__` if they choose to
            # implement it (this will still throw if the dataset does not implement
            # a `__len__`).
            #
            # To provide a further warning, we track if `__len__` was called on the
            # `DataLoader`, save the returned value in `self._len_called`, and warn
            # if the iterator ends up yielding more than this number of samples.
            length = self._IterableDataset_len_called = len(self.dataset)
            return length
        else:
            return len(self._index_sampler)

class test:
    """Counting iterator that yields 1..x, illustrating the ``__iter__``/``__next__`` protocol."""

    def __init__(self, x):
        self.a = 0
        self.x = x

    def __iter__(self):
        # Must return an object that has __next__(); this class has one, so return self.
        return self

    def __next__(self):
        # Called once per iteration step.
        if self.a >= self.x:
            # Stop condition; a for loop catches this and ends on its own.
            raise StopIteration
        self.a += 1
        return self.a


# An iterator produces values on demand, so it never has to store them all.
list(test(4))
    

[1, 2, 3, 4]

#样例使用规范，
import torch 
import torch.nn as nn
a = torch.rand(10, 3, 128, 128)
b = torch.randint(0, 5, (10,))


class data(torch.utils.data.Dataset):
    """Map-style dataset pairing entry ``ind`` of ``a`` with label ``ind`` of ``b``."""

    def __init__(self):
        self.a = a
        self.b = b

    def __getitem__(self, ind):
        sample = self.a[ind]
        label = self.b[ind]
        return sample, label

    def __len__(self):
        return len(self.a)


data_set = data()
data_train = torch.utils.data.DataLoader(data_set, batch_size=2, shuffle=True)

# Each batch is a pair: the stacked samples and their labels.
for samples, labels in data_train:
    print(samples.shape)
    print(labels)

torch.Size([2, 3, 128, 128])
tensor([4, 4])
torch.Size([2, 3, 128, 128])
tensor([0, 4])
torch.Size([2, 3, 128, 128])
tensor([1, 3])
torch.Size([2, 3, 128, 128])
tensor([4, 1])
torch.Size([2, 3, 128, 128])
tensor([0, 4])

a = torch.rand(4, 3)
b = torch.randint(0, 2, (4,))


# The dataset must derive from Dataset, not DataLoader: DataLoader is the
# wrapper that batches a dataset, it is not itself a dataset.
class test1(torch.utils.data.Dataset):
    """Map-style dataset pairing row ``ind`` of ``a`` with label ``ind`` of ``b``."""

    def __init__(self):
        self.a = a
        self.b = b

    def __getitem__(self, ind):
        return self.a[ind], self.b[ind]

    def __len__(self):
        return len(self.a)


data_set = test1()
data_train = torch.utils.data.DataLoader(data_set, batch_size=2, shuffle=False)

# Iterate the loader so the batches are actually printed.
for batch in data_train:
    print(batch)

[tensor([[0.0398, 0.6395, 0.8758],
        [0.9570, 0.4344, 0.4971]]), tensor([0, 0])]
[tensor([[0.5369, 0.7944, 0.4060],
        [0.0810, 0.5820, 0.7977]]), tensor([1, 1])]