思路与公式

记忆单元数量（隐藏维度）为 $m=5$ ，输入维度为 $x\_dim=7$ ，因此每个门的权重矩阵形状均为 $(m,\; x\_dim+m)=(5,12)$ 。在每个时间步 $t$ ，把当前输入 $x_t\in\mathbb{R}^{7}$ 与上一步隐藏向量 $h_{t-1}\in\mathbb{R}^{5}$ 级联成 $\mathrm{xc}=[x_t;h_{t-1}]\in\mathbb{R}^{12}$ 。
经典 LSTM 的前向计算如下（与题面描述一致）：
$$\begin{aligned} g_t &= \tanh(W_g \, \mathrm{xc} + b_g),\\ i_t &= \sigma(W_i \, \mathrm{xc} + b_i),\\ f_t &= \sigma(W_f \, \mathrm{xc} + b_f),\\ o_t &= \sigma(W_o \, \mathrm{xc} + b_o),\\ s_t &= g_t \odot i_t + s_{t-1} \odot f_t,\\ h_t &= \tanh(s_t) \odot o_t, \end{aligned}$$
其中 $\sigma(\cdot)$ 为 Sigmoid， $\tanh(\cdot)$ 为双曲正切， $\odot$ 为按元素乘法；初始 $s_0=\mathbf{0},\,h_0=\mathbf{0}$ 。
输出：对每个时间步的 $h_t$ 取首元素 $h_t[0]$ ，四舍五入到 3 位小数并去除尾随 0；仅当数值为 0 时输出 0.0。

代码

import numpy as np

def sigmoid(x):
    """Return the logistic sigmoid 1 / (1 + exp(-x)), element-wise.

    The naive formula overflows inside ``np.exp`` for large negative inputs
    (emitting a RuntimeWarning).  This version only ever exponentiates a
    non-positive number, so it cannot overflow.

    Args:
        x: A scalar or array-like of real numbers.

    Returns:
        A NumPy float scalar for scalar input, otherwise a float ndarray with
        the same shape as ``x``.
    """
    x = np.asarray(x, dtype=float)
    # exp(-|x|) lies in (0, 1]; underflow to 0.0 is harmless here.
    decay = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- the same function.
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () turns a 0-d array into a scalar and leaves n-d arrays as is.
    return result[()]

class LetterParam:
    """Hard-coded weights and biases for the LSTM used by this script.

    The four weight matrices (``wg``, ``wi``, ``wf``, ``wo``) are each written
    out as 5 rows of 12 values, and the four bias vectors (``bg``, ``bi``,
    ``bf``, ``bo``) as 5 values.  ``LstmNetwork.forward`` applies ``tanh`` to
    the ``wg``/``bg`` pre-activation and ``sigmoid`` to the other three.

    Note that ``mem_cell_ct`` and ``x_dim`` are only stored on the instance;
    they do not size the arrays, which are fixed literals.
    """

    def __init__(self, mem_cell_ct, x_dim):
        # Stored for callers (forward() reads mem_cell_ct); not used to build
        # the arrays below.
        self.mem_cell_ct = mem_cell_ct
        self.x_dim = x_dim
        # Weight matrices, intended shape (mem_cell_ct, x_dim + mem_cell_ct).
        # The literals below are 5 x 12, i.e. they match mem_cell_ct=5, x_dim=7.
        # wg: weights for the tanh-activated term g.
        self.wg = np.array([
            [0.009763, 0.043038, 0.020553, 0.008977, -0.015209, 0.000179, -0.012636, 0.017535, -0.022032, 0.06664, 0.06077, 0.02607],
            [0.013609, 0.095119, -0.085793, -0.095274, -0.099966, 0.086524, 0.056641, 0.074002, 0.06774, -0.00704, 0.00686, -0.01013],
            [-0.076345, 0.027984, -0.071329, 0.088934, 0.00437, -0.017065, -0.047089, 0.046647, -0.06871, 0.00387, -0.00661, -0.01133],
            [0.022419, 0.023837, 0.08375, 0.036964, -0.028098, -0.012594, 0.039626, -0.03785, 0.03383, 0.00188, -0.00723, -0.06378],
            [-0.036914, -0.027258, 0.014093, -0.01228, 0.007675, -0.078691, -0.058225, -0.05922, -0.04942, -0.06378, -0.01133, 0.00689]
        ], dtype=float)
        
        # wi: weights for the sigmoid-activated term i.
        self.wi = np.array([
            [-0.012801, -0.094815, 0.009932, -0.012938, -0.015208, -0.03393, -0.06907, 0.02384, -0.04069, -0.04695, 0.04227, 0.00689],
            [-0.073084, 0.002716, -0.063112, 0.057087, 0.017075, -0.001143, 0.06291, -0.06470, 0.00196, -0.06943, -0.04476, -0.00694],
            [-0.074568, 0.019349, -0.054798, -0.076611, -0.053938, -0.030035, -0.00644, -0.06951, 0.02981, -0.03384, 0.00647, -0.02581],
            [0.058727, 0.016001, -0.06754, 0.04015, 0.09291, 2e-06, 0.077904, -0.031577, 0.01303, -0.01410, -0.01398, 0.05501],
            [0.007121, 0.090749, 0.005842, -0.053581, -0.025732, 0.07017, -0.018745, -0.09456, -0.05945, -0.06571, 0.06871, 0.00411]
        ], dtype=float)
        
        # wf: weights for the sigmoid-activated term f.
        self.wf = np.array([
            [-0.084738, 0.055984, -0.012318, 0.044693, 0.065598, 0.007089, 0.000224, -0.06559, -0.04612, -7.3e-05, 0.03686, 0.06116],
            [-0.023812, -0.086313, -0.042371, 0.081919, -0.057323, -0.009575, 0.086241, -0.05902, 0.03011, 0.00626, -0.05909, 0.00688],
            [0.081826, -0.073366, 0.004863, 0.050082, 0.033803, -0.005449, -0.05630, -0.00347, -0.02532, -0.0456, -0.02682, 0.03758],
            [0.05373, -0.037201, 0.014525, -0.04479, -0.009431, -0.029404, 0.03148, -0.02329, -0.03618, 0.03586, -0.01707, 0.01834],
            [-0.06391, 0.048224, -0.015252, -0.014709, 0.028876, 0.004581, -0.017023, -0.09715, -0.03168, 0.04157, 0.04680, 0.05221]
        ], dtype=float)
        
        # wo: weights for the sigmoid-activated term o.
        self.wo = np.array([
            [0.022434, -0.066186, -0.012788, 0.053852, -0.040935, -0.070167, -0.05504, -0.01525, -0.05224, -0.05249, 0.03916, -0.05965],
            [-0.083761, 0.03392, 0.024249, -0.045149, -0.006756, -0.076326, -0.085208, 0.08015, 0.05878, 0.06814, 0.05341, 0.06931],
            [0.015455, 0.062753, -0.015736, -0.09451, -0.009173, -0.079835, 0.063444, 0.039546, 0.03367, -0.05155, 0.09695, 0.07272],
            [0.023083, -0.002993, -0.015995, 0.04557, -0.035437, -0.019891, -0.039207, 0.08944, 0.08376, 0.06708, -0.09135, 0.06871],
            [0.090088, 0.061318, -0.003744, 0.093352, -0.016804, -0.036232, -0.096711, -0.05294, -0.06852, -0.01468, -0.03823, -0.06719]
        ], dtype=float)

        # Bias terms, one value per memory cell (5 each), paired with
        # wg / wi / wf / wo respectively.
        self.bg = np.array([-0.017119, -0.010762, -0.01027, -0.075269, -0.065529], dtype=float)
        self.bi = np.array([0.075116, 0.059407, 0.049271, -0.074094, 0.054991], dtype=float)
        self.bf = np.array([0.018351, -0.01307, -0.014564, 0.009966, 0.066618], dtype=float)
        self.bo = np.array([-0.054807, -0.077083, -0.014593, 0.047107, 0.007309], dtype=float)


class LstmState:
    """Zero-initialised per-step buffers of an LSTM cell.

    Creates six independent float vectors of length ``mem_cell_ct`` named
    ``g``, ``i``, ``f``, ``o``, ``s`` and ``h``.  ``x_dim`` is accepted for
    signature compatibility but is not used.
    """

    def __init__(self, mem_cell_ct, x_dim):
        # Each attribute gets its own freshly allocated array (no sharing).
        for attr in ("g", "i", "f", "o", "s", "h"):
            setattr(self, attr, np.zeros(mem_cell_ct, dtype=float))


class LstmNode:
    """Pairs one state object with the shared parameter object.

    Attributes set here:
        state: the ``lstm_state`` argument (kept by reference).
        param: the ``lstm_param`` argument (kept by reference).
        xc: placeholder for the input concatenated with the recurrent
            input; starts as ``None``.
    """

    def __init__(self, lstm_param, lstm_state):
        # Keep references only; nothing is copied.
        self.state, self.param = lstm_state, lstm_param
        # Filled in later with [x ; h_prev]; empty until then.
        self.xc = None


class LstmNetwork():
    """Forward-only LSTM that runs over a buffered input sequence.

    Inputs are queued with ``x_list_add`` and consumed, in order, by
    ``forward``.  The parameter object must expose ``mem_cell_ct`` plus the
    weight/bias attributes ``wg, wi, wf, wo, bg, bi, bf, bo``.
    """

    def __init__(self, lstm_param):
        self.lstm_param = lstm_param
        self.lstm_node_list = []
        # Queued input vectors, one entry per time step.
        self.x_list = []

    def x_list_clear(self):
        """Drop all queued inputs by rebinding ``x_list`` to a new empty list."""
        self.x_list = list()

    def x_list_add(self, x):
        """Queue one more time step's input vector."""
        self.x_list.append(x)

    def forward(self):
        """Run the LSTM over every queued input, starting from zero state.

        Returns:
            A list with one hidden-state vector per time step, in order.
            An empty queue yields an empty list.
        """
        p = self.lstm_param
        cells = p.mem_cell_ct
        hidden = np.zeros(cells, dtype=float)
        cell_state = np.zeros(cells, dtype=float)
        outputs = []

        for x_t in self.x_list:
            # Current input followed by the previous hidden state.
            joined = np.hstack((x_t, hidden))

            candidate = np.tanh(np.dot(p.wg, joined) + p.bg)
            gate_in = sigmoid(np.dot(p.wi, joined) + p.bi)
            gate_forget = sigmoid(np.dot(p.wf, joined) + p.bf)
            gate_out = sigmoid(np.dot(p.wo, joined) + p.bo)

            # s_t = f * s_{t-1} + i * g ;  h_t = o * tanh(s_t)
            cell_state = gate_forget * cell_state + gate_in * candidate
            hidden = gate_out * np.tanh(cell_state)

            # Store a copy so the returned vectors are independent objects.
            outputs.append(hidden.copy())

        return outputs


def format_float(x):
    """Format ``x`` with at most 3 decimals and no trailing zeros.

    The value is first rendered with exactly three decimals, then trailing
    zeros (and a dangling decimal point) are removed.  A result that has
    collapsed to zero, including negative zero, is returned as ``'0.0'``;
    any other result left without a decimal point gets ``'.0'`` appended.
    """
    fixed = format(x, ".3f")
    trimmed = fixed.rstrip('0').rstrip('.') if '.' in fixed else fixed
    # "-0" appears when a tiny negative value rounds to -0.000.
    if trimmed in ('', '-0'):
        return '0.0'
    return trimmed if '.' in trimmed else trimmed + '.0'


def func():
    """Read one input line, run the LSTM, and print the first hidden unit per step.

    The line holds ``sequence_length``, ``x_dim`` and then the row-major
    flattened input matrix, all separated by whitespace.  The output is a
    single line of space-separated values formatted by ``format_float``.
    """
    numbers = [float(tok) for tok in input().split()]
    seq_len, x_dim = int(numbers[0]), int(numbers[1])
    # Rebuild the (seq_len, x_dim) matrix from the flattened values.
    inputs = np.array(numbers[2:], dtype=float).reshape(seq_len, x_dim)

    net = LstmNetwork(LetterParam(5, x_dim))
    for step in range(seq_len):
        net.x_list_add(inputs[step])

    hidden_states = net.forward()
    print(' '.join(format_float(h[0]) for h in hidden_states))


# Run the solver only when this file is executed as a script, not on import.
if __name__ == "__main__":
    func()

题目内容

【问题说明】长短期记忆网络（ $Long$ $Short-Term$ $Memory$ , $LSTM$ ）是一种特殊的循环神经网络（ $RNN$ ），旨在解决传统 $RNN$ 中存在的梯度消失和梯度爆炸问题，使其能够有效地学习长期依赖关系。

一个 $LSTM$ 单元（ $Cell$ ）的核心由三个关键的门和一个细胞状态（ $Cell$ $State$ ）组成：

细胞状态 ( $Cell$ $State$ ):这是 $LSTM$ 的“记忆高速公路”，信息沿着这条路径从一个时间步传递到下一个时间步。它的更新是一个简单的线性操作(加法和乘法)，这使得梯度可以更直接地流动，从而避免了梯度消失。

遗忘门 ( $Forget$ $Gate$ ):遗忘门决定从上一时间步的细胞状态中丢弃哪些信息。它通过一个 $Sigmoid$ 激活函数，对上一个隐藏状态和当前输入进行处理，输出一个介于 $0$ 和 $1$ 之间的向量。 $0$ 表示完全遗忘， $1$ 表示完全保留。

输入门（ $Input$ $Gate$ ）：输入门控制哪些新信息写入到细胞状态中。它包含两个部分：

一个 $Sigmoid$ 层，用于决定哪些值需要更新。
一个 $Tanh$ 层，用于创建新的候选细胞状态（ $\tilde{C}_t$ ）。

输出门（ $Output$ $Gate$ ）：输出门决定当前时刻的隐藏状态（ $Hidden$ $State$ ）将输出哪些信息。它首先通过一个 $Sigmoid$ 层来决定细胞状态的哪些部分会被输出，然后对当前的细胞状态应用 $Tanh$ 函数，最后两者相乘得到新的隐藏状态。

【任务要求】请根据下图的 $LSTM$ 结构示意图，实现一个 $LSTM$ 模型的关键函数，并按下列要求输出计算结果。

该 $LSTM$ 模型包含了 $5$ 个 $LSTM$ $Cell$ （上图中 $A$ 单元），每个 $LSTM$ $Cell$ 中的权重的定义如下图所示，分别为 $wf, wi, wg, wo$ , 对应的偏置为 $bf, bi, bg, bo$ 。已在 $python$ 代码模板中提供了 $5$ 个 $LSTM$ $Cell$ 的权重和偏置的数据。如果使用非 $Python$ 语言, 需沿用 $Python3$ 代码模板中的参数设置。

该 $LSTM$ 模型会循环地作用于输入序列中的每一个时间步（从 $t=1$ 到 $t=sequence\_length$ ），每个时间步的计算都会产生一个 $5$ 维的隐藏状态。请针对不同输入矩阵运行 $LSTM$ 模型，计算对应每个时间步隐藏层状态 $h$ 的首元素。其中输入 $X$ 矩阵的形状为 $[4, 7]$ ，即输入数据序列时间步长 $sequence\_length$ 为 $4$ ，输入数据维度 $X\_dim$ 为 $7$ 。

输入描述

一共一行数据，用于描述输入矩阵。

其中前两个为整型数据，分别为 $sequence\_length$ 行数和 $x\_dim$ 列数，后面数据为输入矩阵的参数，均为浮点数，按行平铺 $flatten$ 形式展开为一维序列，数据间以一个空格间隔。

输出描述

一共一行数据，输出每个时间步隐藏状态的首元素，按时间步顺序组成，数据之间以一个空格间隔。

数据精度要求：输出结果均四舍五入精确到小数点后 $3$ 位；若小数部分末尾存在 $0$ ，需将其舍弃，例如 $0.200$ $0.310$ $0.891$ $0.007$ 需变为 $0.2$ $0.31$ $0.891$ $0.007$ 。特殊情况： $0$ 或 $0.000$ 或 $0.00$ 或 $0.0$ 需输出为 $0.0$ 。

样例1

输入

4 7 -1.153285 -0.081943 0.464549 3.411137 0.594197 1.21088 -0.234899 -0.272196 0.279498 -0.289765 -0.826989 -0.224368 0.711969 -0.067545 0.80226 0.574793 2.458116 0.733628 0.698731 -0.816701 0.533741 -1.756603 -0.123113 -0.550757 0.273727 0.249046 -1.165406 -0.31581

输出

0.001 -0.002 0.012 -0.006

样例2

输入

4 7 -1.609352 -0.165708 -0.494005 1.980481 0.316188 -0.005439 -1.108964 0.576463 -0.048573 -0.384642 -1.112576 0.351411 0.698983 0.607453 0.364154 -0.220041 0.345962 -0.274185 -0.784176 -1.740389 1.118046 0.794949 2.249595 -0.038455 0.037336 -0.652332 1.491228 -0.248807

输出

-0.006 -0.012 -0.013 0.014

说明

样例的输入输出均为一行数据，具体格式及输出规范参考上述输入输出描述。经典LSTM模型结构实现

#P3875. 第3题-经典LSTM模型结构实现

第3题-经典LSTM模型结构实现

思路与公式

代码

题目内容

输入描述

输出描述

样例1

样例2

Status

Development

Support

About