描述 移动平均值
1.求不同产品 每个月以及截至当前月最近3个月的平均销售额
2.求不同产品截至当前月份的累计销售额
数据准备
mysql
CREATE TABLE sales_monthly (product VARCHAR(20),ym VARCHAR(10),amount DECIMAL(10,2) );-- 插入测试数据 INSERT INTO sales_monthly (product, ym, amount) VALUES ('苹果', '201801', 10159.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('苹果', '201802', 10211.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('苹果', '201803', 10247.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('苹果', '201804', 10376.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('苹果', '201805', 10400.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('苹果', '201806', 10565.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('苹果', '201807', 10613.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('苹果', '201808', 10696.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('苹果', '201809', 10751.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('苹果', '201810', 10842.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('苹果', '201811', 10900.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('苹果', '201812', 10972.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('苹果', '201901', 11155.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('苹果', '201902', 11202.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('苹果', '201903', 11260.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('苹果', '201904', 11341.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('苹果', '201905', 11459.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('苹果', '201906', 11560.00);INSERT INTO sales_monthly (product, ym, amount) VALUES ('香蕉', '201801', 10138.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('香蕉', '201802', 10194.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('香蕉', '201803', 10328.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('香蕉', '201804', 10322.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('香蕉', '201805', 10481.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('香蕉', '201806', 10502.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('香蕉', '201807', 10589.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('香蕉', '201808', 10681.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('香蕉', '201809', 10798.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('香蕉', '201810', 10829.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('香蕉', '201811', 10913.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('香蕉', '201812', 11056.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('香蕉', '201901', 11161.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('香蕉', '201902', 11173.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('香蕉', '201903', 11288.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('香蕉', '201904', 11408.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('香蕉', '201905', 11469.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('香蕉', '201906', 11528.00);INSERT INTO sales_monthly (product, ym, amount) VALUES ('桔子', '201801', 10154.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('桔子', '201802', 10183.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('桔子', '201803', 10245.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('桔子', '201804', 10325.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('桔子', '201805', 10465.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('桔子', '201806', 10505.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('桔子', '201807', 10578.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('桔子', '201808', 10680.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('桔子', '201809', 10788.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('桔子', '201810', 10838.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('桔子', '201811', 10942.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('桔子', '201812', 10988.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('桔子', '201901', 11099.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('桔子', '201902', 11181.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('桔子', '201903', 11302.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('桔子', '201904', 11327.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('桔子', '201905', 11423.00); INSERT INTO sales_monthly (product, ym, amount) VALUES ('桔子', '201906', 11524.00);
pandas
import pandas as pd# 创建模拟数据 data = {'product': ['苹果']*18 + ['香蕉']*18 + ['桔子']*18,'ym': ['201801', '201802', '201803', '201804', '201805', '201806', '201807', '201808', '201809', '201810', '201811', '201812','201901', '201902', '201903', '201904', '201905', '201906','201801', '201802', '201803', '201804', '201805', '201806', '201807', '201808', '201809', '201810', '201811', '201812','201901', '201902', '201903', '201904', '201905', '201906','201801', '201802', '201803', '201804', '201805', '201806', '201807', '201808', '201809', '201810', '201811', '201812','201901', '201902', '201903', '201904', '201905', '201906'],'amount': [10159.00, 10211.00, 10247.00, 10376.00, 10400.00, 10565.00, 10613.00, 10696.00, 10751.00, 10842.00, 10900.00, 10972.00,11155.00, 11202.00, 11260.00, 11341.00, 11459.00, 11560.00,10138.00, 10194.00, 10328.00, 10322.00, 10481.00, 10502.00, 10589.00, 10681.00, 10798.00, 10829.00, 10913.00, 11056.00,11161.00, 11173.00, 11288.00, 11408.00, 11469.00, 11528.00,10154.00, 10183.00, 10245.00, 10325.00, 10465.00, 10505.00, 10578.00, 10680.00, 10788.00, 10838.00, 10942.00, 10988.00,11099.00, 11181.00, 11302.00, 11327.00, 11423.00, 11524.00] }df = pd.DataFrame(data)
分析
三个窗口函数 实现三个功能
- 第一个
avg(amount) over(partition by product order by ym rows between 2 preceding and current row)根据product分组根据ym求平均 范围是前两行到当前行
- 第二个
avg(amount) over(partition by product order by ym rows unbounded preceding)根据product、ym分组 求截止到当月的平均金额
- 第三个
sum(amount) over(partition by product order by ym rows between unbounded preceding and current row )根据product分组 求截止到当月的总金额
代码
select product,amount,ym,avg(amount) over(partition by product order by ym rows between 2 preceding and current row )r1,avg(amount) over(partition by product order by ym rows unbounded preceding) r2,sum(amount) over(partition by product order by ym rows between unbounded preceding and current row ) r3
from sales_monthly
df['count'] =df.groupby(by='product')['amount'].cumcount()
df['avg1'] = df.groupby('product').apply(lambda x: x['amount'].rolling(3, min_periods=1).mean()).reset_index(level=0, drop=True)df['sum'] = df.groupby('product')['amount'].cumsum()
df['avg2'] = df['sum']/(df['count']+1)
print(df)
总结
①
rows 是根据该行的上下行划定范围的
range是根据该行的值的邻近值划定范围(所以注意格式)
②pandas里的cumsum函数的积累 求累计和
pands求近三行数据用rolling(window=3,min_periods=1)
window
:指定窗口的大小,即参与计算的连续数据点的数量。min_periods
:指定窗口中至少需要有多少个非缺失值数据点才进行计算,默认为None
,表示窗口大小的所有数据点都必须存在才进行计算。center
:如果为True
,则将窗口的标签设置为居中在当前位置。默认是窗口的右边界与当前位置对齐。win_type
:指定窗口的类型,可以是各种加权窗口函数,如矩形窗、三角窗等。默认为None
,表示使用等权重的矩形窗。
描述
查找短期之内(5天)累计转账超过100万元的账户
数据准备
CREATE TABLE transfer_log (log_id int, -- 交易日志编号log_ts TIMESTAMP NOT NULL, -- 交易时间from_user VARCHAR(50) NOT NULL, -- 交易发起账号to_user VARCHAR(50), -- 交易接收账号type VARCHAR(10) NOT NULL, -- 交易类型amount float NOT NULL -- 交易金额(元),保留两位小数
);-- 插入测试数据
INSERT INTO transfer_log (log_id, log_ts, from_user, to_user, type, amount)
VALUES (1, '2021-01-02 10:31:40', '62221234567890', NULL, '存款', 50000);INSERT INTO transfer_log (log_id, log_ts, from_user, to_user, type, amount)
VALUES (2, '2021-01-02 10:32:15', '62221234567890', NULL, '存款', 100000);INSERT INTO transfer_log (log_id, log_ts, from_user, to_user, type, amount)
VALUES (3, '2021-01-03 08:14:29', '62221234567890', '62226666666666', '转账', 200000);INSERT INTO transfer_log (log_id, log_ts, from_user, to_user, type, amount)
VALUES (4, '2021-01-05 13:55:38', '62221234567890', '62226666666666', '转账', 150000);INSERT INTO transfer_log (log_id, log_ts, from_user, to_user, type, amount)
VALUES (5, '2021-01-07 20:00:31', '62221234567890', '62227777777777', '转账', 300000);INSERT INTO transfer_log (log_id, log_ts, from_user, to_user, type, amount)
VALUES (6, '2021-01-09 17:28:07', '62221234567890', '62227777777777', '转账', 500000);INSERT INTO transfer_log (log_id, log_ts, from_user, to_user, type, amount)
VALUES (7, '2021-01-10 07:46:02', '62221234567890', '62227777777777', '转账', 100000);INSERT INTO transfer_log (log_id, log_ts, from_user, to_user, type, amount)
VALUES (8, '2021-01-11 09:36:53', '62221234567890', NULL, '存款', 40000);INSERT INTO transfer_log (log_id, log_ts, from_user, to_user, type, amount)
VALUES (9, '2021-01-12 07:10:01', '62221234567890', '62228888888881', '转账', 10000);INSERT INTO transfer_log (log_id, log_ts, from_user, to_user, type, amount)
VALUES (10, '2021-01-12 07:11:12', '62221234567890', '62228888888882', '转账', 8000);INSERT INTO transfer_log (log_id, log_ts, from_user, to_user, type, amount)
VALUES (11, '2021-01-12 07:12:36', '62221234567890', '62228888888883', '转账', 5000);INSERT INTO transfer_log (log_id, log_ts, from_user, to_user, type, amount)
VALUES (12, '2021-01-12 07:13:55', '62221234567890', '62228888888884', '转账', 6000);INSERT INTO transfer_log (log_id, log_ts, from_user, to_user, type, amount)
VALUES (13, '2021-01-12 07:14:24', '62221234567890', '62228888888885', '转账', 7000);INSERT INTO transfer_log (log_id, log_ts, from_user, to_user, type, amount)
VALUES (14, '2021-01-21 12:11:16', '62221234567890', '62228888888885', '转账', 70000);
分析
通过sum(amount) over (partition by from_user order by log_ts range interval 5 day preceding)即可求出最近五天的总额
代码
with t1 as (select *,sum(amount) over (partition by from_user order by log_ts range interval 5 day preceding) r2from transfer_logwhere type = '转账')select *from t1where r2 > 1000000;