问题
欧洲杯刚刚结束,就像看看有没有欧洲杯的数,分析下谁是本次欧洲杯表现最好的球员。于是我就上网找了一组数据。
网盘地址(提取码:hc9s)
【1】文件结构
【2】数据空处理
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import seaborn as snsdf = pd.read_csv("./data/euro2020.csv")
print(df.head(10))summary = pd.DataFrame(df.dtypes, columns=["datatype"])
summary["isnull"] = df.isnull().sum()
summary["unique"] = df.nunique()
summary["first"] = df.loc[0]
summary["second"] = df.loc[1]
summary["third"] = df.loc[2]
print(summary)print(len(df))
train_df = df.drop("Blocks", axis = 1)train_df = train_df.fillna(axis = 0, value = 0)print(train_df.columns)
输出数据列
Index(['Player', 'Country', 'Position', 'Match played', 'Goals','Right foot goals', 'Left foot goals', 'Header goals', 'Assists','Total attempts', 'On target', 'Off target', 'Woodwork', 'Shot blocks','Avg gpg', 'Fouls suffered', 'Fouls committed', 'Top speed','Passing accuracy', 'Distance covered', 'Clearence attempted','Balls recovered', 'Tackels', 'Yellow cards', 'Red cards','Minutes played'],dtype='object')
字段分析下
# Player :球员
# Country :国家
# Position :所处位置
# Match played :出场场次
# Goals :进球数
# Right foot goals :右脚进球数
# Left foot goals :左脚进球数
# Header goals :头球进球数
# Assists :助攻数
# Total attempts :总射门数
# On target :球门内
# Off target :球门外
# Woodwork :球门框上
# Shot blocks :拦截
# Avg gpg :场均得分
# Fouls suffered :被犯规数
# Fouls committed :犯规数
# Top speed :最高速度
# Passing accuracy :传球准确度
# Distance covered :奔跑距离
# Balls recovered :抢断
# Tackles :铲球
# Yellow cards :黄牌数
# Red cards :红牌数
# Minutes played :上场时长
【3】整体数据集分布
fig, ax = plt.subplots(13,2, figsize=(100,300))
for idx, col in enumerate(train_df.columns.to_list()):row_idx = idx//2col_idx = idx%2sns.countplot(data=train_df, x = col, color = "red", ax = ax[row_idx, col_idx])
plt.show()
【4】处理自己需要的数据
top_players = train_df.sort_values(["Goals"], ascending=False).iloc[:10]
train_df["gpa"] = train_df["Goals"]/train_df["Total attempts"]
attempts = train_df.sort_values(["gpa"], ascending=False).iloc[:10]
agp = train_df.sort_values(["Avg gpg"], ascending=False).iloc[:10]
passing_accuracy = train_df.sort_values(["Passing accuracy"], ascending=False).iloc[:10]
header = train_df.sort_values(["Header goals"], ascending=False).iloc[:10]
【5】总进球数
# 进球数排名top10
plt.figure(figsize=(20,5))
sns.barplot(x = "Player", data = top_players, y = "Goals", palette = "Greys", hue = "Position")
plt.title("Top players on the basis of max goals")
plt.show()
# 场均进球排名top10
fig, ax = plt.subplots(2, 1,figsize=(20,10))
sns.barplot(x = "Player", data = agp, y = "Avg gpg", palette = "Greys", ax = ax[0])
sns.barplot(x = "Country", data = agp, y = "Avg gpg", palette = "Greens", ax = ax[1])
plt.show()
这个就奇怪了,直观的感受好像,意大利对获得了冠军,为啥场均进球数这么少。。意大利的比赛是多难看。。
然后其他数据就当做参考吧。
【6】其他
# 传球
plt.figure(figsize=(20,5))
sns.barplot(x = "Player", data = passing_accuracy, y = "Passing accuracy", palette = "Blues")
plt.title("Top players on the basis of passing accuracy")
plt.show()# 头球
plt.figure(figsize=(20,5))
sns.barplot(x = "Player", data = header, y = "Header goals", palette = "Reds", hue = "Goals")
plt.title("Top players on the basis header goals")
plt.show()# 速度
speedy = train_df.sort_values(["Top speed"], ascending=False).iloc[:10]
plt.figure(figsize=(20,5))
sns.barplot(x = "Player", data = speedy, y = "Top speed", palette = "Greens")
plt.title("Top playerson the basis of their speed")
plt.show()# 黄牌数
yelloww = train_df.sort_values(["Yellow cards"], ascending=False).iloc[:10]
plt.figure(figsize=(20,5))
sns.barplot(x = "Player", data = yelloww, y = "Yellow cards", palette = "Greens")
plt.title("Yellow card receivers")
plt.show()# 抢断
blockers = train_df.sort_values(["Shot blocks"], ascending=False).iloc[:10]
plt.figure(figsize=(20,5))
sns.barplot(x = "Player", data = blockers, y = "Shot blocks", palette = "Blues")
plt.title("Shot Blockers")
plt.show()# 出场场次
blockers = train_df.sort_values(["Match played"], ascending=False).iloc[:10]
plt.figure(figsize=(20,5))
sns.barplot(x = "Player", data = blockers, y = "Match played", palette = "Oranges")
plt.title("Matches Played")
plt.show()# 助攻
Assisters = train_df.sort_values(["Assists"], ascending=False).iloc[:10]
plt.figure(figsize=(20,5))
sns.barplot(x = "Player", data = Assisters, y = "Assists", palette = "Greens")
plt.title("Matches Played")
plt.show()# 传球不好的
Worst_passers = train_df.sort_values(["Passing accuracy"], ascending=True).iloc[:10]
plt.figure(figsize=(20,5))
sns.barplot(x = "Player", data = Worst_passers, y = "Passing accuracy", palette = "Greens")
plt.title("Not good Passers")
plt.show()# 被犯规数
Foulers = train_df.sort_values(["Fouls suffered"], ascending=False).iloc[:10]
plt.figure(figsize=(20,5))
sns.barplot(x = "Player", data = Foulers, y = "Fouls suffered", palette = "Greens")
plt.title("Foul sufferers")
plt.show()# 犯规数
Foulers = train_df.sort_values(["Fouls committed"], ascending=False).iloc[:10]
plt.figure(figsize=(20,5))
sns.barplot(x = "Player", data = Foulers, y = "Fouls committed", palette = "Greens")
plt.title("Foul makers")
plt.show()
【7】总结
最后去看了下,最后欧洲杯的最佳球员是谁,有点意料之外的,尽然是意大利的守门员。。如果是这样。守门员的数据需要单独拿出来比较。这样也说明了,意大利对真的是看守门员的。。哈哈