stock.csv
    movie_data_cleaned.csv

    1. '''
    2. file: pandas_def.py
    3. '''
    4. import pandas as pd
    5. import numpy as np
    # Dataset read by the statistics helpers defined in this module.
    file = "movie_data_cleaned.csv"

    # Lift pandas' display limits so printed frames show every row and column.
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    def movie_year_amount_tj(csv_path=None):
        """Count how many movies were released in each calendar year.

        :param csv_path: optional path or file-like object accepted by
            ``pandas.read_csv``; when omitted the module-level ``file`` is read.
        :return: DataFrame with a single ``release_date`` column holding the
            number of rows per year, indexed by year-end timestamps. Years
            inside the covered range that have no movie get a count of 0.
        """
        data = pd.read_csv(file if csv_path is None else csv_path,
                           usecols=['release_date'])
        data['release_date'] = pd.to_datetime(data['release_date'])
        by_date = data.set_index(data['release_date'])
        # An explicit YearEnd offset yields the same year-end bins as the 'Y'
        # string alias, which pandas deprecated in 2.2 in favour of 'YE'.
        yearly = by_date['release_date'].resample(pd.offsets.YearEnd()).count()
        return pd.DataFrame(yearly)
    def country_year_tj(csv_path=None):
        """Count, per production country, the movies released in each year.

        Country labels are collected by splitting the ``country`` column on
        ``' / '``. A movie counts towards a label when its country text
        contains that label as a plain substring. The mainland / Hong Kong /
        Taiwan labels are dropped on purpose; such rows are still tallied under
        a shorter '中国' label when the data provides one.

        Prints the label and its total frequency for every country as a side
        effect.

        :param csv_path: optional path or file-like object accepted by
            ``pandas.read_csv``; when omitted the module-level ``file`` is read.
        :return: DataFrame indexed by year-end timestamps with one column per
            country label (sorted) holding the yearly movie counts; years
            without a matching movie hold 0.
        """
        # Labels that must not become columns. Subtracting them as a set (instead
        # of list.remove) keeps the function working when one is absent.
        excluded = {'', '美国/澳大利亚', '捷克斯洛伐克/捷克',
                    '中国大陆', '中国香港', '中国台湾'}

        data = pd.read_csv(file if csv_path is None else csv_path,
                           usecols=['country', 'release_date'])
        data['country'] = data['country'].str.strip(' ').fillna(value='')

        found = set()
        for entry in data['country']:
            found.update(entry.split(' / '))
        # Sorted so the column order is the same on every run.
        country_list = sorted(found - excluded)

        data['release_date'] = pd.to_datetime(data['release_date'])
        data = data.set_index(data['release_date'])

        # Explicit offset instead of the 'Y' alias deprecated in pandas 2.2.
        year_end = pd.offsets.YearEnd()
        # Empty frame that only carries the full range of years as its index.
        all_years = data['release_date'].resample(year_end).count().index
        tj = pd.DataFrame(index=all_years)

        for label in country_list:
            # regex=False: labels are literal text, not patterns.
            temp = data[data['country'].str.contains(label, regex=False)]
            print("=====================================")
            print("标签=", label)
            print("总频数=", len(temp))
            tj[label] = temp['release_date'].resample(year_end).count()

        # Years in which a country released nothing come out of the alignment
        # as NaN; report them as 0.
        return tj.fillna(value=0)
    def language_tj(csv_path=None):
        """Count how many movies mention each language label.

        Labels are collected by splitting the ``language`` column on ``' / '``;
        the empty label and '汉语普通话' are left out. A movie counts towards a
        label when its language text contains the label as a substring. An
        extra '中国方言' row holds the sum of the '湖南话' and '北京话' counts.

        :param csv_path: optional path or file-like object accepted by
            ``pandas.read_csv``; when omitted the module-level ``file`` is read.
        :return: DataFrame with one float column ``tj`` indexed by label
            (sorted), followed by the '中国方言' summary row.
        """
        data = pd.read_csv(file if csv_path is None else csv_path,
                           usecols=['language'])
        languages = data['language'].str.strip(' ').fillna(value='')

        found = set()
        for entry in languages:
            found.update(entry.split(' / '))
        # Set difference tolerates data in which an excluded label never occurs;
        # sorting makes the row order reproducible.
        lang_list = sorted(found - {'', '汉语普通话'})

        tallies = [
            float(languages.str.contains(label, regex=False).sum())
            for label in lang_list
        ]
        data_lang_tj = pd.DataFrame({'tj': tallies}, index=lang_list, dtype=float)

        # Roll the dialect labels up into one summary row. A dialect that is
        # missing from the data contributes 0 instead of raising KeyError.
        dialects = ['湖南话', '北京话']
        chinese_fy = data_lang_tj['tj'].reindex(dialects, fill_value=0.0).sum()
        data_lang_tj.loc['中国方言', 'tj'] = chinese_fy
        return data_lang_tj
    def averge_votes(csv_path=None):
        """Load the ``average``, ``votes`` and ``title`` columns of the dataset.

        :param csv_path: optional path or file-like object accepted by
            ``pandas.read_csv``; when omitted the module-level ``file`` is read.
        :return: DataFrame restricted to those three columns.
        """
        source = file if csv_path is None else csv_path
        return pd.read_csv(source, usecols=['average', 'votes', 'title'])
    def genre_tj(csv_path=None):
        """Count how many movies carry each genre label.

        The ``genre`` column is stripped of its surrounding square brackets and
        split on ``', '`` to collect the distinct labels. A movie counts towards
        a label when its genre text contains the label as a substring.

        :param csv_path: optional path or file-like object accepted by
            ``pandas.read_csv``; when omitted the module-level ``file`` is read.
        :return: DataFrame with one float column ``tj`` indexed by genre label
            (sorted).
        """
        data = pd.read_csv(file if csv_path is None else csv_path,
                           usecols=['genre'])
        # Drop the enclosing brackets; rows without a genre become ''.
        genres = data['genre'].str.strip('[]').fillna(value='')

        found = set()
        for entry in genres:
            found.update(entry.split(', '))
        # discard() tolerates data in which no row has an empty genre.
        found.discard('')
        labels = sorted(found)

        tallies = [
            float(genres.str.contains(label, regex=False).sum())
            for label in labels
        ]
        return pd.DataFrame({'tj': tallies}, index=labels, dtype=float)
    def genre_rates_tj(x):
        """Cross-tabulate rating buckets against the ``x`` most frequent genres.

        :param x: how many of the most frequent genre labels (as ranked by
            ``genre_tj``) to include.
        :return: DataFrame indexed by the integer rating buckets 2..10 with one
            column per selected genre label. Each cell is the number of movies
            whose genre text contains the label and whose ``average`` rating
            lies in ``[bucket, bucket + 1)``.
        """
        genre_list = genre_tj().sort_values('tj', ascending=False) \
            .head(x).index.tolist()
        rate_list = [2, 3, 4, 5, 6, 7, 8, 9, 10]
        data = pd.read_csv(file, usecols=['genre', 'average'])

        data_genre_tj = pd.DataFrame(np.zeros([len(rate_list), len(genre_list)]),
                                     index=rate_list, columns=genre_list)

        # Convert each row once instead of once per (genre, rating) pair.
        genre_text = [str(entry) for entry in data['genre']]
        # floor() maps a rating to the lower bound of its unit-wide bucket, so
        # ``rate <= r < rate + 1`` becomes ``bucket == rate``. Missing ratings
        # stay NaN and match no bucket.
        bucket = np.floor(data['average'].to_numpy(dtype=float))

        for g in genre_list:
            has_genre = np.array([g in text for text in genre_text], dtype=bool)
            hits = bucket[has_genre]
            data_genre_tj[g] = [
                float(np.count_nonzero(hits == rate)) for rate in rate_list
            ]
        return data_genre_tj
    def rate_tj_by_year(year_list, csv_path=None):
        """Collect the ratings of the movies released in each requested year.

        :param year_list: iterable of years, e.g. ``['2015', '2016']``.
        :param csv_path: optional path or file-like object accepted by
            ``pandas.read_csv``; when omitted the module-level ``file`` is read.
        :return: list with one inner list of ``average`` values per requested
            year, in the order of ``year_list``; values keep the file's row
            order.
        """
        data = pd.read_csv(file if csv_path is None else csv_path,
                           usecols=['average', 'release_date'])
        data = data.set_index(pd.to_datetime(data['release_date']))
        ratings = data['average']
        # Selecting rows by a partial date string has to go through .loc: the
        # bare ``data['2015']`` form was removed in pandas 2.0, where it is
        # treated as a column lookup.
        return [ratings.loc[str(year)].tolist() for year in year_list]
    if __name__ == '__main__':
        # Script entry point: print the rating lists for 2015 and 2016.
        # Other reports that can be printed the same way:
        #   movie_year_amount_tj()
        #   genre_rates_tj(6)
        demo_years = ['2015', '2016']
        ratings_per_year = rate_tj_by_year(demo_years)
        print(ratings_per_year)