致谢

感谢杰哥不厌其烦地回答我的提问!
还帮我把逐个写入后排序,改成保存为二维数组后排序⬆⬇

兼容性更新

PE

无法计算ImpHash的PE文件

出现无法计算ImpHash的PE文件,会造成列表的元素数量不对
image.png

解决方案

重写计算ImpHash的逻辑,写在“try except异常处理”内,再加上对ImpHash的判断

🐍Python脚本🐍

逻辑

image.png

  1. 遍历文件夹

1.1 获取文件MD5
1.2 获取ImpHash
1.2.1 获取到ImpHash
if判断ImpHash是否为空字符串
非空则保存,为空则同1.2.2
1.2.2 获取不到ImpHash
提示可能非PE文件

  2. 将数据保存到二维数组

2.1 以二维数组的第二元素排序(默认升序)
2.2 将排序后的数组写入Excel表格

  3. 表格处理

3.1 对表格第1列(MD5值)和第2列(ImpHash值)分别进行合并
3.2 对单元格设置样式为上下居中

代码

  1. #coding=utf-8
  2. import os
  3. import hashlib
  4. import goto
  5. import pefile
  6. import pandas as pd
  7. from openpyxl import load_workbook
  8. from openpyxl.styles import Alignment
  9. def Excel2Write(dataMD5, dataImpHash):
  10. excel = pd.read_excel(pathExcel)
  11. # 先创建一个DataFrame,用来增加进数据框的最后一行
  12. newDF = pd.DataFrame({'MD5': dataMD5, 'ImpHash': dataImpHash}, index=[0])
  13. #print(newDF)
  14. excel = excel.append(newDF, ignore_index=True)
  15. excel.to_excel(pathExcel, index=False)
  16. def Excel2Merge(pathExcel, nColumn):
  17. # 列数映射为字母
  18. strColumn = chr(nColumn + 64)
  19. # 加载Excel和Sheet
  20. wb = load_workbook(pathExcel)
  21. sheet = wb["通过ImpHash聚类"]
  22. # 获取列中单元格的数据
  23. listCellValue = []
  24. i = 2
  25. while True:
  26. valueCell = sheet.cell(i, nColumn).value
  27. if valueCell:
  28. listCellValue.append(valueCell)
  29. else:
  30. break
  31. i += 1
  32. # 判断合并单元格的始末位置
  33. cellStart = 0
  34. flag = listCellValue[0]
  35. for i in range(len(listCellValue)):
  36. if listCellValue[i] != flag:
  37. flag = listCellValue[i]
  38. cellEnd = i - 1
  39. if cellEnd >= cellStart:
  40. # 合并
  41. sheet.merge_cells(strColumn + str(cellStart + 2) + ":" + strColumn + str(cellEnd + 2))
  42. cellStart = cellEnd + 1
  43. if i == len(listCellValue) - 1:
  44. cellEnd = i
  45. sheet.merge_cells(strColumn + str(cellStart + 2) + ":" + strColumn + str(cellEnd + 2))
  46. wb.save(pathExcel)
  47. print("已按照" + strColumn + "列完成合并")
  48. def Excel2View(pathExcel):
  49. # 加载Excel和Sheet
  50. wb = load_workbook(pathExcel)
  51. sheet = wb["通过ImpHash聚类"]
  52. nRows = sheet.max_row
  53. nCols = sheet.max_column
  54. for j in range(1, nCols + 1):
  55. for i in range(2, nRows + 1):
  56. theCell = sheet.cell(row=i, column=j)
  57. theCell.alignment = Alignment(vertical='center')
  58. wb.save(pathExcel)
  59. print("已居中处理")
  60. def GetFileMD5(filename):
  61. if not os.path.isfile(filename):
  62. # print(filename)
  63. return
  64. strMD5 = hashlib.md5()
  65. f = open(filename, 'rb')
  66. while True:
  67. fContent = f.read()
  68. if not fContent:
  69. break
  70. strMD5.update(fContent)
  71. f.close()
  72. return strMD5.hexdigest().upper()
  73. def PE_isPE(filename):
  74. try:
  75. file = pefile.PE(filename)
  76. if file.is_exe():
  77. return 1
  78. if file.is_dll():
  79. return 2
  80. if file.is_driver():
  81. return 3
  82. else:
  83. return 0
  84. except:
  85. return 0
  86. def PE_GetImpHash(filename):
  87. file = pefile.PE(filename)
  88. return file.get_imphash().upper()
  89. def EnumFile(pathDir):
  90. array = []
  91. for home, dirs, files in os.walk(pathDir):
  92. for pathDir in dirs:
  93. print("文件夹:" + pathDir)
  94. for filename in files:
  95. pathFile = os.path.join(home, filename)
  96. # 获取MD5
  97. fileMD5 = GetFileMD5(pathFile)
  98. print("文件MD5:" + fileMD5)
  99. # 查看是否是PE文件,是则获取其ImpHash
  100. isPE = PE_isPE(pathFile)
  101. try:
  102. # 尝试获取ImpHash
  103. fileImpHash = PE_GetImpHash(pathFile)
  104. msgImpHash = "PE文件ImpHash:" + fileImpHash
  105. # 有一种PE无法计算ImpHash,但值为空
  106. if (fileImpHash == ''):
  107. fileImpHash = "非PE"
  108. msgImpHash = pathFile + "非PE文件,请确认是否是压缩文件等情况"
  109. except:
  110. # 提示非PE
  111. fileImpHash = "非PE"
  112. msgImpHash = pathFile + "非PE文件,请确认是否是压缩文件等情况"
  113. print(msgImpHash + "\r\n")
  114. # 将MD5和ImpHash写入Excel表格
  115. # Excel2Write(fileMD5, fileImpHash)
  116. # 保存为二维数组
  117. array.append([fileMD5, fileImpHash])
  118. # 对二位数组维度中第2个元素进行排序
  119. dfSorted = pd.DataFrame(sorted(array, key=lambda x: x[1]))
  120. ws = pd.ExcelWriter(pathExcel)
  121. dfSorted.to_excel(ws, sheet_name="通过ImpHash聚类", index=None, header=["MD5", "ImpHash"])
  122. ws.save()
  123. print("\r\n全部MD5和ImpHash写入完成")
  124. # 输出的Excel路径
  125. pathExcel = '提取MD5和ImpHash.xlsx'
  126. # 此处输入文件夹路径
  127. pathFolder = (r'路径')
  128. if __name__ == '__main__':
  129. EnumFile(pathFolder)
  130. Excel2Merge(pathExcel, 1)
  131. Excel2Merge(pathExcel, 2)
  132. Excel2View(pathExcel)