致谢
感谢杰哥不厌其烦地回答我的提问!
还帮我把逐个写入后排序,改成保存为二维数组后排序⬆⬇
兼容性更新
PE
无法计算ImpHash的PE文件
出现无法计算ImpHash的PE文件,会造成列表的元素数量不对
解决方案
重写计算ImpHash的逻辑,写在“try except异常处理”内,再加上对ImpHash的判断
🐍Python脚本🐍
逻辑
1. 遍历文件夹
1.1 获取文件MD5
1.2 获取ImpHash
1.2.1 获取到ImpHash
if判断ImpHash是否为空字符串
非空则保存,为空则同1.2.2
1.2.2 获取不到ImpHash
提示可能非PE文件
2. 将数据保存到二维数组
2.1 以二维数组的第二元素排序(默认升序)
2.2 将排序后的数组写入Excel表格
3. 表格处理
3.1 对表格第1列(MD5值)和第2列(ImpHash值)中连续相同的单元格分别进行合并
3.2 对单元格设置样式为上下居中
代码
#coding=utf-8
import os
import hashlib
import goto
import pefile
import pandas as pd
from openpyxl import load_workbook
from openpyxl.styles import Alignment
def Excel2Write(dataMD5, dataImpHash):
    """Append one (MD5, ImpHash) row to the workbook at the module-level ``pathExcel``.

    The workbook is read into a DataFrame, the new row is added at the end,
    and the whole frame is written back without the index column.

    Args:
        dataMD5: Value stored in the ``MD5`` column of the new row.
        dataImpHash: Value stored in the ``ImpHash`` column of the new row.
    """
    excel = pd.read_excel(pathExcel)
    # Single-row frame holding the record that goes after the last row.
    newDF = pd.DataFrame({'MD5': dataMD5, 'ImpHash': dataImpHash}, index=[0])
    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # supported way to add rows and also works on older releases.
    excel = pd.concat([excel, newDF], ignore_index=True)
    excel.to_excel(pathExcel, index=False)
def Excel2Merge(pathExcel, nColumn):
    """Merge vertically adjacent cells that hold equal values in one column.

    Works on the sheet named "通过ImpHash聚类". Values are read from row 2
    downwards until the first empty (falsy) cell; every run of equal
    consecutive values is merged into one range, and the workbook is saved
    back to the same path. A column without any data rows is left as is.

    Args:
        pathExcel: Path of the workbook to modify in place.
        nColumn: 1-based index of the column to process.
    """
    from itertools import groupby
    from openpyxl.utils import get_column_letter

    # Column letter for the status message; unlike chr(n + 64) this stays
    # correct past column Z.
    strColumn = get_column_letter(nColumn)

    # Load the workbook and the target sheet.
    wb = load_workbook(pathExcel)
    sheet = wb["通过ImpHash聚类"]

    # Collect the column values, starting below row 1 and stopping at the
    # first empty cell.
    listCellValue = []
    row = 2
    while True:
        valueCell = sheet.cell(row, nColumn).value
        if not valueCell:
            break
        listCellValue.append(valueCell)
        row += 1

    # Each run of equal neighbours becomes one merged range. An empty list
    # yields no runs, so an empty column no longer raises IndexError.
    firstRow = 2
    for _, run in groupby(listCellValue):
        lastRow = firstRow + len(list(run)) - 1
        sheet.merge_cells(
            start_row=firstRow,
            start_column=nColumn,
            end_row=lastRow,
            end_column=nColumn,
        )
        firstRow = lastRow + 1

    wb.save(pathExcel)
    print("已按照" + strColumn + "列完成合并")
def Excel2View(pathExcel):
    """Vertically center every cell below row 1 of the "通过ImpHash聚类" sheet.

    The workbook is loaded from ``pathExcel``, the alignment of each cell in
    rows 2..max_row and columns 1..max_column is set to vertical center, and
    the workbook is saved back to the same path.

    Args:
        pathExcel: Path of the workbook to modify in place.
    """
    workbook = load_workbook(pathExcel)
    worksheet = workbook["通过ImpHash聚类"]

    lastRow = worksheet.max_row
    lastColumn = worksheet.max_column

    # Row 1 is skipped; every remaining cell gets its own Alignment object.
    for rowIndex in range(2, lastRow + 1):
        for columnIndex in range(1, lastColumn + 1):
            cell = worksheet.cell(row=rowIndex, column=columnIndex)
            cell.alignment = Alignment(vertical='center')

    workbook.save(pathExcel)
    print("已居中处理")
def GetFileMD5(filename):
    """Return the upper-case hexadecimal MD5 digest of a file.

    Args:
        filename: Path of the file to hash.

    Returns:
        The 32-character upper-case hex digest, or ``None`` when
        ``os.path.isfile(filename)`` is false.
    """
    if not os.path.isfile(filename):
        return None

    digest = hashlib.md5()
    # Hash in fixed-size chunks so a large sample is never held in memory
    # at once; ``with`` closes the handle even if a read fails.
    with open(filename, 'rb') as stream:
        for chunk in iter(lambda: stream.read(64 * 1024), b''):
            digest.update(chunk)
    return digest.hexdigest().upper()
def PE_isPE(filename):
    """Classify a file as EXE, DLL, driver, or none of these.

    Args:
        filename: Path of the file to inspect.

    Returns:
        1 if pefile reports an executable, 2 for a DLL, 3 for a driver,
        and 0 when none of the checks match or the file cannot be parsed.
    """
    pe = None
    try:
        pe = pefile.PE(filename)
        if pe.is_exe():
            return 1
        if pe.is_dll():
            return 2
        if pe.is_driver():
            return 3
        return 0
    except Exception:
        # Best effort: any parsing failure means "not a PE". Unlike a bare
        # ``except``, this lets KeyboardInterrupt/SystemExit propagate.
        return 0
    finally:
        # Release the file data held by the parsed PE object.
        if pe is not None:
            pe.close()
def PE_GetImpHash(filename):
    """Return the upper-cased import hash (ImpHash) of a PE file.

    Args:
        filename: Path of the PE file.

    Returns:
        The value of ``pefile.PE.get_imphash()`` converted to upper case.
        NOTE(review): the caller treats an empty string as "no ImpHash
        available"; confirm against the pefile documentation.

    Raises:
        Whatever ``pefile.PE`` raises when the file cannot be parsed;
        callers are expected to handle it.
    """
    pe = pefile.PE(filename)
    try:
        return pe.get_imphash().upper()
    finally:
        # Always release the file data held by the parsed PE object.
        pe.close()
def EnumFile(pathDir):
    """Hash every file under a folder and write the results to Excel.

    Walks ``pathDir`` recursively. For each file the MD5 is computed and an
    ImpHash lookup is attempted; files whose ImpHash cannot be computed or
    is empty are recorded as "非PE". The collected rows are sorted by
    ImpHash (ascending) and written to the sheet "通过ImpHash聚类" of the
    workbook at the module-level ``pathExcel`` with the header row
    ``MD5`` / ``ImpHash``.

    Args:
        pathDir: Root folder to scan.
    """
    rows = []
    for home, dirs, files in os.walk(pathDir):
        for dirName in dirs:
            print("文件夹:" + dirName)
        for filename in files:
            pathFile = os.path.join(home, filename)

            # Compute the MD5; GetFileMD5 returns None for entries that are
            # not files (e.g. a broken symlink), which cannot be hashed.
            fileMD5 = GetFileMD5(pathFile)
            if fileMD5 is None:
                print(pathFile + "不是常规文件,已跳过" + "\r\n")
                continue
            print("文件MD5:" + fileMD5)

            # Try to compute the ImpHash. A failure and an empty result are
            # handled the same way: the file is reported as non-PE.
            try:
                fileImpHash = PE_GetImpHash(pathFile)
            except Exception:
                fileImpHash = ''
            if fileImpHash:
                msgImpHash = "PE文件ImpHash:" + fileImpHash
            else:
                fileImpHash = "非PE"
                msgImpHash = pathFile + "非PE文件,请确认是否是压缩文件等情况"
            print(msgImpHash + "\r\n")

            # Keep the pair as one row of the two-dimensional result.
            rows.append([fileMD5, fileImpHash])

    # Sort by the second element (ImpHash) so equal hashes become adjacent.
    rows.sort(key=lambda row: row[1])
    # Naming the columns up front keeps the header row valid even when no
    # file was found and the frame is empty.
    dfSorted = pd.DataFrame(rows, columns=["MD5", "ImpHash"])
    # ExcelWriter.save() was removed in pandas 2.0; the context manager
    # writes and closes the workbook on every supported version.
    with pd.ExcelWriter(pathExcel) as writer:
        dfSorted.to_excel(writer, sheet_name="通过ImpHash聚类", index=False)
    print("\r\n全部MD5和ImpHash写入完成")
# Path of the Excel workbook that the script produces.
pathExcel = '提取MD5和ImpHash.xlsx'
# Folder to scan; replace the placeholder with a real path.
pathFolder = r'路径'

if __name__ == '__main__':
    EnumFile(pathFolder)
    # Merge equal neighbours in column 1 first, then in column 2.
    for nColumn in (1, 2):
        Excel2Merge(pathExcel, nColumn)
    Excel2View(pathExcel)