From cc921c7fcb989877a2c872f8c7ecea13e46814d8 Mon Sep 17 00:00:00 2001
From: buladou <1121031509@qq.com>
Date: Fri, 19 Aug 2022 16:28:08 +0800
Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E6=8C=87=E5=AE=9A=E7=9B=AE?=
 =?UTF-8?q?=E5=BD=95=E4=B8=8B=E7=9A=84=E6=89=80=E6=9C=89excel=E6=96=87?=
 =?UTF-8?q?=E4=BB=B6=E6=95=B0=E6=8D=AE=E6=A3=80=E7=B4=A2=E5=92=8C=E7=BB=93?=
 =?UTF-8?q?=E6=9E=9C=E8=BF=94=E5=9B=9E?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 contributors/bulabean/SearchExcel.py | 137 +++++++++++++++++++++++++++
 1 file changed, 137 insertions(+)
 create mode 100644 contributors/bulabean/SearchExcel.py

diff --git a/contributors/bulabean/SearchExcel.py b/contributors/bulabean/SearchExcel.py
new file mode 100644
index 0000000..9389d6f
--- /dev/null
+++ b/contributors/bulabean/SearchExcel.py
@@ -0,0 +1,137 @@
+import os
+import openpyxl
+import xlrd
+import datetime
+
+
+def change_datatype(row_data: list):
+    """
+    excel单元格的内容类型检测和转换
+    参数：
+        row_data：行数据，列表格式
+    """
+    result_data = []
+    for rd in row_data:
+        if type(rd) == datetime.datetime:
+            t = rd.strftime("%Y-%m-%d %H:%M:%S")
+        elif type(rd) == str:
+            t = rd
+        elif type(rd) == int:
+            t = str(rd)
+        elif type(rd) == float:
+            t = str(rd)
+        elif type(rd) is None:
+            t = ''
+        else:
+            t = str(rd)
+        result_data.append(t)
+    return result_data
+
+
+def find_key(search_key: str, row_content: str):
+    """
+    检测关键词和内容
+    参数：
+        search_key：关键词
+        row_content：行内容
+    """
+    if search_key in row_content:
+        return True
+    else:
+        return False
+
+
+def process_xls(path, file):
+    """
+    读取xls后缀的excel文件
+    参数：
+        path：文件所在路径
+        file：文件名
+    """
+    filepath = os.path.join(path, file)
+    try:
+        rb = xlrd.open_workbook(filepath, formatting_info=True)
+    except:
+        return False
+    sheet_names = rb.sheet_names()
+    space_line = 0
+    for ws_name in sheet_names:
+        ws = rb.sheet_by_name(ws_name)
+        rows = ws.nrows
+        cols = ws.ncols
+        for r in range(rows):
+            values = [ws.cell(r, c).value for c in range(cols)]
+            values = change_datatype(values)
+            values = " ".join(values)
+            if values:
+                yield filepath, ws_name, r, values  # 文件路径，工作表名，行数，行内容
+            else:
+                if space_line < 10:
+                    space_line += 1
+                else:
+                    break
+
+
+def process_xlsx(path, file):
+    """
+    读取xlsx后缀的excel文件
+    参数：
+        path：文件所在路径
+        file：文件名
+    """
+    filepath = os.path.join(path, file)
+    try:
+        wb = openpyxl.load_workbook(filepath, read_only=True, data_only=True)
+    except:
+        return False
+    worksheets_name = wb.sheetnames
+    space_line = 0
+    for ws_name in worksheets_name:
+        ws = wb[ws_name]
+        for index, row in enumerate(ws.rows):
+            values = [r.value for r in row if r.value != None]
+            values = change_datatype(values)
+            values = " ".join(values)
+            if values:
+                yield filepath, ws_name, index, values  # 文件路径，工作表名，行数，行内容
+            else:
+                if space_line < 10:
+                    space_line += 1
+                else:
+                    break
+
+
+def find_excel_data(search_key: str, target_dir:str):
+    """
+    检索指定目录下的excel文件和过滤
+    参数：
+        search_key：检索的关键词
+        target_dir：目标文件夹
+    """
+    for path, dirs, files in os.walk(target_dir):
+        files = [file for file in files if not file.startswith('~$')]  # 过滤掉正打开的excel文件
+        xls_files = [file for file in files if file.endswith('.xls')]  # 取出所有的xls后缀文件
+        xlsx_files = [file for file in files if file.endswith('.xlsx')]  # 取出所有的xlsx后缀文件
+        for xls in xls_files:
+            for data in process_xls(path, xls):
+                filepath, ws_name, index, values = data
+                status = find_key(search_key, values)
+                if status:
+                    yield filepath, ws_name, index, values
+        for xlsx in xlsx_files:
+            for data in process_xlsx(path, xlsx):
+                filepath, ws_name, index, values = data
+                status = find_key(search_key, values)
+                if status:
+                    yield filepath, ws_name, index, values
+
+
+if __name__ == '__main__':
+    import time
+    time1 = time.time()
+    search_key = '胡'
+    target_dir = '/Users/buladou/workspace/testdata/8.12'
+    for data in find_excel_data(search_key, target_dir):
+        print(list(data))
+    time2 = time.time()
+    print("\n程序运行结束，停止运行。{}".format(round(time2-time1, 2)))
\ No newline at end of file
-- 
Gitee