-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfiles_merge_script.py
103 lines (81 loc) · 4.11 KB
/
files_merge_script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#! files_merge_script.py
"""
This script reads all the Excel files and merges them into a single file.
--- IMPORTANT, All files must have the same scheme/structure ---
The desired output can be changed using the DESIRED_COLUMNS variable and also by modifying the sheet_name='' parameter.
"""
from datetime import datetime
import pandas as pd
import logging
import os
# Added logging configuration to replace prints statements
logging.basicConfig(filename=f'logs/{datetime.today().strftime("%H_%M_%S_%d_%m_%Y")}-files_merge_script.log',
level=logging.INFO, format='%(asctime)s:%(levelname)s:%(message)s')
#TODO: Insert path here, eg. "C:/User/Documents/Files"
PATH_TO_FILES = ""
#TODO: Insert the columns to keep from the files
DESIRED_COLUMNS = [
"Column 1",
"Column 2",
"Column 3",
]
def get_files_in_path(path:str)->list[str]:
"""
Function to extract the name and path for files in a folder and subfolders
"""
# Read all the files in the directory that matches the .endswith parameter, in this case Excel files (.xlsm)
excel_files = [os.path.join(root, name)
for root, dirs, files in os.walk(path)
for name in files if name.endswith('.xlsm')]
# TODO: Remove "PLACEHOLDER" with the needed word, this can be usefull if we only want to read files starting with an specific word, eg. "sales-july-2024.xlsm"
# Remove the files that doesn't start with "PLACEHOLDER" and contains '~' in its name (temporary files)
cleaned_excel_files = [file_name for file_name in excel_files
if "PLACEHOLDER" in file_name and '~' not in file_name]
# Save into the log the number of files found
logging.info(f'Found {len(cleaned_excel_files)} files.')
return cleaned_excel_files
def get_files_content(files_path:list)->list:
"""
Function to read the files found in the get_files_in_path(path:str) function.
"""
dataframes = []
for i, file in enumerate(files_path, start=1):
try:
# Read the details from the sheet 'IND' and aditional details from the sheet 'Reg'
# TODO: replace "SHEET_NAME1" and "SHEET_NAME2" with real sheet names
# the next line is used to read some specific sections of the book and ad them later to the dataframe
# specific_details = pd.read_excel(file, sheet_name='SHEET_NAME1', engine='calamine').iloc[[0,1],6]
df = pd.read_excel(file, sheet_name='SHEET_NAME2', skiprows=8, nrows=50)
# Adds two columns to the Dataframe using the data from line 60
# TODO: replace "SPECIFIC_DETAIL1" and "SPECIFIC_DETAIL2" with the name of the actual columns
# df['SPECIFIC_DETAIL1'] = specific_details.iloc[1]
# df['SPECIFIC_DETAIL2'] = specific_details.iloc[0]
# Adds the Dataframe to the list
dataframes.append(df)
# Add the number and name of the file to the logs
logging.info(f'Processed file {i}: {file}')
except Exception as e:
# In case of error, write to the logs
logging.error(f'Error processing file {file}: {e}')
return dataframes
def initiate()->None:
"""
Initialize the script:
1 - Extract the details of the files (name and path) in the selected directory
2 - Reads all the files and saves them into a list
"""
_files = get_files_in_path(PATH_TO_FILES)
df_list = get_files_content(_files)
if df_list:
# Get all the dataframes and merg them into one, drop the duplicates and then saves the final dataframe into the desired directory
final_df = pd.concat(df_list, ignore_index=True)
final_df = final_df[DESIRED_COLUMNS].drop_duplicates()
# TODO: add the actual path and dame of the file to be saved
final_df.to_excel("", index=False)
# If success, write to the logs
logging.info(f'Done {len(_files)} files processed.')
else:
# In case of error, write to the logs
logging.warning('No dataframes to concatenate')
if __name__ == '__main__':
initiate()