208 lines
5 KiB
Python
208 lines
5 KiB
Python
|
#!/usr/bin/env python
|
||
|
# A file folder scanner contributed by @holger
|
||
|
#
|
||
|
# You can specify the scanned folder and file pattern by changing rootPath
|
||
|
# and pattern variables
|
||
|
#
|
||
|
|
||
|
__author__ = "holger"
|
||
|
|
||
|
from treelib import tree
|
||
|
|
||
|
import fnmatch
|
||
|
import os
|
||
|
import zlib
|
||
|
import argparse
|
||
|
|
||
|
# Script-wide switches and scan statistics.
DEBUG = 0  # any truthy value turns on the verbose trace output
FILECOUNT, DIRCOUNT = 0, 0  # number of file / directory nodes created
DIR_ERRORLIST, FILE_ERRORLIST = [], []  # paths whose parent id was reported as None


# Time Profiling
#   0 - nothing
#   1 - time
#   2 - cProfile
PROFILING = 0

# Only pull in the profiler that the selected mode actually needs.
if PROFILING == 1:
    import timeit
elif PROFILING == 2:
    import cProfile
|
||
|
|
||
|
|
||
|
# Command line interface: two required positional arguments, the folder to
# scan and the fnmatch-style pattern used to select files.
parser = argparse.ArgumentParser(
    description="Scan the given folder and print its structure in a tree."
)
parser.add_argument(
    "abspath",
    help="An absolute path to be scanned.",
    type=str,
)
parser.add_argument(
    "pattern",
    help="File name pattern to filtered, e.g. *.pdf",
    type=str,
)

args = parser.parse_args()
rootPath, pattern = args.abspath, args.pattern

# Folder names consulted by crawler() before it adds files; empty by default,
# so nothing is skipped.
folder_blacklist = []

# The tree starts out with one node labelled "Root" whose identifier is the
# scanned path itself.
dir_tree = tree.Tree()
dir_tree.create_node(tag="Root", identifier=rootPath)  # root node
|
||
|
|
||
|
|
||
|
def crc32(data):
    """Return the CRC-32 checksum of a text string as a hex string.

    - data is the text to hash; it is encoded as UTF-8 first

    The checksum is formatted with ``hex()``, so the result carries a
    ``0x`` prefix. When DEBUG is set, the encoded input and the checksum
    are also printed.
    """
    encoded = bytes(data, "UTF-8")
    # crc32 returns a signed value, &-ing it will match py3k
    checksum = hex(zlib.crc32(encoded) & 0xFFFFFFFF)

    if DEBUG:
        print("++++++ CRC32 ++++++")
        print("input: " + str(encoded))
        print("crc32: " + checksum)
        print("+++++++++++++++++++")

    return checksum
|
||
|
|
||
|
|
||
|
parent = rootPath
i = 1

# calculating start depth
#
# The depth of an entry is the number of "/" in its joined path minus
# start_depth, and entries directly inside rootPath must come out at depth 0
# so that get_parentid() attaches them to the root node. A plain
# rootPath.count("/") only achieves that when rootPath already ends with a
# separator; for "/home/user" it is one too small, first-level entries get
# depth 1 and reference a parent node that was never created.
# Joining with "" appends a trailing separator only when it is missing, so
# both spellings of the path yield the same start depth.
start_depth = os.path.join(rootPath, "").count("/")
|
||
|
|
||
|
|
||
|
def get_noteid(depth, root, dir):
    """Build the tree node identifier for an entry.

    - depth is the current depth of the entry in the folder hierarchy
    - root is the folder that contains the entry
    - dir is the name of the entry itself

    The identifier is made of the depth, the entry name and a checksum of
    the joined path ``root``/``dir``. All spaces in the depth/name part are
    replaced by '_':

    <depth>_<dirname>+++<crc32>
    e.g. 2_Folder_XYZ_1+++<crc32>
    """
    readable_part = (str(depth) + "_" + dir).replace(" ", "_")
    path_checksum = crc32(os.path.join(root, dir))
    return "+++".join((readable_part, path_checksum))
|
||
|
|
||
|
|
||
|
# TODO: check directory depth: what about something like /mp3/
|
||
|
|
||
|
|
||
|
def get_parentid(current_depth, root, dir):
    """Build the identifier of the node that an entry hangs under.

    - current_depth is the depth of the entry in the folder hierarchy
    - root is the folder that contains the entry
    - dir is the name of the entry itself

    At depth 0 the function returns ``root`` unchanged. Otherwise it returns
    <depth - 1>_<parent folder name>+++<crc32 of root>, with spaces in the
    folder name replaced by '_'.
    """
    # special case for the 'root' of the tree
    # because we don't want a cryptic root-name
    if current_depth == 0:
        return root

    # looking for parent directory
    # e.g. /home/user1/mp3/folder1/parent_folder/current_folder
    # get 'parent_folder': the text between the last two "/" of the full path
    full_path = os.path.join(root, dir)
    last_sep = full_path.rfind("/")
    previous_sep = full_path.rfind("/", 0, last_sep)
    parent_name = full_path[previous_sep + 1 : last_sep]  # noqa: E203

    readable_part = str(current_depth - 1) + "_" + parent_name.replace(" ", "_")
    # TODO: catch error
    return readable_part + "+++" + crc32(root)
|
||
|
|
||
|
|
||
|
def print_node(dir, node_id, parent_id):
    """Print a short debug report for a node.

    - dir is the name of the entry
    - node_id is the identifier of the node
    - parent_id is the identifier of its parent node
    """
    print("#############################")
    print("node created")
    # One labelled line per value, in the order name, node id, parent id.
    for label, value in (
        (" dir: ", dir),
        (" note_id: ", node_id),
        (" parent: ", parent_id),
    ):
        print(label + value)
|
||
|
|
||
|
|
||
|
def _add_tree_node(root, name, error_list):
    """Create the dir_tree node for the entry ``name`` found in ``root``.

    - root is the folder currently reported by os.walk
    - name is the directory or file name inside ``root``
    - error_list receives the full path when the parent id comes back as None
    """
    full_path = os.path.join(root, name)

    # calculating current depth
    current_depth = full_path.count("/") - start_depth

    if DEBUG:
        print("current: " + full_path)

    node_id = get_noteid(current_depth, root, name)
    parent_id = str(get_parentid(current_depth, root, name))

    if parent_id == str(None):
        error_list.append(full_path)

    if DEBUG:
        print_node(name, node_id, parent_id)

    # create node
    dir_tree.create_node(name, node_id, parent_id)


def crawler():
    """Walk rootPath and add its folders and matching files to dir_tree.

    Every sub-directory becomes a node, and so does every file whose name
    matches ``pattern``. Files located in a folder whose name is listed in
    ``folder_blacklist`` are skipped. DIRCOUNT and FILECOUNT are increased by
    one for each directory / file node that was created.
    """
    global DIRCOUNT
    global FILECOUNT

    for root, dirs, files in os.walk(rootPath):
        # +++ DIRECTORIES +++
        for dirname in dirs:
            _add_tree_node(root, dirname, DIR_ERRORLIST)
            DIRCOUNT += 1

        # +++ FILES +++
        # The blacklist is checked against the folder that holds the files.
        # The previous code read the leftover loop variable of the directory
        # loop instead, which is unbound when the first walked folder has no
        # sub-directories and otherwise names an unrelated directory.
        # normpath drops a trailing separator so basename is never empty for
        # a path such as "/data/music/".
        containing_folder = os.path.basename(os.path.normpath(root))
        if containing_folder in folder_blacklist:
            continue

        for filename in fnmatch.filter(files, pattern):
            # Problems are recorded with the path of the file itself.
            _add_tree_node(root, filename, FILE_ERRORLIST)
            FILECOUNT += 1
|
||
|
|
||
|
|
||
|
# Run the scan, plainly or under the profiler selected by PROFILING.
if PROFILING == 0:
    crawler()
elif PROFILING == 1:
    t1 = timeit.Timer("crawler()", "from __main__ import crawler")
    print("time: " + str(t1.timeit(number=1)))
elif PROFILING == 2:
    cProfile.run("crawler()")


# Summary of what was added to the tree.
print("filecount: " + str(FILECOUNT))
print("dircount: " + str(DIRCOUNT))

# Print the recorded directory problems, or a single "all fine" line when
# the list is empty.
for item in DIR_ERRORLIST or ["no directory errors"]:
    print(item)

print("\n\n\n")

# Same report for files.
for item in FILE_ERRORLIST or ["no file errors"]:
    print(item)

print("nodes: " + str(len(dir_tree.nodes)))

dir_tree.show()
|