Source code for lotecc.lotecc

import os
import re
from fnmatch import fnmatch
from typing import Optional

from opencc import OpenCC
from pydantic import BaseModel, validator


def check_file_exist(filename: str) -> str:
    """
    Check if the file exists in the current directory or the upper directories.

    The filename is first tried as given (relative to the current working
    directory, or as an absolute path). If that fails, every ancestor of the
    current working directory is probed, from the nearest up to the
    filesystem root.

    :param filename: str, the filename.
    :return: str, an abspath of the file if it exists, else an empty string.
    """
    if os.path.isfile(filename):
        return os.path.abspath(filename)

    # ``os.path.split`` only yields a (head, tail) pair, so it cannot be used
    # to enumerate ancestors; climb with ``dirname`` until it stops changing.
    current = os.getcwd()
    while True:
        candidate = os.path.join(current, filename)
        if os.path.isfile(candidate):
            return candidate
        parent = os.path.dirname(current)
        if parent == current:
            # Reached the root without finding the file.
            return ''
        current = parent
def read_ignores(ignore_file: str) -> list:
    """
    Load ignore patterns from a file written in .gitignore syntax.

    Lines that do not begin with a non-whitespace character (blank or
    indented lines) and lines whose stripped text starts with ``#`` are
    skipped. Every ``/`` is removed from the kept patterns, and ``.git`` is
    always appended at the end.

    :param ignore_file: str, the ignore file, may not be the correct path.
    :return: list, the ignore patterns; empty if the file cannot be located.
    """
    resolved_path = check_file_exist(ignore_file)
    if not resolved_path:
        return []

    begins_with_text = re.compile(r'\S')
    patterns = []
    with open(resolved_path, 'r') as handle:
        for raw_line in handle:
            if not begins_with_text.match(raw_line):
                continue
            stripped = raw_line.strip()
            if stripped.startswith('#'):
                continue
            patterns.append(stripped.replace('/', ''))

    patterns.append('.git')
    return patterns
def get_list_of_files(dir_name: str, ignores: list) -> list:
    """
    Recursively collect the files below a directory, skipping ignored entries.

    An entry (file or directory) is skipped when its base name matches any of
    the ``fnmatch`` patterns in ``ignores``; a skipped directory is not
    descended into.

    :param dir_name: str, the root directory.
    :param ignores: list, the patterns to exclude.
    :return: list, absolute paths of the files that were not ignored.
    """
    collected = []
    for name in os.listdir(dir_name):
        path = os.path.abspath(os.path.join(dir_name, name))
        base_name = os.path.basename(path)
        if any(fnmatch(base_name, pattern) for pattern in ignores):
            continue
        if os.path.isdir(path):
            collected.extend(get_list_of_files(path, ignores))
        else:
            collected.append(path)
    return collected
class LoteccConfig(BaseModel):
    """
    Validated settings for a conversion run.

    ``output`` and ``suffix`` are declared ``Optional`` so that an explicit
    ``None`` is accepted as well as an omitted argument.
    """

    # Conversion name; checked and lower-cased by ``valid_conversion``.
    conversion: str = 's2t'
    # Input file or directory.
    input: str = '.'
    # Output file or directory, or None when not given.
    output: Optional[str] = None
    # Encoding used to read input.
    in_enc: str = 'utf-8'
    # Encoding used to write output.
    out_enc: str = 'utf-8'
    # Suffix for the output filename, or None when not given.
    suffix: Optional[str] = None
    # A .gitignore syntax file, or comma-separated patterns.
    ignore: str = '.gitignore'

    @property
    def ignore_patterns(self) -> list:
        """
        Read ignore patterns from file, or get from input string.

        When ``read_ignores`` returns nothing for ``ignore``, the value itself
        is split on commas and used as the pattern list.

        :return: list, the .gitignore syntax patterns.
        """
        return read_ignores(self.ignore) or self.ignore.split(',')

    @property
    def input_files(self) -> list:
        """
        List files filtered by ignore patterns in the input directory,
        or the single input file.

        :return: list, the files to convert.
        :raise: ValueError if ``input`` is neither a file nor a directory.
        """
        if os.path.isdir(self.input):
            return get_list_of_files(self.input, self.ignore_patterns)
        if os.path.isfile(self.input):
            return [os.path.abspath(self.input)]
        raise ValueError(f'<{self.input}> is not a file or directory')

    @validator('conversion')
    def valid_conversion(cls, v: str) -> str:
        """
        Validate argument of conversion.

        :param v: str, the value of conversion.
        :return: str, the lower-cased value of conversion.
        :raise: ValueError if conversion is not supported.
        """
        supported = ('hk2s', 's2hk', 's2t', 's2tw', 's2twp',
                     't2hk', 't2s', 't2tw', 'tw2s', 'tw2sp')
        lowered = v.lower()
        if lowered in supported:
            return lowered
        raise ValueError(f'Error: conversion <{v}> not support')
def lote_chinese_conversion(**kwargs):
    """
    Convert files between Simplified Chinese and Traditional Chinese.

    Files that cannot be decoded with the input encoding, and files whose
    text is unchanged by the conversion, are skipped and not reported.

    :param conversion: str, default 's2t', the conversion method.
    :param input: str, default '.', an input file or a directory.
    :param output: str, default None, an output file or a directory.
    :param in_enc: str, default 'utf-8', encoding for input.
    :param out_enc: str, default 'utf-8', encoding for output.
    :param suffix: str, default None, suffix of output filename.
    :param ignore: str, default '.gitignore', a .gitignore syntax file,
        or patterns, separated by commas.
    :return: list, list of tuples, a tuple contains source file and
        converted file.
    """
    settings = LoteccConfig(**kwargs)
    converter = OpenCC(settings.conversion)
    results = []

    for source_path in settings.input_files:
        try:
            with open(source_path, encoding=settings.in_enc) as reader:
                original_text = reader.read()
        except UnicodeDecodeError:
            # Cannot be decoded with the configured input encoding: skip it.
            continue

        converted_text = converter.convert(original_text)
        if converted_text == original_text:
            # Nothing changed, so nothing is written or reported.
            continue

        # Without an output setting the source file itself is the target.
        target_path = source_path
        if settings.output:
            if os.path.isdir(settings.output):
                target_path = os.path.abspath(
                    os.path.join(settings.output,
                                 os.path.basename(source_path)))
            else:
                target_path = os.path.abspath(settings.output)

        if settings.suffix:
            # The suffix goes between the file stem and its extension.
            stem, extension = os.path.splitext(target_path)
            target_path = stem + settings.suffix + extension

        with open(target_path, 'w', encoding=settings.out_enc) as writer:
            writer.write(converted_text)
        results.append((source_path, target_path))

    return results