Skip to content

Commit 64324b3

Browse files
committed
Style tweaks
1 parent 15598ba commit 64324b3

File tree

8 files changed

+104
-107
lines changed

8 files changed

+104
-107
lines changed

clean

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -16,14 +16,14 @@ def main():
1616
parser.add_argument("targets", metavar="TARGET", nargs="+", help="an XHTML or SVG file, or a directory containing XHTML or SVG files")
1717
args = parser.parse_args()
1818

19-
#Check for required utilities
19+
# Check for required utilities
2020
xmllint_path = shutil.which("xmllint")
2121

2222
if xmllint_path is None:
2323
se.print_error("Couldn't locate xmllint. Is it installed?")
2424
exit(1)
2525

26-
#Tell xmllint to indent with tabs using an environmental variable
26+
# Tell xmllint to indent with tabs using an environment variable
2727
env = os.environ.copy()
2828
env["XMLLINT_INDENT"] = "\t"
2929

@@ -44,7 +44,7 @@ def main():
4444
target_filenames.add(target)
4545

4646
for filename in target_filenames:
47-
#If we're cleaning a directory and setting single lines, skip the colophon, which has special spacing
47+
# If we're cleaning a directory and setting single lines, skip the colophon, which has special spacing
4848
if args.single_lines and filename.endswith("colophon.xhtml") and os.path.isdir(target):
4949
continue
5050

@@ -56,15 +56,15 @@ def main():
5656
processed_xhtml = processed_xhtml.replace("\n", " ")
5757
processed_xhtml = regex.sub(r"\s+", " ", processed_xhtml)
5858

59-
#Epub3 doesn't allow named entities, so convert them to their unicode equivalents
60-
#But, don't unescape the content.opf long-description accidentally
59+
# Epub3 doesn't allow named entities, so convert them to their unicode equivalents
60+
# But, don't unescape the content.opf long-description accidentally
6161
if not filename.endswith("content.opf"):
6262
processed_xhtml = html.unescape(processed_xhtml).replace("&", "&amp;")
6363

64-
#Remove unnecessary doctypes which can cause xmllint to hang
64+
# Remove unnecessary doctypes which can cause xmllint to hang
6565
processed_xhtml = regex.sub(r"<!DOCTYPE[^>]+?>", "", processed_xhtml, flags=regex.MULTILINE | regex.DOTALL)
6666

67-
#First, canonicalize XHTML
67+
# First, canonicalize XHTML
6868
result = subprocess.run([xmllint_path, "--c14n", "-"], input=processed_xhtml.encode(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
6969
processed_xhtml = result.stdout.decode()
7070
error = result.stderr.decode().strip()
@@ -73,10 +73,10 @@ def main():
7373
se.print_error("Couldn't parse {}; files must be in XHTML format, which is not the same as HTML\n{}".format(filename, error.replace("-:", "Line ")))
7474
exit(1)
7575

76-
#Next, add the XML header that xmllint stripped during c14n
76+
# Next, add the XML header that xmllint stripped during c14n
7777
processed_xhtml = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" + processed_xhtml
7878

79-
#Next, pretty-print XML
79+
# Next, pretty-print XML
8080
processed_xhtml = subprocess.run([xmllint_path, "--format", "-"], input=processed_xhtml.encode(), stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env).stdout.decode()
8181

8282
if processed_xhtml != xhtml:

interactive-sr

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,17 +17,17 @@ EOF
1717
}
1818
require(){ command -v $1 > /dev/null 2>&1 || { suggestion=""; if [ ! -z "$2" ]; then suggestion=" $2"; fi; die "$1 is not installed.${suggestion}"; } }
1919
if [ $# -eq 1 ]; then if [ "$1" = "--help" -o "$1" = "-h" ]; then usage; fi fi
20-
#End boilerplate
20+
# End boilerplate
2121

2222
require "vim" "Try: apt-get install vim"
2323

24-
#Store the regex
24+
# Store the regex
2525
regex="$1"
2626

27-
#Remove the regex from the list of arguments
27+
# Remove the regex from the list of arguments
2828
shift
2929

30-
#'set title' shows the filename in the terminal title
31-
#'set eventignore-=Syntax' enables syntax highlighting in all files
32-
#'wqa writes and quits all buffers
30+
# 'set title' shows the filename in the terminal title
31+
# 'set eventignore-=Syntax' enables syntax highlighting in all files
32+
# 'update' writes each modified buffer, and 'qa' quits all buffers
3333
vim "+silent set title" "+silent bufdo set eventignore-=Syntax | %s${regex}gce | silent update" "+silent qa" $@

lint

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def main():
3636
if args.verbose or (messages and len(args.directories) > 1):
3737
print(colored(se_epub.directory, "white", attrs=["reverse"]))
3838

39-
#Print the table
39+
# Print the table
4040
if messages:
4141
for message in messages:
4242
if message.is_submessage:

modernize-spelling

Lines changed: 57 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -7,64 +7,6 @@ import regex
77
import se
88

99

10-
DICTIONARY_FILE_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data", "words")
11-
12-
13-
def main():
14-
parser = argparse.ArgumentParser(description="Modernize spelling of some archaic words, and replace words that may be archaically compounded with a dash to a more modern spelling. For example, replace \"ash-tray\" with \"ashtray\".")
15-
parser.add_argument("-v", "--verbose", action="store_true", help="increase output verbosity")
16-
parser.add_argument("-n", "--no-hyphens", dest="modernize_hyphenation", action="store_false", help="don't modernize hyphenation")
17-
parser.add_argument("targets", metavar="TARGET", nargs="+", help="an XHTML file, or a directory containing XHTML files")
18-
args = parser.parse_args()
19-
20-
try:
21-
dictionary = set(line.strip().lower() for line in open(DICTIONARY_FILE_PATH))
22-
except Exception:
23-
se.print_error("Couldn't open words file at {}".format(DICTIONARY_FILE_PATH))
24-
exit(1)
25-
26-
for target in args.targets:
27-
target = os.path.abspath(target)
28-
29-
if args.verbose:
30-
print("Processing {} ...".format(target), end="", flush=True)
31-
32-
target_filenames = set()
33-
if os.path.isdir(target):
34-
for root, _, filenames in os.walk(target):
35-
for filename in fnmatch.filter(filenames, "*.xhtml"):
36-
target_filenames.add(os.path.join(root, filename))
37-
else:
38-
target_filenames.add(target)
39-
40-
41-
for filename in target_filenames:
42-
with open(filename, "r+", encoding="utf-8") as file:
43-
xhtml = file.read()
44-
new_xhtml = xhtml
45-
46-
# What language are we using?
47-
language = regex.search(r"<html[^>]+?xml:lang=\"([^\"]+)\"", xhtml)
48-
if language is None or (language.group(1) != "en-US" and language.group(1) != "en-GB"):
49-
if args.verbose:
50-
print("\n\t", end="", flush=True)
51-
se.print_error("No valid xml:lang attribute in <html> root. Only en-US and en-GB are supported. File: {}".format(filename))
52-
exit(1)
53-
54-
new_xhtml = modernize_spelling(new_xhtml, language.group(1))
55-
56-
if args.modernize_hyphenation:
57-
new_xhtml = modernize_hyphenation(new_xhtml, dictionary)
58-
59-
if new_xhtml != xhtml:
60-
file.seek(0)
61-
file.write(new_xhtml)
62-
file.truncate()
63-
64-
if args.verbose:
65-
print(" OK")
66-
67-
6810
def modernize_hyphenation(xhtml, dictionary):
6911
# Easy fix for a common case
7012
xhtml = regex.sub(r"\b([Nn])ow-a-days\b", r"\1owadays", xhtml) # now-a-days -> nowadays
@@ -86,12 +28,11 @@ def modernize_hyphenation(xhtml, dictionary):
8628

8729
return xhtml
8830

89-
9031
def modernize_spelling(xhtml, language):
9132
# ADDING NEW WORDS TO THIS LIST:
9233
# A good way to check if a word is "archaic" is to do a Google N-Gram search: https://books.google.com/ngrams/graph?case_insensitive=on&year_start=1800&year_end=2000&smoothing=3
9334
# Remember that en-US and en-GB differ significantly, and just because a word might seem strange to you, doesn't mean it's not the common case in the other variant.
94-
# If Google N-Gram shows that a word has declined significantly in usage in BOTH en-US and en-GB (or the SE editor makes an exception) then it may be a good candidate to add to this list.
35+
# If Google N-Gram shows that a word has declined significantly in usage in BOTH en-US and en-GB (or the SE editor-in-chief makes an exception) then it may be a good candidate to add to this list.
9536

9637
xhtml = regex.sub(r"\b([Dd])evelope\b", r"\1evelop", xhtml) # develope -> develop
9738
xhtml = regex.sub(r"\b([Oo])ker\b", r"\1cher", xhtml) # oker -> ocher
@@ -200,6 +141,62 @@ def modernize_spelling(xhtml, language):
200141

201142
return xhtml
202143

144+
def main():
145+
parser = argparse.ArgumentParser(description="Modernize spelling of some archaic words, and replace words that may be archaically compounded with a dash to a more modern spelling. For example, replace \"ash-tray\" with \"ashtray\".")
146+
parser.add_argument("-v", "--verbose", action="store_true", help="increase output verbosity")
147+
parser.add_argument("-n", "--no-hyphens", dest="modernize_hyphenation", action="store_false", help="don't modernize hyphenation")
148+
parser.add_argument("targets", metavar="TARGET", nargs="+", help="an XHTML file, or a directory containing XHTML files")
149+
args = parser.parse_args()
150+
151+
dictionary_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data", "words")
152+
153+
try:
154+
dictionary = set(line.strip().lower() for line in open(dictionary_file_path))
155+
except Exception:
156+
se.print_error("Couldn't open words file at {}".format(dictionary_file_path))
157+
exit(1)
158+
159+
for target in args.targets:
160+
target = os.path.abspath(target)
161+
162+
if args.verbose:
163+
print("Processing {} ...".format(target), end="", flush=True)
164+
165+
target_filenames = set()
166+
if os.path.isdir(target):
167+
for root, _, filenames in os.walk(target):
168+
for filename in fnmatch.filter(filenames, "*.xhtml"):
169+
target_filenames.add(os.path.join(root, filename))
170+
else:
171+
target_filenames.add(target)
172+
173+
174+
for filename in target_filenames:
175+
with open(filename, "r+", encoding="utf-8") as file:
176+
xhtml = file.read()
177+
new_xhtml = xhtml
178+
179+
# What language are we using?
180+
language = regex.search(r"<html[^>]+?xml:lang=\"([^\"]+)\"", xhtml)
181+
if language is None or (language.group(1) != "en-US" and language.group(1) != "en-GB"):
182+
if args.verbose:
183+
print("\n\t", end="", flush=True)
184+
se.print_error("No valid xml:lang attribute in <html> root. Only en-US and en-GB are supported. File: {}".format(filename))
185+
exit(1)
186+
187+
new_xhtml = modernize_spelling(new_xhtml, language.group(1))
188+
189+
if args.modernize_hyphenation:
190+
new_xhtml = modernize_hyphenation(new_xhtml, dictionary)
191+
192+
if new_xhtml != xhtml:
193+
file.seek(0)
194+
file.write(new_xhtml)
195+
file.truncate()
196+
197+
if args.verbose:
198+
print(" OK")
199+
203200

204201
if __name__ == "__main__":
205202
main()

reading-ease

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ def get_word_count(text):
1717
return len(text.split())
1818

1919
def get_syllable_count(word):
20-
#see http://eayd.in/?p=232
20+
# See http://eayd.in/?p=232
2121
exception_add = ["serious", "crucial"]
2222
exception_del = ["fortunately", "unfortunately"]
2323

@@ -26,15 +26,15 @@ def get_syllable_count(word):
2626

2727
pre_one = ["preach"]
2828

29-
syls = 0 #added syllable number
30-
disc = 0 #discarded syllable number
29+
syls = 0 # Added syllable number
30+
disc = 0 # Discarded syllable number
3131

32-
#1) if letters < 3: return 1
32+
# 1) if letters <= 3: return 1
3333
if len(word) <= 3:
3434
syls = 1
3535
return syls
3636

37-
#2) if doesn't end with "ted" or "tes" or "ses" or "ied" or "ies", discard "es" and "ed" at the end.
37+
# 2) if doesn't end with "ted" or "tes" or "ses" or "ied" or "ies", discard "es" and "ed" at the end.
3838
# if it has only 1 vowel or 1 set of consecutive vowels, discard. (like "speed", "fled" etc.)
3939
if word[-2:] == "es" or word[-2:] == "ed":
4040
double_and_triple_1 = len(regex.findall(r"[eaoui][eaoui]", word))
@@ -44,7 +44,7 @@ def get_syllable_count(word):
4444
else:
4545
disc += 1
4646

47-
#3) discard trailing "e", except where ending is "le"
47+
# 3) discard trailing "e", except where ending is "le"
4848
le_except = ["whole", "mobile", "pole", "male", "female", "hale", "pale", "tale", "sale", "aisle", "whale", "while"]
4949

5050
if word[-1:] == "e":
@@ -54,45 +54,45 @@ def get_syllable_count(word):
5454
else:
5555
disc += 1
5656

57-
#4) check if consecutive vowels exists, triplets or pairs, count them as one.
57+
# 4) check if consecutive vowels exist, triplets or pairs, count them as one.
5858
double_and_triple = len(regex.findall(r"[eaoui][eaoui]", word))
5959
tripple = len(regex.findall(r"[eaoui][eaoui][eaoui]", word))
6060
disc += double_and_triple + tripple
6161

62-
#5) count remaining vowels in word.
62+
# 5) count remaining vowels in word.
6363
num_vowels = len(regex.findall(r"[eaoui]", word))
6464

65-
#6) add one if starts with "mc"
65+
# 6) add one if starts with "mc"
6666
if word[:2] == "mc":
6767
syls += 1
6868

69-
#7) add one if ends with "y" but is not surrouned by vowel
69+
# 7) add one if ends with "y" but is not preceded by a vowel
7070
if word[-1:] == "y" and word[-2] not in "aeoui":
7171
syls += 1
7272

73-
#8) add one if "y" is surrounded by non-vowels and is not in the last word.
73+
# 8) add one if "y" is surrounded by non-vowels and is neither the first nor the last letter of the word.
7474
for i, j in enumerate(word):
7575
if j == "y":
7676
if (i != 0) and (i != len(word) - 1):
7777
if word[i - 1] not in "aeoui" and word[i + 1] not in "aeoui":
7878
syls += 1
7979

80-
#9) if starts with "tri-" or "bi-" and is followed by a vowel, add one.
80+
# 9) if starts with "tri-" or "bi-" and is followed by a vowel, add one.
8181
if word[:3] == "tri" and word[3] in "aeoui":
8282
syls += 1
8383

8484
if word[:2] == "bi" and word[2] in "aeoui":
8585
syls += 1
8686

87-
#10) if ends with "-ian", should be counted as two syllables, except for "-tian" and "-cian"
87+
# 10) if ends with "-ian", should be counted as two syllables, except for "-tian" and "-cian"
8888
if word[-3:] == "ian":
8989
#and (word[-4:] != "cian" or word[-4:] != "tian"):
9090
if word[-4:] == "cian" or word[-4:] == "tian":
9191
pass
9292
else:
9393
syls += 1
9494

95-
#11) if starts with "co-" and is followed by a vowel, check if exists in the double syllable dictionary, if not, check if in single dictionary and act accordingly.
95+
# 11) if starts with "co-" and is followed by a vowel, check if exists in the double syllable dictionary, if not, check if in single dictionary and act accordingly.
9696
if word[:2] == "co" and word[2] in "eaoui":
9797

9898
if word[:4] in co_two or word[:5] in co_two or word[:6] in co_two:
@@ -102,14 +102,14 @@ def get_syllable_count(word):
102102
else:
103103
syls += 1
104104

105-
#12) if starts with "pre-" and is followed by a vowel, check if exists in the double syllable dictionary, if not, check if in single dictionary and act accordingly.
105+
# 12) if starts with "pre-" and is followed by a vowel, check if exists in the double syllable dictionary, if not, check if in single dictionary and act accordingly.
106106
if word[:3] == "pre" and word[3] in "eaoui":
107107
if word[:6] in pre_one:
108108
pass
109109
else:
110110
syls += 1
111111

112-
#13) check for "-n't" and cross match with dictionary to add syllable.
112+
# 13) check for "-n't" and cross match with dictionary to add syllable.
113113
negative = ["doesn't", "isn't", "shouldn't", "couldn't", "wouldn't"]
114114

115115
if word[-3:] == "n't":
@@ -118,14 +118,14 @@ def get_syllable_count(word):
118118
else:
119119
pass
120120

121-
#14) Handling the exceptional words.
121+
# 14) Handling the exceptional words.
122122
if word in exception_del:
123123
disc += 1
124124

125125
if word in exception_add:
126126
syls += 1
127127

128-
# calculate the output
128+
# Calculate the output
129129
return num_vowels - disc + syls
130130

131131
def main():

semanticate

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def main():
3333
xhtml = file.read()
3434
processed_xhtml = xhtml
3535

36-
#Some common abbreviations
36+
# Some common abbreviations
3737
processed_xhtml = regex.sub(r"(?<!\<abbr\>)Mr\.", r"<abbr>Mr.</abbr>", processed_xhtml)
3838
processed_xhtml = regex.sub(r"(?<!\<abbr\>)Mrs\.", r"<abbr>Mrs.</abbr>", processed_xhtml)
3939
processed_xhtml = regex.sub(r"(?<!\<abbr\>)Ms\.", r"<abbr>Ms.</abbr>", processed_xhtml)
@@ -77,21 +77,21 @@ def main():
7777
processed_xhtml = regex.sub(r"""(?<!\<abbr class="era"\>)B\.?C""", r"""<abbr class="era">BC</abbr>""", processed_xhtml)
7878
processed_xhtml = regex.sub(r"""(?<!\<abbr class="time"\>)([ap])\.\s?m\.""", r"""<abbr class="time">\1.m.</abbr>""", processed_xhtml)
7979

80-
#Guess at adding eoc class
80+
# Guess at adding eoc class
8181
processed_xhtml = regex.sub(r"""<abbr>([a-zA-Z\.]+?\.)</abbr></p>""", r"""<abbr class="eoc">\1</abbr></p>""", processed_xhtml)
8282
processed_xhtml = regex.sub(r"""<abbr>etc\.</abbr>(\s+[A-Z])""", r"""<abbr class="eoc">etc.</abbr>\1""", processed_xhtml)
8383

84-
#Clean up nesting errors
84+
# Clean up nesting errors
8585
processed_xhtml = regex.sub(r"""<abbr class="eoc"><abbr>([^<]+)</abbr></abbr>""", r"""<abbr class="eoc">\1</abbr>""", processed_xhtml)
8686

87-
#Get Roman numerals >= 2 characters
88-
#We only wrap these if they're standalone (i.e. not already wrapped in a tag) to prevent recursion in multiple runs
87+
# Get Roman numerals >= 2 characters
88+
# We only wrap these if they're standalone (i.e. not already wrapped in a tag) to prevent recursion in multiple runs
8989
processed_xhtml = regex.sub(r"([^a-zA-Z>])([ixvIXV]{2,})(\b)", r"""\1<span epub:type="z3998:roman">\2</span>\3""", processed_xhtml)
9090

91-
#Get Roman numerals that are X or V and single characters. We can't do I for obvious reasons.
91+
# Get Roman numerals that are X or V and single characters. We can't do I for obvious reasons.
9292
processed_xhtml = regex.sub(r"""([^a-zA-Z>\"])([vxVX])(\b)""", r"""\1<span epub:type="z3998:roman">\2</span>\3""", processed_xhtml)
9393

94-
#We may have added HTML tags within title tags. Remove those here
94+
# We may have added HTML tags within title tags. Remove those here
9595
soup = BeautifulSoup(processed_xhtml, "lxml")
9696
processed_xhtml = regex.sub(r"<title>.+?</title>", "<title>" + soup.title.text + "</title>", processed_xhtml)
9797

0 commit comments

Comments
 (0)