Changeset View
Changeset View
Standalone View
Standalone View
kde-modules/httpupdate.py
- This file was added.
Property | Old Value | New Value |
---|---|---|
File Mode | null | 100755 |
1 | #!/usr/bin/env python3 | ||||
---|---|---|---|---|---|
2 | | ||||
3 | # Copyright 2019 Sandro Knauß <sknauss@kde.org> | ||||
4 | # Copyright 2019 Volker Krause <vkrause@kde.org> | ||||
5 | # | ||||
6 | # Redistribution and use in source and binary forms, with or without | ||||
7 | # modification, are permitted provided that the following conditions | ||||
8 | # are met: | ||||
9 | # | ||||
10 | # 1. Redistributions of source code must retain the copyright | ||||
11 | # notice, this list of conditions and the following disclaimer. | ||||
12 | # 2. Redistributions in binary form must reproduce the copyright | ||||
13 | # notice, this list of conditions and the following disclaimer in the | ||||
14 | # documentation and/or other materials provided with the distribution. | ||||
15 | # 3. The name of the author may not be used to endorse or promote products | ||||
16 | # derived from this software without specific prior written permission. | ||||
17 | # | ||||
18 | # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR | ||||
19 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES | ||||
20 | # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. | ||||
21 | # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, | ||||
22 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||||
23 | # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | ||||
24 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | ||||
25 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | ||||
26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF | ||||
27 | # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||||
28 | | ||||
29 | import collections | ||||
30 | import dns.resolver | ||||
31 | import glob | ||||
32 | import fnmatch | ||||
33 | import multiprocessing | ||||
34 | import os | ||||
35 | import re | ||||
36 | import requests | ||||
37 | import socket | ||||
38 | import ssl | ||||
39 | import sys | ||||
40 | import urllib.parse | ||||
41 | import urllib.request | ||||
42 | | ||||
# Module-wide DNS resolver pinned to public nameservers (Cloudflare 1.1.1.1,
# Google 8.8.8.8) so lookups do not depend on the local resolv.conf setup.
RESOLVER = dns.resolver.Resolver()
RESOLVER.nameservers=["1.1.1.1", "8.8.8.8"]
45 | | ||||
class HTTPChecker:
    """Extracts candidate http(s) URLs from files, honouring a regex blacklist."""

    def __init__(self, blacklist):
        self.blacklist = blacklist
        # Compile the blacklist once up front; all matching is case-insensitive.
        self.reBlacklist = [re.compile(r, re.I) for r in blacklist]
        # Candidate URL: scheme, then runs of characters excluding quote,
        # whitespace, wildcard, bracket and brace noise commonly adjacent
        # to URLs in source code and markup.
        self.updater = re.compile(r'(https?://[^. \t\n"\'?<>*#()\\{}][^ \t\n"\'?<>*#()\\{},]+)', re.I)

    def urls(self, fpath):
        """Return the set of URLs found in *fpath*, skipping any line that
        matches a blacklist pattern.

        May raise UnicodeDecodeError on binary files; callers handle that.
        """
        urls = set()
        with open(fpath, 'r') as f:
            for line in f:
                if any(r.search(line) for r in self.reBlacklist):
                    continue
                urls.update(self.updater.findall(line))
        return urls
63 | | ||||
class Url:
    """A single http(s) URL plus liveness / https-availability checks.

    checkState is None until check() runs, then True on success or the
    name of the first failing stage ("checkDns", "checkUrl",
    "checkSecureUrl").
    """

    def __init__(self, url):
        self.url = url
        m = re.match(r"^(https?)://([^/]+)((/.*)$|$)", self.url, re.I)
        self.protocol = m.group(1)
        self.domain = m.group(2)
        self.path = m.group(3)
        self.resolver = RESOLVER
        self.checkState = None

    def checkDns(self):
        """Return True if the domain has an A or AAAA record."""
        for rdtype in ("A", "AAAA"):
            try:
                self.resolver.query(self.domain, rdtype)
                return True
            except (dns.resolver.NoAnswer, dns.resolver.NXDOMAIN, dns.resolver.NoNameservers):
                pass
        return False

    def _isAlive(self, url):
        """Shared probe used by checkUrl()/checkSecureUrl().

        Tries requests first, then falls back to urllib (some servers
        reject one client but accept the other).  Several 4xx codes count
        as alive: they prove a responding server, just one that dislikes
        our anonymous GET.
        """
        try:
            r = requests.get(url, timeout=5)
            if r.status_code in (200, 400, 401, 403, 405, 406):
                return True
        except ssl.SSLError:
            return False
        except UnicodeEncodeError:
            pass
        except (requests.ConnectionError, requests.Timeout):
            pass
        try:
            u = urllib.request.urlopen(url, timeout=5)
            if u.status == 200:
                return True
        except ssl.SSLError:
            return False
        except UnicodeEncodeError:
            # urllib chokes on some IDN/unicode URLs requests handled;
            # treat as alive rather than flagging a false dead link.
            return True
        except (urllib.request.HTTPError, urllib.request.URLError, socket.timeout):
            return False
        return False

    def checkUrl(self):
        """Return True if the URL as written responds."""
        return self._isAlive(self.url)

    @property
    def secureUrl(self):
        """The same URL with an https scheme."""
        return "https://{}{}".format(self.domain, self.path)

    def checkSecureUrl(self):
        """Return True if the https variant of the URL responds."""
        return self._isAlive(self.secureUrl)

    def check(self):
        """Run the full pipeline; store the result in checkState and return it."""
        if not self.checkDns():
            self.checkState = "checkDns"
        elif not self.checkUrl():
            self.checkState = "checkUrl"
        elif self.protocol == "http" and not self.checkSecureUrl():
            self.checkState = "checkSecureUrl"
        else:
            self.checkState = True
        return self.checkState
152 | | ||||
def check(url):
    """Pool worker: run the Url's check() and hand the object back so
    results can be collected from imap_unordered."""
    url.check()
    return url
156 | | ||||
def loadBlacklist(path, ignoreAuto=False):
    """Read blacklist regex entries from *path*.

    A comment line ('# ...') annotates the entry on the following line.
    With ignoreAuto=True, the line following an "Auto:" comment is
    dropped.  A missing file yields an empty list.
    """
    entries = []
    pending_comment = ""
    try:
        with open(path) as src:
            for raw in src:
                found = re.match(r"^\s*#\s*(.*)", raw)
                if found:
                    pending_comment = found.group(1).strip()
                elif ignoreAuto and pending_comment.startswith("Auto:"):
                    # Auto-generated entry: skip it and forget the comment.
                    pending_comment = ""
                elif raw.strip():
                    entries.append(raw.strip())
                    pending_comment = ""
    except FileNotFoundError:
        pass
    return entries
178 | | ||||
def getBlacklist(path, ignoreAuto=False):
    """Combine every bundled *.htignore living next to this script with
    the project-specific blacklist file at *path*."""
    combined = []
    script_dir = os.path.dirname(__file__)
    for fname in glob.glob(os.path.join(script_dir, "*.htignore")):
        # reduce-warning.htignore holds auto-generated entries only;
        # drop it wholesale when auto entries are being ignored.
        if not (ignoreAuto and fname.endswith('reduce-warning.htignore')):
            combined.extend(loadBlacklist(fname, ignoreAuto))
    combined.extend(loadBlacklist(path, ignoreAuto))
    return combined
186 | | ||||
class GitIgnore:
    """Minimal .gitignore matcher based on fnmatch globs.

    Only plain glob patterns are supported; gitignore extras such as
    '!' negation or directory anchoring are not implemented.
    """

    def __init__(self, path):
        self.basepath = path
        self.patterns = []
        try:
            with open(os.path.join(path, '.gitignore')) as f:
                for line in f:
                    pattern = line.strip()
                    # Blank lines and '#' comments are not patterns in
                    # gitignore syntax; storing them made bogus patterns.
                    if pattern and not pattern.startswith('#'):
                        self.patterns.append(pattern)
        except FileNotFoundError:
            pass

    def match(self, path):
        """Return True if *path*, taken relative to basepath, matches any pattern."""
        rel = os.path.relpath(path, self.basepath)
        return any(fnmatch.fnmatch(rel, pattern) for pattern in self.patterns)
203 | | ||||
def main(path):
    """Scan the source tree at *path* for http(s) URLs, rewrite live
    http:// links to https:// in place, and record dead or
    https-less URLs as "Auto:" entries in the project's .htignore.
    """
    blacklist = getBlacklist(os.path.join(path, '.htignore'), ignoreAuto=True)
    checker = HTTPChecker(blacklist)
    gitIgnore = GitIgnore(path)
    # Never touch the ignore files themselves.
    gitIgnore.patterns.append('.htignore')
    gitIgnore.patterns.append('.gitignore')

    # url -> set of files that contain it
    urls = collections.defaultdict(set)
    for dirpath, dirnames, filenames in os.walk(path):
        parts = dirpath.split("/")
        # Skip VCS metadata, test data and bundled third-party code.
        if any(p in parts for p in ('.git', 'tests', 'autotests', '3rdparty')):
            continue
        for fname in filenames:
            fpath = os.path.join(dirpath, fname)
            if gitIgnore.match(fpath):
                continue
            try:
                for url in checker.urls(fpath):
                    urls[url].add(fpath)
            except UnicodeDecodeError:
                pass  # binary file; nothing to scan

    # Re-read the existing .htignore, keeping only manually written
    # entries (those without an "Auto:" comment) so they survive rewrite.
    manual_overwrites = []
    try:
        with open(os.path.join(path, '.htignore')) as f:
            comment = None
            for line in f:
                m = re.match(r"^\s*#\s*(.*)", line)
                if m:
                    comment = m.group(1).strip()
                    continue
                if comment and comment.startswith("Auto:"):
                    comment = None
                    continue
                if line.strip():
                    manual_overwrites.append((line.strip(), comment))
                    comment = None
    except FileNotFoundError:
        pass

    auto_overwrites = []
    with multiprocessing.Pool(processes=40) as pool:
        for url in pool.imap_unordered(check, [Url(u) for u in urls.keys()]):
            updateResult = url.checkState
            if updateResult is True and url.protocol == "http":
                # Live http URL with a working https alternative: upgrade it.
                for fpath in urls[url.url]:
                    print(f"updating {fpath}")
                    with open(fpath, 'r') as f:
                        content = f.read()
                    with open(fpath, 'w') as f:
                        # re.escape handles every regex metacharacter in
                        # the URL, not only '.' as replace(".", "\\.") did
                        # (URLs may legally contain e.g. '+').
                        f.write(re.sub(re.escape(url.url), url.secureUrl, content, flags=re.I))
            elif updateResult is not True:
                if updateResult == "checkDns":
                    description = "No DNS response"
                elif updateResult == "checkUrl":
                    description = "URL seems dead"
                else:  # "checkSecureUrl"
                    description = "No https alternative"
                locations = "\n\t".join(urls[url.url])
                print(f"'{description}' '{url.url}' found in:\n\t{locations}")
                if url.protocol == "http":
                    # Escaped form doubles as the regex written to .htignore.
                    auto_overwrites.append((re.escape(url.url), f"Auto: {description}"))

    if not manual_overwrites and not auto_overwrites:
        return

    with open(os.path.join(path, '.htignore'), 'w') as ow:
        if manual_overwrites:
            ow.write("\n".join([formatOverwrite(i) for i in manual_overwrites]))
            ow.write("\n")

        if auto_overwrites:
            ow.write("\n".join([formatOverwrite(i) for i in sorted(auto_overwrites)]))
            ow.write("\n")
281 | | ||||
def formatOverwrite(entry):
    """Render a (pattern, comment) pair as .htignore text: the comment
    line (if any) followed by the pattern."""
    pattern, comment = entry
    if not comment:
        return pattern
    return f"# {comment}\n{pattern}"
287 | | ||||
288 | | ||||
if __name__ == "__main__":
    # Optional first argument selects the project root; default is the
    # current directory.  main() returns nothing, so the old
    # "urls = main(path)" assignment only bound None and is dropped.
    path = sys.argv[1] if len(sys.argv) > 1 else "."
    main(path)