Changeset View
Changeset View
Standalone View
Standalone View
src/lib/3rdparty/markdown-tokens.cpp
- This file was added.
1 | | ||||
---|---|---|---|---|---|
2 | /* | ||||
3 | Copyright (c) 2009 by Chad Nelson | ||||
4 | Released under the MIT License. | ||||
5 | See the provided LICENSE.TXT file for details. | ||||
6 | */ | ||||
7 | | ||||
8 | #include "markdown-tokens.h" | ||||
9 | | ||||
10 | #include <stack> | ||||
11 | | ||||
12 | #include <boost/lexical_cast.hpp> | ||||
13 | #include <boost/regex.hpp> | ||||
14 | #include <boost/unordered_set.hpp> | ||||
15 | | ||||
16 | using std::cerr; | ||||
17 | using std::endl; | ||||
18 | | ||||
19 | namespace markdown { | ||||
20 | namespace token { | ||||
21 | | ||||
22 | namespace { | ||||
23 | | ||||
// The characters that may be backslash-escaped. A character's position in
// this string is the index that isEscapedCharacter() reports and that
// escapedCharacter() maps back to the character.
const std::string cEscapedCharacters = "\\`*_{}[]()#+-.!>";
25 | | ||||
26 | optional<size_t> isEscapedCharacter(char c) { | ||||
27 | std::string::const_iterator i=std::find(cEscapedCharacters.begin(), | ||||
28 | cEscapedCharacters.end(), c); | ||||
29 | if (i!=cEscapedCharacters.end()) | ||||
30 | return std::distance(cEscapedCharacters.begin(), i); | ||||
31 | else return none; | ||||
32 | } | ||||
33 | | ||||
34 | char escapedCharacter(size_t index) { | ||||
35 | return cEscapedCharacters[index]; | ||||
36 | } | ||||
37 | | ||||
38 | std::string encodeString(const std::string& src, int encodingFlags) { | ||||
39 | bool amps=(encodingFlags & cAmps)!=0, | ||||
40 | doubleAmps=(encodingFlags & cDoubleAmps)!=0, | ||||
41 | angleBrackets=(encodingFlags & cAngles)!=0, | ||||
42 | quotes=(encodingFlags & cQuotes)!=0; | ||||
43 | | ||||
44 | std::string tgt; | ||||
45 | for (std::string::const_iterator i=src.begin(), ie=src.end(); i!=ie; ++i) { | ||||
46 | if (*i=='&' && amps) { | ||||
47 | static const boost::regex cIgnore("^(&)|(&#[0-9]{1,3};)|(&#[xX][0-9a-fA-F]{1,2};)"); | ||||
48 | if (boost::regex_search(i, ie, cIgnore)) { | ||||
49 | tgt.push_back(*i); | ||||
50 | } else { | ||||
51 | tgt+="&"; | ||||
52 | } | ||||
53 | } | ||||
54 | else if (*i=='&' && doubleAmps) tgt+="&"; | ||||
55 | else if (*i=='<' && angleBrackets) tgt+="<"; | ||||
56 | else if (*i=='>' && angleBrackets) tgt+=">"; | ||||
57 | else if (*i=='\"' && quotes) tgt+="""; | ||||
58 | else tgt.push_back(*i); | ||||
59 | } | ||||
60 | return tgt; | ||||
61 | } | ||||
62 | | ||||
// Returns true when `str` begins with one of a fixed set of URL-like
// prefixes (a scheme such as "http://", or a bare "www." / "ftp." host
// prefix). Only the prefix is examined; the rest of the string is ignored.
bool looksLikeUrl(const std::string& str) {
    static const char *const cPrefixes[]={ "http://", "https://", "ftp://",
        "ftps://", "file://", "www.", "ftp." };
    const size_t prefixCount=sizeof(cPrefixes)/sizeof(cPrefixes[0]);

    for (size_t n=0; n<prefixCount; ++n) {
        const std::string prefix(cPrefixes[n]);
        // compare() clamps the length to what `str` has, so a string shorter
        // than the prefix simply compares unequal.
        if (str.compare(0, prefix.size(), prefix)==0) return true;
    }
    return false;
}
73 | | ||||
// Predicate for std::find_if: true when `c` may NOT appear in the name part
// (before the '@') of an e-mail address. Allowed characters are
// alphanumerics and ". _ % - +".
bool notValidNameCharacter(char c) {
    // The <cctype> classifiers are only defined for values representable as
    // unsigned char (or EOF). Plain char may be signed, so bytes >= 0x80 must
    // be converted first to avoid undefined behaviour.
    const unsigned char uc=static_cast<unsigned char>(c);
    return !(isalnum(uc) || c=='.' || c=='_' || c=='%' || c=='-' || c=='+');
}
77 | | ||||
// Predicate for std::find_if: true when `c` may NOT appear in the site part
// (after the '@') of an e-mail address.
bool notValidSiteCharacter(char c) {
    // NOTE: Kludge alert! The official spec for site characters is only
    // "a-zA-Z._%-". However, MDTest supports "international domain names,"
    // which use characters other than that; I'm kind of cheating here, handling
    // those by allowing all utf8-encoded characters too.
    //
    // The high-bit test is done first and isalnum() is given an unsigned char:
    // passing a negative plain char to a <cctype> classifier is undefined
    // behaviour, and those are exactly the bytes the kludge is meant to admit.
    if (c & 0x80) return false;
    return !(isalnum(static_cast<unsigned char>(c)) || c=='.' || c=='_'
        || c=='%' || c=='-');
}
85 | | ||||
// Predicate for std::find_if: true when `c` is not an alphabetic character.
bool isNotAlpha(char c) {
    // isalpha() requires a value representable as unsigned char; a plain char
    // holding a byte >= 0x80 may be negative, which would be undefined.
    return !isalpha(static_cast<unsigned char>(c));
}
89 | | ||||
// Rewrites `src` as a sequence of HTML numeric character references,
// alternating between decimal ("&#97;") and hexadecimal ("&#x62;") form from
// one character to the next. Bytes with the high bit set are copied through
// unchanged, but still take part in the alternation.
std::string emailEncode(const std::string& src) {
    std::ostringstream encoded;
    for (std::string::size_type pos=0; pos<src.size(); ++pos) {
        const char ch=src[pos];
        // Even positions are written in decimal, odd positions in hex.
        const bool useHex=(pos%2)!=0;

        if (ch & 0x80) {
            encoded << ch;
            continue;
        }
        if (useHex) encoded << "&#x" << std::hex << static_cast<int>(ch) << ';';
        else encoded << "&#" << std::dec << static_cast<int>(ch) << ';';
    }
    return encoded.str();
}
106 | | ||||
107 | bool looksLikeEmailAddress(const std::string& str) { | ||||
108 | typedef std::string::const_iterator Iter; | ||||
109 | typedef std::string::const_reverse_iterator RIter; | ||||
110 | Iter i=std::find_if(str.begin(), str.end(), notValidNameCharacter); | ||||
111 | if (i!=str.end() && *i=='@' && i!=str.begin()) { | ||||
112 | // The name part is valid. | ||||
113 | i=std::find_if(i+1, str.end(), notValidSiteCharacter); | ||||
114 | if (i==str.end()) { | ||||
115 | // The site part doesn't contain any invalid characters. | ||||
116 | RIter ri=std::find_if(str.rbegin(), str.rend(), isNotAlpha); | ||||
117 | if (ri!=str.rend() && *ri=='.') { | ||||
118 | // It ends with a dot and only alphabetic characters. | ||||
119 | size_t d=std::distance(ri.base(), str.end()); | ||||
120 | if (d>=2 && d<=4) { | ||||
121 | // There are two-to-four of them. It's valid. | ||||
122 | return true; | ||||
123 | } | ||||
124 | } | ||||
125 | } | ||||
126 | } | ||||
127 | return false; | ||||
128 | } | ||||
129 | | ||||
130 | // From <http://en.wikipedia.org/wiki/HTML_element> | ||||
131 | | ||||
// Null-terminated list used to fill the `otherTags` set. A trailing '/' on an
// entry means the tag can have a closing tag; initTag() strips that marker
// before storing the name.
const char *cOtherTagInit[]={
    // Header tags
    "title/", "base", "link", "basefont",
    "script/", "style/", "object/", "meta",

    // Inline tags
    "em/", "strong/", "q/", "cite/", "dfn/",
    "abbr/", "acronym/", "code/", "samp/", "kbd/",
    "var/", "sub/", "sup/", "del/", "ins/",
    "isindex", "a/", "img", "br", "map/",
    "area", "object/", "param", "applet/", "span/",

    0 };
144 | | ||||
// Null-terminated list used to fill the `blockTags` set. As in cOtherTagInit,
// a trailing '/' means the tag can have a closing tag and is stripped by
// initTag() before the name is stored.
const char *cBlockTagInit[]={
    "p/", "blockquote/", "hr",
    "h1/", "h2/", "h3/", "h4/", "h5/", "h6/",
    "dl/", "dt/", "dd/", "ol/", "ul/", "li/", "dir/", "menu/",
    "table/", "tr/", "th/", "td/", "col", "colgroup/", "caption/",
    "thead/", "tbody/", "tfoot/",
    "form/", "select/", "option", "input", "label/", "textarea/",
    "div/", "pre/", "address/",
    "iframe/", "frame/", "frameset/", "noframes/",
    "center/", "b/", "i/", "big/", "small/", /*"s/",*/ "strike/", "tt/",
    "u/", "font/", "ins/", "del/",
    0 };
153 | | ||||
154 | // Other official ones (not presently in use in this code) | ||||
155 | //"!doctype", "bdo", "body", "button", "fieldset", "head", "html", | ||||
156 | //"legend", "noscript", "optgroup", "xmp", | ||||
157 | | ||||
158 | boost::unordered_set<std::string> otherTags, blockTags; | ||||
159 | | ||||
160 | void initTag(boost::unordered_set<std::string> &set, const char *init[]) { | ||||
161 | for (size_t x=0; init[x]!=0; ++x) { | ||||
162 | std::string str=init[x]; | ||||
163 | if (*str.rbegin()=='/') { | ||||
164 | // Means it can have a closing tag | ||||
165 | str=str.substr(0, str.length()-1); | ||||
166 | } | ||||
167 | set.insert(str); | ||||
168 | } | ||||
169 | } | ||||
170 | | ||||
// Returns `ref` with every run of consecutive space characters collapsed to
// a single space. Only ' ' is affected; leading and trailing spaces are kept
// (as one space each) and other whitespace is left alone.
std::string cleanTextLinkRef(const std::string& ref) {
    std::string cleaned;
    bool previousWasSpace=false;
    for (std::string::size_type pos=0; pos<ref.size(); ++pos) {
        const char ch=ref[pos];
        const bool isSpace=(ch==' ');
        // Drop a space only when it directly follows another space.
        if (!(isSpace && previousWasSpace)) cleaned.push_back(ch);
        previousWasSpace=isSpace;
    }
    return cleaned;
}
182 | | ||||
183 | } // namespace | ||||
184 | | ||||
185 | | ||||
186 | | ||||
187 | size_t isValidTag(const std::string& tag, bool nonBlockFirst) { | ||||
188 | if (blockTags.empty()) { | ||||
189 | initTag(otherTags, cOtherTagInit); | ||||
190 | initTag(blockTags, cBlockTagInit); | ||||
191 | } | ||||
192 | | ||||
193 | if (nonBlockFirst) { | ||||
194 | if (otherTags.find(tag)!=otherTags.end()) return 1; | ||||
195 | if (blockTags.find(tag)!=blockTags.end()) return 2; | ||||
196 | } else { | ||||
197 | if (blockTags.find(tag)!=blockTags.end()) return 2; | ||||
198 | if (otherTags.find(tag)!=otherTags.end()) return 1; | ||||
199 | } | ||||
200 | return 0; | ||||
201 | } | ||||
202 | | ||||
203 | | ||||
204 | | ||||
205 | void TextHolder::writeAsHtml(std::ostream& out) const { | ||||
206 | preWrite(out); | ||||
207 | if (mEncodingFlags!=0) { | ||||
208 | out << encodeString(mText, mEncodingFlags); | ||||
209 | } else { | ||||
210 | out << mText; | ||||
211 | } | ||||
212 | postWrite(out); | ||||
213 | } | ||||
214 | | ||||
215 | optional<TokenGroup> RawText::processSpanElements(const LinkIds& idTable) { | ||||
216 | if (!canContainMarkup()) return none; | ||||
217 | | ||||
218 | ReplacementTable replacements; | ||||
219 | std::string str=_processHtmlTagAttributes(*text(), replacements); | ||||
220 | str=_processCodeSpans(str, replacements); | ||||
221 | str=_processEscapedCharacters(str); | ||||
222 | str=_processLinksImagesAndTags(str, replacements, idTable); | ||||
223 | return _processBoldAndItalicSpans(str, replacements); | ||||
224 | } | ||||
225 | | ||||
226 | std::string RawText::_processHtmlTagAttributes(std::string src, ReplacementTable& | ||||
227 | replacements) | ||||
228 | { | ||||
229 | // Because "Attribute Content Is Not A Code Span" | ||||
230 | std::string tgt; | ||||
231 | std::string::const_iterator prev=src.begin(), end=src.end(); | ||||
232 | while (1) { | ||||
233 | static const boost::regex cHtmlToken("<((/?)([a-zA-Z0-9]+)(?:( +[a-zA-Z0-9]+?(?: ?= ?(\"|').*?\\5))+? */? *))>"); | ||||
234 | boost::smatch m; | ||||
235 | if (boost::regex_search(prev, end, m, cHtmlToken)) { | ||||
236 | // NOTE: Kludge alert! The `isValidTag` test is a cheat, only here | ||||
237 | // to handle some edge cases between the Markdown test suite and the | ||||
238 | // PHP-Markdown one, which seem to conflict. | ||||
239 | if (isValidTag(m[3])) { | ||||
240 | tgt+=std::string(prev, m[0].first); | ||||
241 | | ||||
242 | std::string fulltag=m[0], tgttag; | ||||
243 | std::string::const_iterator prevtag=fulltag.begin(), endtag=fulltag.end(); | ||||
244 | while (1) { | ||||
245 | static const boost::regex cAttributeStrings("= ?(\"|').*?\\1"); | ||||
246 | boost::smatch mtag; | ||||
247 | if (boost::regex_search(prevtag, endtag, mtag, cAttributeStrings)) { | ||||
248 | tgttag+=std::string(prevtag, mtag[0].first); | ||||
249 | tgttag+="\x01@"+boost::lexical_cast<std::string>(replacements.size())+"@htmlTagAttr\x01"; | ||||
250 | prevtag=mtag[0].second; | ||||
251 | | ||||
252 | replacements.push_back(TokenPtr(new TextHolder(std::string(mtag[0]), false, cAmps|cAngles))); | ||||
253 | } else { | ||||
254 | tgttag+=std::string(prevtag, endtag); | ||||
255 | break; | ||||
256 | } | ||||
257 | } | ||||
258 | tgt+=tgttag; | ||||
259 | prev=m[0].second; | ||||
260 | } else { | ||||
261 | tgt+=std::string(prev, m[0].second); | ||||
262 | prev=m[0].second; | ||||
263 | } | ||||
264 | } else { | ||||
265 | tgt+=std::string(prev, end); | ||||
266 | break; | ||||
267 | } | ||||
268 | } | ||||
269 | | ||||
270 | return tgt; | ||||
271 | } | ||||
272 | | ||||
273 | std::string RawText::_processCodeSpans(std::string src, ReplacementTable& | ||||
274 | replacements) | ||||
275 | { | ||||
276 | static const boost::regex cCodeSpan[2]={ | ||||
277 | boost::regex("(?:^|(?<=[^\\\\]))`` (.+?) ``"), | ||||
278 | boost::regex("(?:^|(?<=[^\\\\]))`(.+?)`") | ||||
279 | }; | ||||
280 | for (int pass=0; pass<2; ++pass) { | ||||
281 | std::string tgt; | ||||
282 | std::string::const_iterator prev=src.begin(), end=src.end(); | ||||
283 | while (1) { | ||||
284 | boost::smatch m; | ||||
285 | if (boost::regex_search(prev, end, m, cCodeSpan[pass])) { | ||||
286 | tgt+=std::string(prev, m[0].first); | ||||
287 | tgt+="\x01@"+boost::lexical_cast<std::string>(replacements.size())+"@codeSpan\x01"; | ||||
288 | prev=m[0].second; | ||||
289 | replacements.push_back(TokenPtr(new CodeSpan(_restoreProcessedItems(m[1], replacements)))); | ||||
290 | } else { | ||||
291 | tgt+=std::string(prev, end); | ||||
292 | break; | ||||
293 | } | ||||
294 | } | ||||
295 | src.swap(tgt); | ||||
296 | tgt.clear(); | ||||
297 | } | ||||
298 | return src; | ||||
299 | } | ||||
300 | | ||||
301 | std::string RawText::_processEscapedCharacters(const std::string& src) { | ||||
302 | std::string tgt; | ||||
303 | std::string::const_iterator prev=src.begin(), end=src.end(); | ||||
304 | while (1) { | ||||
305 | std::string::const_iterator i=std::find(prev, end, '\\'); | ||||
306 | if (i!=end) { | ||||
307 | tgt+=std::string(prev, i); | ||||
308 | ++i; | ||||
309 | if (i!=end) { | ||||
310 | optional<size_t> e=isEscapedCharacter(*i); | ||||
311 | if (e) tgt+="\x01@#"+boost::lexical_cast<std::string>(*e)+"@escaped\x01"; | ||||
312 | else tgt=tgt+'\\'+*i; | ||||
313 | prev=i+1; | ||||
314 | } else { | ||||
315 | tgt+='\\'; | ||||
316 | break; | ||||
317 | } | ||||
318 | } else { | ||||
319 | tgt+=std::string(prev, end); | ||||
320 | break; | ||||
321 | } | ||||
322 | } | ||||
323 | return tgt; | ||||
324 | } | ||||
325 | | ||||
326 | std::string RawText::_processSpaceBracketedGroupings(const std::string &src, | ||||
327 | ReplacementTable& replacements) | ||||
328 | { | ||||
329 | static const boost::regex cRemove("(?:(?: \\*+ )|(?: _+ ))"); | ||||
330 | | ||||
331 | std::string tgt; | ||||
332 | std::string::const_iterator prev=src.begin(), end=src.end(); | ||||
333 | while (1) { | ||||
334 | boost::smatch m; | ||||
335 | if (boost::regex_search(prev, end, m, cRemove)) { | ||||
336 | tgt+=std::string(prev, m[0].first); | ||||
337 | tgt+="\x01@"+boost::lexical_cast<std::string>(replacements.size())+"@spaceBracketed\x01"; | ||||
338 | replacements.push_back(TokenPtr(new RawText(m[0]))); | ||||
339 | prev=m[0].second; | ||||
340 | } else { | ||||
341 | tgt+=std::string(prev, end); | ||||
342 | break; | ||||
343 | } | ||||
344 | } | ||||
345 | return tgt; | ||||
346 | } | ||||
347 | | ||||
348 | std::string RawText::_processLinksImagesAndTags(const std::string &src, | ||||
349 | ReplacementTable& replacements, const LinkIds& idTable) | ||||
350 | { | ||||
351 | // NOTE: Kludge alert! The "inline link or image" regex should be... | ||||
352 | // | ||||
353 | // "(?:(!?)\\[(.+?)\\] *\\((.*?)\\))" | ||||
354 | // | ||||
355 | // ...but that fails on the 'Images' test because it includes a "stupid URL" | ||||
356 | // that has parentheses within it. The proper way to deal with this would be | ||||
357 | // to match any nested parentheses, but regular expressions can't handle an | ||||
358 | // unknown number of nested items, so I'm cheating -- the regex for it | ||||
359 | // allows for one (and *only* one) pair of matched parentheses within the | ||||
360 | // URL. It makes the regex hard to follow (it was even harder to get right), | ||||
361 | // but it allows it to pass the test. | ||||
362 | // | ||||
363 | // The "reference link or image" one has a similar problem; it should be... | ||||
364 | // | ||||
365 | // "|(?:(!?)\\[(.+?)\\](?: *\\[(.*?)\\])?)" | ||||
366 | // | ||||
367 | static const boost::regex cExpression( | ||||
368 | "(?:(!?)\\[([^\\]]+?)\\] *\\(([^\\(]*(?:\\(.*?\\).*?)*?)\\))" // Inline link or image | ||||
369 | "|(?:(!?)\\[((?:[^]]*?\\[.*?\\].*?)|(?:.+?))\\](?: *\\[(.*?)\\])?)" // Reference link or image | ||||
370 | "|(?:<(/?([a-zA-Z0-9]+).*?)>)" // potential HTML tag or auto-link | ||||
371 | ); | ||||
372 | // Important captures: 1/4=image indicator, 2/5=contents/alttext, | ||||
373 | // 3=URL/title, 6=optional link ID, 7=potential HTML tag or auto-link | ||||
374 | // contents, 8=actual tag from 7. | ||||
375 | | ||||
376 | std::string tgt; | ||||
377 | std::string::const_iterator prev=src.begin(), end=src.end(); | ||||
378 | while (1) { | ||||
379 | boost::smatch m; | ||||
380 | if (boost::regex_search(prev, end, m, cExpression)) { | ||||
381 | assert(m[0].matched); | ||||
382 | assert(m[0].length()!=0); | ||||
383 | | ||||
384 | tgt+=std::string(prev, m[0].first); | ||||
385 | tgt+="\x01@"+boost::lexical_cast<std::string>(replacements.size())+"@links&Images1\x01"; | ||||
386 | prev=m[0].second; | ||||
387 | | ||||
388 | bool isImage=false, isLink=false, isReference=false; | ||||
389 | if (m[4].matched && m[4].length()) isImage=isReference=true; | ||||
390 | else if (m[1].matched && m[1].length()) isImage=true; | ||||
391 | else if (m[5].matched) isLink=isReference=true; | ||||
392 | else if (m[2].matched) isLink=true; | ||||
393 | | ||||
394 | if (isImage || isLink) { | ||||
395 | std::string contentsOrAlttext, url, title; | ||||
396 | bool resolved=false; | ||||
397 | if (isReference) { | ||||
398 | contentsOrAlttext=m[5]; | ||||
399 | std::string linkId=(m[6].matched ? std::string(m[6]) : std::string()); | ||||
400 | if (linkId.empty()) linkId=cleanTextLinkRef(contentsOrAlttext); | ||||
401 | | ||||
402 | optional<markdown::LinkIds::Target> target=idTable.find(linkId); | ||||
403 | if (target) { url=target->url; title=target->title; resolved=true; }; | ||||
404 | } else { | ||||
405 | static const boost::regex cReference("^<?([^ >]*)>?(?: *(?:('|\")(.*)\\2)|(?:\\((.*)\\)))? *$"); | ||||
406 | // Useful captures: 1=url, 3/4=title | ||||
407 | contentsOrAlttext=m[2]; | ||||
408 | std::string urlAndTitle=m[3]; | ||||
409 | boost::smatch mm; | ||||
410 | if (boost::regex_match(urlAndTitle, mm, cReference)) { | ||||
411 | url=mm[1]; | ||||
412 | if (mm[3].matched) title=mm[3]; | ||||
413 | else if (mm[4].matched) title=mm[4]; | ||||
414 | resolved=true; | ||||
415 | } | ||||
416 | } | ||||
417 | | ||||
418 | if (!resolved) { | ||||
419 | // Just encode the first character as-is, and continue | ||||
420 | // searching after it. | ||||
421 | prev=m[0].first+1; | ||||
422 | replacements.push_back(TokenPtr(new RawText(std::string(m[0].first, prev)))); | ||||
423 | } else if (isImage) { | ||||
424 | replacements.push_back(TokenPtr(new Image(contentsOrAlttext, | ||||
425 | url, title))); | ||||
426 | } else { | ||||
427 | replacements.push_back(TokenPtr(new HtmlAnchorTag(url, title))); | ||||
428 | tgt+=contentsOrAlttext; | ||||
429 | tgt+="\x01@"+boost::lexical_cast<std::string>(replacements.size())+"@links&Images2\x01"; | ||||
430 | replacements.push_back(TokenPtr(new HtmlTag("/a"))); | ||||
431 | } | ||||
432 | } else { | ||||
433 | // Otherwise it's an HTML tag or auto-link. | ||||
434 | std::string contents=m[7]; | ||||
435 | | ||||
436 | // cerr << "Evaluating potential HTML or auto-link: " << contents << endl; | ||||
437 | // cerr << "m[8]=" << m[8] << endl; | ||||
438 | | ||||
439 | if (looksLikeUrl(contents)) { | ||||
440 | TokenGroup subgroup; | ||||
441 | subgroup.push_back(TokenPtr(new HtmlAnchorTag(contents))); | ||||
442 | subgroup.push_back(TokenPtr(new RawText(contents, false))); | ||||
443 | subgroup.push_back(TokenPtr(new HtmlTag("/a"))); | ||||
444 | replacements.push_back(TokenPtr(new Container(subgroup))); | ||||
445 | } else if (looksLikeEmailAddress(contents)) { | ||||
446 | TokenGroup subgroup; | ||||
447 | subgroup.push_back(TokenPtr(new HtmlAnchorTag(emailEncode("mailto:"+contents)))); | ||||
448 | subgroup.push_back(TokenPtr(new RawText(emailEncode(contents), false))); | ||||
449 | subgroup.push_back(TokenPtr(new HtmlTag("/a"))); | ||||
450 | replacements.push_back(TokenPtr(new Container(subgroup))); | ||||
451 | } else if (isValidTag(m[8])) { | ||||
452 | replacements.push_back(TokenPtr(new HtmlTag(_restoreProcessedItems(contents, replacements)))); | ||||
453 | } else { | ||||
454 | // Just encode it as-is | ||||
455 | replacements.push_back(TokenPtr(new RawText(m[0]))); | ||||
456 | } | ||||
457 | } | ||||
458 | } else { | ||||
459 | tgt+=std::string(prev, end); | ||||
460 | break; | ||||
461 | } | ||||
462 | } | ||||
463 | return tgt; | ||||
464 | } | ||||
465 | | ||||
466 | TokenGroup RawText::_processBoldAndItalicSpans(const std::string& src, | ||||
467 | ReplacementTable& replacements) | ||||
468 | { | ||||
469 | static const boost::regex cEmphasisExpression( | ||||
470 | "(?:(?<![*_])([*_]{1,3})([^*_ ]+?)\\1(?![*_]))" // Mid-word emphasis | ||||
471 | "|((?:(?<!\\*)\\*{1,3}(?!\\*)|(?<!_)_{1,3}(?!_))(?=.)(?! )(?![.,:;] )(?![.,:;]$))" // Open | ||||
472 | "|((?<![* ])\\*{1,3}(?!\\*)|(?<![ _])_{1,3}(?!_))" // Close | ||||
473 | ); | ||||
474 | | ||||
475 | TokenGroup tgt; | ||||
476 | std::string::const_iterator i=src.begin(), end=src.end(), prev=i; | ||||
477 | | ||||
478 | while (1) { | ||||
479 | boost::smatch m; | ||||
480 | if (boost::regex_search(prev, end, m, cEmphasisExpression)) { | ||||
481 | if (prev!=m[0].first) tgt.push_back(TokenPtr(new | ||||
482 | RawText(std::string(prev, m[0].first)))); | ||||
483 | if (m[3].matched) { | ||||
484 | std::string token=m[3]; | ||||
485 | tgt.push_back(TokenPtr(new BoldOrItalicMarker(true, token[0], | ||||
486 | token.length()))); | ||||
487 | prev=m[0].second; | ||||
488 | } else if (m[4].matched) { | ||||
489 | std::string token=m[4]; | ||||
490 | tgt.push_back(TokenPtr(new BoldOrItalicMarker(false, token[0], | ||||
491 | token.length()))); | ||||
492 | prev=m[0].second; | ||||
493 | } else { | ||||
494 | std::string token=m[1], contents=m[2]; | ||||
495 | tgt.push_back(TokenPtr(new BoldOrItalicMarker(true, token[0], | ||||
496 | token.length()))); | ||||
497 | tgt.push_back(TokenPtr(new RawText(std::string(contents)))); | ||||
498 | tgt.push_back(TokenPtr(new BoldOrItalicMarker(false, token[0], | ||||
499 | token.length()))); | ||||
500 | prev=m[0].second; | ||||
501 | } | ||||
502 | } else { | ||||
503 | if (prev!=end) tgt.push_back(TokenPtr(new RawText(std::string(prev, | ||||
504 | end)))); | ||||
505 | break; | ||||
506 | } | ||||
507 | } | ||||
508 | | ||||
509 | int id=0; | ||||
510 | for (TokenGroup::iterator ii=tgt.begin(), iie=tgt.end(); ii!=iie; ++ii) { | ||||
511 | if ((*ii)->isUnmatchedOpenMarker()) { | ||||
512 | BoldOrItalicMarker *openToken=dynamic_cast<BoldOrItalicMarker*>(ii->get()); | ||||
513 | | ||||
514 | // Find a matching close-marker, if it's there | ||||
515 | TokenGroup::iterator iii=ii; | ||||
516 | for (++iii; iii!=iie; ++iii) { | ||||
517 | if ((*iii)->isUnmatchedCloseMarker()) { | ||||
518 | BoldOrItalicMarker *closeToken=dynamic_cast<BoldOrItalicMarker*>(iii->get()); | ||||
519 | if (closeToken->size()==3 && openToken->size()!=3) { | ||||
520 | // Split the close-token into a match for the open-token | ||||
521 | // and a second for the leftovers. | ||||
522 | closeToken->disable(); | ||||
523 | TokenGroup g; | ||||
524 | g.push_back(TokenPtr(new BoldOrItalicMarker(false, | ||||
525 | closeToken->tokenCharacter(), closeToken->size()- | ||||
526 | openToken->size()))); | ||||
527 | g.push_back(TokenPtr(new BoldOrItalicMarker(false, | ||||
528 | closeToken->tokenCharacter(), openToken->size()))); | ||||
529 | TokenGroup::iterator after=iii; | ||||
530 | ++after; | ||||
531 | tgt.splice(after, g); | ||||
532 | continue; | ||||
533 | } | ||||
534 | | ||||
535 | if (closeToken->tokenCharacter()==openToken->tokenCharacter() | ||||
536 | && closeToken->size()==openToken->size()) | ||||
537 | { | ||||
538 | openToken->matched(closeToken, id); | ||||
539 | closeToken->matched(openToken, id); | ||||
540 | ++id; | ||||
541 | break; | ||||
542 | } else if (openToken->size()==3) { | ||||
543 | // Split the open-token into a match for the close-token | ||||
544 | // and a second for the leftovers. | ||||
545 | openToken->disable(); | ||||
546 | TokenGroup g; | ||||
547 | g.push_back(TokenPtr(new BoldOrItalicMarker(true, | ||||
548 | openToken->tokenCharacter(), openToken->size()- | ||||
549 | closeToken->size()))); | ||||
550 | g.push_back(TokenPtr(new BoldOrItalicMarker(true, | ||||
551 | openToken->tokenCharacter(), closeToken->size()))); | ||||
552 | TokenGroup::iterator after=ii; | ||||
553 | ++after; | ||||
554 | tgt.splice(after, g); | ||||
555 | break; | ||||
556 | } | ||||
557 | } | ||||
558 | } | ||||
559 | } | ||||
560 | } | ||||
561 | | ||||
562 | // "Unmatch" invalidly-nested matches. | ||||
563 | std::stack<BoldOrItalicMarker*> openMatches; | ||||
564 | for (TokenGroup::iterator ii=tgt.begin(), iie=tgt.end(); ii!=iie; ++ii) { | ||||
565 | if ((*ii)->isMatchedOpenMarker()) { | ||||
566 | BoldOrItalicMarker *open=dynamic_cast<BoldOrItalicMarker*>(ii->get()); | ||||
567 | openMatches.push(open); | ||||
568 | } else if ((*ii)->isMatchedCloseMarker()) { | ||||
569 | BoldOrItalicMarker *close=dynamic_cast<BoldOrItalicMarker*>(ii->get()); | ||||
570 | | ||||
571 | if (close->id() != openMatches.top()->id()) { | ||||
572 | close->matchedTo()->matched(0); | ||||
573 | close->matched(0); | ||||
574 | } else { | ||||
575 | openMatches.pop(); | ||||
576 | while (!openMatches.empty() && openMatches.top()->matchedTo()==0) | ||||
577 | openMatches.pop(); | ||||
578 | } | ||||
579 | } | ||||
580 | } | ||||
581 | | ||||
582 | TokenGroup r; | ||||
583 | for (TokenGroup::iterator ii=tgt.begin(), iie=tgt.end(); ii!=iie; ++ii) { | ||||
584 | if ((*ii)->text() && (*ii)->canContainMarkup()) { | ||||
585 | TokenGroup t=_encodeProcessedItems(*(*ii)->text(), replacements); | ||||
586 | r.splice(r.end(), t); | ||||
587 | } else r.push_back(*ii); | ||||
588 | } | ||||
589 | | ||||
590 | return r; | ||||
591 | } | ||||
592 | | ||||
593 | TokenGroup RawText::_encodeProcessedItems(const std::string &src, | ||||
594 | ReplacementTable& replacements) | ||||
595 | { | ||||
596 | static const boost::regex cReplaced("\x01@(#?[0-9]*)@.+?\x01"); | ||||
597 | | ||||
598 | TokenGroup r; | ||||
599 | std::string::const_iterator prev=src.begin(); | ||||
600 | while (1) { | ||||
601 | boost::smatch m; | ||||
602 | if (boost::regex_search(prev, src.end(), m, cReplaced)) { | ||||
603 | std::string pre=std::string(prev, m[0].first); | ||||
604 | if (!pre.empty()) r.push_back(TokenPtr(new RawText(pre))); | ||||
605 | prev=m[0].second; | ||||
606 | | ||||
607 | std::string ref=m[1]; | ||||
608 | if (ref[0]=='#') { | ||||
609 | size_t n=boost::lexical_cast<size_t>(ref.substr(1)); | ||||
610 | r.push_back(TokenPtr(new EscapedCharacter(escapedCharacter(n)))); | ||||
611 | } else if (!ref.empty()) { | ||||
612 | size_t n=boost::lexical_cast<size_t>(ref); | ||||
613 | | ||||
614 | assert(n<replacements.size()); | ||||
615 | r.push_back(replacements[n]); | ||||
616 | } // Otherwise just eat it | ||||
617 | } else { | ||||
618 | std::string pre=std::string(prev, src.end()); | ||||
619 | if (!pre.empty()) r.push_back(TokenPtr(new RawText(pre))); | ||||
620 | break; | ||||
621 | } | ||||
622 | } | ||||
623 | return r; | ||||
624 | } | ||||
625 | | ||||
626 | std::string RawText::_restoreProcessedItems(const std::string &src, | ||||
627 | ReplacementTable& replacements) | ||||
628 | { | ||||
629 | static const boost::regex cReplaced("\x01@(#?[0-9]*)@.+?\x01"); | ||||
630 | | ||||
631 | std::ostringstream r; | ||||
632 | std::string::const_iterator prev=src.begin(); | ||||
633 | while (1) { | ||||
634 | boost::smatch m; | ||||
635 | if (boost::regex_search(prev, src.end(), m, cReplaced)) { | ||||
636 | std::string pre=std::string(prev, m[0].first); | ||||
637 | if (!pre.empty()) r << pre; | ||||
638 | prev=m[0].second; | ||||
639 | | ||||
640 | std::string ref=m[1]; | ||||
641 | if (ref[0]=='#') { | ||||
642 | size_t n=boost::lexical_cast<size_t>(ref.substr(1)); | ||||
643 | r << '\\' << escapedCharacter(n); | ||||
644 | } else if (!ref.empty()) { | ||||
645 | size_t n=boost::lexical_cast<size_t>(ref); | ||||
646 | | ||||
647 | assert(n<replacements.size()); | ||||
648 | replacements[n]->writeAsOriginal(r); | ||||
649 | } // Otherwise just eat it | ||||
650 | } else { | ||||
651 | std::string pre=std::string(prev, src.end()); | ||||
652 | if (!pre.empty()) r << pre; | ||||
653 | break; | ||||
654 | } | ||||
655 | } | ||||
656 | return r.str(); | ||||
657 | } | ||||
658 | | ||||
659 | HtmlAnchorTag::HtmlAnchorTag(const std::string& url, const std::string& title): | ||||
660 | TextHolder("<a href=\""+encodeString(url, cQuotes|cAmps)+"\"" | ||||
661 | +(title.empty() ? std::string() : " title=\""+encodeString(title, cQuotes|cAmps)+"\"") | ||||
662 | +">", false, 0) | ||||
663 | { | ||||
664 | // This space deliberately blank. ;-) | ||||
665 | } | ||||
666 | | ||||
667 | void CodeBlock::writeAsHtml(std::ostream& out) const { | ||||
668 | out << "<pre><code>"; | ||||
669 | TextHolder::writeAsHtml(out); | ||||
670 | out << "</code></pre>\n\n"; | ||||
671 | } | ||||
672 | | ||||
673 | void CodeSpan::writeAsHtml(std::ostream& out) const { | ||||
674 | out << "<code>"; | ||||
675 | TextHolder::writeAsHtml(out); | ||||
676 | out << "</code>"; | ||||
677 | } | ||||
678 | | ||||
679 | void CodeSpan::writeAsOriginal(std::ostream& out) const { | ||||
680 | out << '`' << *text() << '`'; | ||||
681 | } | ||||
682 | | ||||
683 | | ||||
684 | | ||||
685 | void Container::writeAsHtml(std::ostream& out) const { | ||||
686 | preWrite(out); | ||||
687 | for (CTokenGroupIter i=mSubTokens.begin(), ie=mSubTokens.end(); i!=ie; ++i) | ||||
688 | (*i)->writeAsHtml(out); | ||||
689 | postWrite(out); | ||||
690 | } | ||||
691 | | ||||
692 | void Container::writeToken(size_t indent, std::ostream& out) const { | ||||
693 | out << std::string(indent*2, ' ') << containerName() << endl; | ||||
694 | for (CTokenGroupIter ii=mSubTokens.begin(), iie=mSubTokens.end(); ii!=iie; | ||||
695 | ++ii) | ||||
696 | (*ii)->writeToken(indent+1, out); | ||||
697 | } | ||||
698 | | ||||
699 | optional<TokenGroup> Container::processSpanElements(const LinkIds& idTable) { | ||||
700 | TokenGroup t; | ||||
701 | for (CTokenGroupIter ii=mSubTokens.begin(), iie=mSubTokens.end(); ii!=iie; | ||||
702 | ++ii) | ||||
703 | { | ||||
704 | if ((*ii)->text()) { | ||||
705 | optional<TokenGroup> subt=(*ii)->processSpanElements(idTable); | ||||
706 | if (subt) { | ||||
707 | if (subt->size()>1) t.push_back(TokenPtr(new Container(*subt))); | ||||
708 | else if (!subt->empty()) t.push_back(*subt->begin()); | ||||
709 | } else t.push_back(*ii); | ||||
710 | } else { | ||||
711 | optional<TokenGroup> subt=(*ii)->processSpanElements(idTable); | ||||
712 | if (subt) { | ||||
713 | const Container *c=dynamic_cast<const Container*>((*ii).get()); | ||||
714 | assert(c!=0); | ||||
715 | t.push_back(c->clone(*subt)); | ||||
716 | } else t.push_back(*ii); | ||||
717 | } | ||||
718 | } | ||||
719 | swapSubtokens(t); | ||||
720 | return none; | ||||
721 | } | ||||
722 | | ||||
723 | UnorderedList::UnorderedList(const TokenGroup& contents, bool paragraphMode) { | ||||
724 | if (paragraphMode) { | ||||
725 | // Change each of the text items into paragraphs | ||||
726 | for (CTokenGroupIter i=contents.begin(), ie=contents.end(); i!=ie; ++i) { | ||||
727 | token::ListItem *item=dynamic_cast<token::ListItem*>((*i).get()); | ||||
728 | assert(item!=0); | ||||
729 | item->inhibitParagraphs(false); | ||||
730 | mSubTokens.push_back(*i); | ||||
731 | } | ||||
732 | } else mSubTokens=contents; | ||||
733 | } | ||||
734 | | ||||
735 | | ||||
736 | | ||||
737 | void BoldOrItalicMarker::writeAsHtml(std::ostream& out) const { | ||||
738 | if (!mDisabled) { | ||||
739 | if (mMatch!=0) { | ||||
740 | assert(mSize>=1 && mSize<=3); | ||||
741 | if (mOpenMarker) { | ||||
742 | out << (mSize==1 ? "<em>" : mSize==2 ? "<strong>" : "<strong><em>"); | ||||
743 | } else { | ||||
744 | out << (mSize==1 ? "</em>" : mSize==2 ? "</strong>" : "</em></strong>"); | ||||
745 | } | ||||
746 | } else out << std::string(mSize, mTokenCharacter); | ||||
747 | } | ||||
748 | } | ||||
749 | | ||||
750 | void BoldOrItalicMarker::writeToken(std::ostream& out) const { | ||||
751 | if (!mDisabled) { | ||||
752 | if (mMatch!=0) { | ||||
753 | std::string type=(mSize==1 ? "italic" : mSize==2 ? "bold" : "italic&bold"); | ||||
754 | if (mOpenMarker) { | ||||
755 | out << "Matched open-" << type << " marker" << endl; | ||||
756 | } else { | ||||
757 | out << "Matched close-" << type << " marker" << endl; | ||||
758 | } | ||||
759 | } else { | ||||
760 | if (mOpenMarker) out << "Unmatched bold/italic open marker: " << | ||||
761 | std::string(mSize, mTokenCharacter) << endl; | ||||
762 | else out << "Unmatched bold/italic close marker: " << | ||||
763 | std::string(mSize, mTokenCharacter) << endl; | ||||
764 | } | ||||
765 | } | ||||
766 | } | ||||
767 | | ||||
768 | void Image::writeAsHtml(std::ostream& out) const { | ||||
769 | out << "<img src=\"" << mUrl << "\" alt=\"" << mAltText << "\""; | ||||
770 | if (!mTitle.empty()) out << " title=\"" << mTitle << "\""; | ||||
771 | out << "/>"; | ||||
772 | } | ||||
773 | | ||||
774 | } // namespace token | ||||
775 | } // namespace markdown |