source: code/trunk/morty.go@ 10

Last change on this file since 10 was 10, checked in by asciimoo, 9 years ago

[enh] disallow crawling bots

File size: 13.6 KB
Line 
1package main
2
3import (
4 "bytes"
5 "crypto/hmac"
6 "crypto/sha256"
7 "encoding/hex"
8 "errors"
9 "flag"
10 "fmt"
11 "io"
12 "log"
13 "net/url"
14 "path"
15 "regexp"
16 "strings"
17 "time"
18
19 "github.com/valyala/fasthttp"
20 "golang.org/x/net/html"
21 "golang.org/x/text/encoding/charmap"
22)
23
// Text-handling states tracked by sanitizeHTML while it walks the token
// stream: plain markup, inside a <style> element, or inside a <noscript>
// element.
const (
	STATE_DEFAULT int = iota
	STATE_IN_STYLE
	STATE_IN_NOSCRIPT
)
29
30var CLIENT *fasthttp.Client = &fasthttp.Client{
31 MaxResponseBodySize: 10 * 1024 * 1024, // 10M
32}
33
// CSS_URL_REGEXP finds URL references in CSS: `url(...)` and `@import ...`.
// Submatch 1 is the keyword, 2 the optional opening quote, 3 the URL itself,
// and 4 the optional closing quote.
var CSS_URL_REGEXP = regexp.MustCompile("(url\\(|@import +)(['\"]?)([\u0009\u0021\u0023-\u0026\u0028\u002a-\u007E]+)(['\"]?)\\)?")
35
// UNSAFE_ELEMENTS lists the tag names that sanitizeHTML drops together with
// everything nested inside them.
var UNSAFE_ELEMENTS = [][]byte{
	[]byte("applet"),
	[]byte("canvas"),
	[]byte("embed"),
	//[]byte("iframe"),
	[]byte("script"),
}
43
// SAFE_ATTRIBUTES lists the attribute names that sanitizeAttr passes through
// without rewriting their value.
var SAFE_ATTRIBUTES = [][]byte{
	[]byte("abbr"),
	[]byte("accesskey"),
	[]byte("align"),
	[]byte("alt"),
	[]byte("autocomplete"),
	[]byte("charset"),
	[]byte("checked"),
	[]byte("class"),
	[]byte("content"),
	[]byte("contenteditable"),
	[]byte("contextmenu"),
	[]byte("dir"),
	[]byte("for"),
	[]byte("height"),
	[]byte("hidden"),
	[]byte("id"),
	[]byte("lang"),
	[]byte("media"),
	[]byte("method"),
	[]byte("name"),
	[]byte("nowrap"),
	[]byte("placeholder"),
	[]byte("property"),
	[]byte("rel"),
	[]byte("spellcheck"),
	[]byte("tabindex"),
	[]byte("target"),
	[]byte("title"),
	[]byte("translate"),
	[]byte("type"),
	[]byte("value"),
	[]byte("width"),
}
78
// SELF_CLOSING_ELEMENTS lists the tag names that never have an end tag, so
// sanitizeHTML must not wait for one after dropping such an element.
var SELF_CLOSING_ELEMENTS = [][]byte{
	[]byte("area"),
	[]byte("base"),
	[]byte("br"),
	[]byte("col"),
	[]byte("embed"),
	[]byte("hr"),
	[]byte("img"),
	[]byte("input"),
	[]byte("keygen"),
	[]byte("link"),
	[]byte("meta"),
	[]byte("param"),
	[]byte("source"),
	[]byte("track"),
	[]byte("wbr"),
}
96
// Proxy holds the settings shared by all requests served by RequestHandler.
type Proxy struct {
	// Key is the HMAC key used to validate "mortyhash"; nil disables validation.
	Key []byte
	// RequestTimeout bounds each upstream request.
	RequestTimeout time.Duration
}
101
// RequestConfig carries the per-request data the sanitizers need.
// Field order matters: values are built with positional literals.
type RequestConfig struct {
	// Key is the HMAC key used to sign proxified URLs; nil means unsigned.
	Key []byte
	// baseURL is the URL of the document being sanitized; relative links are
	// merged against it.
	baseURL *url.URL
}
106
// HTML_FORM_EXTENSION is the format string for the hidden inputs appended to
// every proxified <form>: the first %s is the target URL, the second its hash.
var HTML_FORM_EXTENSION = `<input type="hidden" name="mortyurl" value="%s" />` +
	`<input type="hidden" name="mortyhash" value="%s" />`
108
// HTML_BODY_EXTENSION is the format string for the banner injected before
// </body>; its single %s is the URL of the original page. Literal percent
// signs in the CSS are doubled because the string goes through fmt.
var HTML_BODY_EXTENSION = `
<div id="mortyheader">
 <input type="checkbox" id="mortytoggle" autocomplete="off" />
 <div><p>This is a proxified and sanitized view of the page,<br />visit <a href="%s">original site</a>.</p><div><p><label for="mortytoggle">hide</label></p></div></div>
</div>
<style>
#mortyheader { position: fixed; top: 15%%; left: 0; max-width: 10em; color: #444; overflow: hidden; z-index: 110000; font-size: 0.9em; padding: 1em 1em 1em 0; margin: 0; }
#mortyheader a { color: #3498db; }
#mortyheader p { padding: 0; margin: 0; }
#mortyheader > div { padding: 8px; font-size: 0.9em; border-width: 4px 4px 4px 0; border-style: solid; border-color: #1abc9c; background: #FFF; line-height: 1em; }
#mortyheader label { text-align: right; cursor: pointer; display: block; color: #444; padding: 0; margin: 0; }
input[type=checkbox]#mortytoggle { display: none; }
input[type=checkbox]#mortytoggle:checked ~ div { display: none; }
</style>
`
124
125func (p *Proxy) RequestHandler(ctx *fasthttp.RequestCtx) {
126
127 if appRequestHandler(ctx) {
128 return
129 }
130
131 requestHash := popRequestParam(ctx, []byte("mortyhash"))
132
133 requestURI := popRequestParam(ctx, []byte("mortyurl"))
134
135 if requestURI == nil {
136 p.breakOnError(ctx, errors.New(`missing "mortyurl" URL parameter`))
137 return
138 }
139
140 if p.Key != nil {
141 if !verifyRequestURI(requestURI, requestHash, p.Key) {
142 p.breakOnError(ctx, errors.New("invalid hash"))
143 return
144 }
145 }
146
147 parsedURI, err := url.Parse(string(requestURI))
148
149 if p.breakOnError(ctx, err) {
150 return
151 }
152
153 req := fasthttp.AcquireRequest()
154 defer fasthttp.ReleaseRequest(req)
155
156 reqQuery := parsedURI.Query()
157 ctx.QueryArgs().VisitAll(func(key, value []byte) {
158 reqQuery.Add(string(key), string(value))
159 })
160
161 parsedURI.RawQuery = reqQuery.Encode()
162
163 uriStr := parsedURI.String()
164
165 log.Println("getting", uriStr)
166
167 req.SetRequestURI(uriStr)
168 req.Header.SetUserAgentBytes([]byte("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36"))
169
170 resp := fasthttp.AcquireResponse()
171 defer fasthttp.ReleaseResponse(resp)
172
173 req.Header.SetMethodBytes(ctx.Method())
174 if ctx.IsPost() || ctx.IsPut() {
175 req.SetBody(ctx.PostBody())
176 }
177
178 if p.breakOnError(ctx, CLIENT.DoTimeout(req, resp, p.RequestTimeout)) {
179 return
180 }
181
182 if resp.StatusCode() != 200 {
183 switch resp.StatusCode() {
184 case 301, 302, 303, 307, 308:
185 loc := resp.Header.Peek("Location")
186 if loc != nil {
187 url, err := proxifyURI(&RequestConfig{p.Key, parsedURI}, string(loc))
188 if err == nil {
189 ctx.SetStatusCode(resp.StatusCode())
190 ctx.Response.Header.Add("Location", url)
191 log.Println("redirect to", string(loc))
192 return
193 }
194 }
195 }
196 log.Println("invalid request:", resp.StatusCode())
197 return
198 }
199
200 contentType := resp.Header.Peek("Content-Type")
201
202 if contentType == nil {
203 p.breakOnError(ctx, errors.New("invalid content type"))
204 return
205 }
206
207 contentInfo := bytes.SplitN(contentType, []byte(";"), 2)
208
209 var responseBody []byte
210
211 if len(contentInfo) == 2 && bytes.Contains(contentInfo[1], []byte("ISO-8859-2")) && bytes.Contains(contentInfo[0], []byte("text")) {
212 var err error
213 responseBody, err = charmap.ISO8859_2.NewDecoder().Bytes(resp.Body())
214 if p.breakOnError(ctx, err) {
215 return
216 }
217 } else {
218 responseBody = resp.Body()
219 }
220
221 ctx.SetContentType(fmt.Sprintf("%s; charset=UTF-8", contentInfo[0]))
222
223 switch {
224 case bytes.Contains(contentType, []byte("css")):
225 sanitizeCSS(&RequestConfig{p.Key, parsedURI}, ctx, responseBody)
226 case bytes.Contains(contentType, []byte("html")):
227 sanitizeHTML(&RequestConfig{p.Key, parsedURI}, ctx, responseBody)
228 default:
229 ctx.Write(responseBody)
230 }
231}
232
233func appRequestHandler(ctx *fasthttp.RequestCtx) bool {
234 if bytes.Equal(ctx.Path(), []byte("/robots.txt")) {
235 ctx.SetContentType("text/plain")
236 ctx.Write([]byte("User-Agent: *\nDisallow: /\n"))
237 return true
238 }
239 return false
240}
241
242func popRequestParam(ctx *fasthttp.RequestCtx, paramName []byte) []byte {
243 param := ctx.QueryArgs().PeekBytes(paramName)
244
245 if param == nil {
246 param = ctx.PostArgs().PeekBytes(paramName)
247 if param != nil {
248 ctx.PostArgs().DelBytes(paramName)
249 }
250 } else {
251 ctx.QueryArgs().DelBytes(paramName)
252 }
253
254 return param
255}
256
257func sanitizeCSS(rc *RequestConfig, out io.Writer, css []byte) {
258 // TODO
259
260 urlSlices := CSS_URL_REGEXP.FindAllSubmatchIndex(css, -1)
261
262 if urlSlices == nil {
263 out.Write(css)
264 return
265 }
266
267 startIndex := 0
268
269 for _, s := range urlSlices {
270 urlStart := s[6]
271 urlEnd := s[7]
272
273 if uri, err := proxifyURI(rc, string(css[urlStart:urlEnd])); err == nil {
274 out.Write(css[startIndex:urlStart])
275 out.Write([]byte(uri))
276 startIndex = urlEnd
277 } else {
278 log.Println("cannot proxify css uri:", css[urlStart:urlEnd])
279 }
280 }
281 if startIndex < len(css) {
282 out.Write(css[startIndex:len(css)])
283 }
284}
285
286func sanitizeHTML(rc *RequestConfig, out io.Writer, htmlDoc []byte) {
287 r := bytes.NewReader(htmlDoc)
288 decoder := html.NewTokenizer(r)
289 decoder.AllowCDATA(true)
290
291 unsafeElements := make([][]byte, 0, 8)
292 state := STATE_DEFAULT
293
294 for {
295 token := decoder.Next()
296 if token == html.ErrorToken {
297 err := decoder.Err()
298 if err != io.EOF {
299 log.Println("failed to parse HTML:")
300 }
301 break
302 }
303
304 if len(unsafeElements) == 0 {
305
306 switch token {
307 case html.StartTagToken, html.SelfClosingTagToken:
308 tag, hasAttrs := decoder.TagName()
309 safe := !inArray(tag, UNSAFE_ELEMENTS)
310 if !safe {
311 if !inArray(tag, SELF_CLOSING_ELEMENTS) {
312 var unsafeTag []byte = make([]byte, len(tag))
313 copy(unsafeTag, tag)
314 unsafeElements = append(unsafeElements, unsafeTag)
315 }
316 break
317 }
318 if bytes.Equal(tag, []byte("noscript")) {
319 state = STATE_IN_NOSCRIPT
320 break
321 }
322 var attrs [][][]byte
323 fmt.Fprintf(out, "<%s", tag)
324 if hasAttrs {
325 for {
326 attrName, attrValue, moreAttr := decoder.TagAttr()
327 attrs = append(attrs, [][]byte{attrName, attrValue})
328 if !moreAttr {
329 break
330 }
331 }
332 if bytes.Equal(tag, []byte("meta")) {
333 sanitizeMetaAttrs(rc, out, attrs)
334 } else {
335 sanitizeAttrs(rc, out, attrs)
336 }
337 }
338 if token == html.SelfClosingTagToken {
339 fmt.Fprintf(out, " />")
340 } else {
341 fmt.Fprintf(out, ">")
342 if bytes.Equal(tag, []byte("style")) {
343 state = STATE_IN_STYLE
344 }
345 }
346 if bytes.Equal(tag, []byte("form")) {
347 var formURL *url.URL
348 for _, attr := range attrs {
349 if bytes.Equal(attr[0], []byte("action")) {
350 formURL, _ = url.Parse(string(attr[1]))
351 mergeURIs(rc.baseURL, formURL)
352 break
353 }
354 }
355 if formURL == nil {
356 formURL = rc.baseURL
357 }
358 urlStr := formURL.String()
359 var key string
360 if rc.Key != nil {
361 key = hash(urlStr, rc.Key)
362 }
363 fmt.Fprintf(out, HTML_FORM_EXTENSION, urlStr, key)
364
365 }
366
367 case html.EndTagToken:
368 tag, _ := decoder.TagName()
369 writeEndTag := true
370 switch string(tag) {
371 case "body":
372 fmt.Fprintf(out, HTML_BODY_EXTENSION, rc.baseURL.String())
373 case "style":
374 state = STATE_DEFAULT
375 case "noscript":
376 state = STATE_DEFAULT
377 writeEndTag = false
378 }
379 // skip noscript tags - only the tag, not the content, because javascript is sanitized
380 if writeEndTag {
381 fmt.Fprintf(out, "</%s>", tag)
382 }
383
384 case html.TextToken:
385 switch state {
386 case STATE_DEFAULT:
387 fmt.Fprintf(out, "%s", decoder.Raw())
388 case STATE_IN_STYLE:
389 sanitizeCSS(rc, out, decoder.Raw())
390 case STATE_IN_NOSCRIPT:
391 sanitizeHTML(rc, out, decoder.Raw())
392 }
393
394 case html.DoctypeToken, html.CommentToken:
395 out.Write(decoder.Raw())
396 }
397 } else {
398 switch token {
399 case html.StartTagToken:
400 tag, _ := decoder.TagName()
401 if inArray(tag, UNSAFE_ELEMENTS) {
402 unsafeElements = append(unsafeElements, tag)
403 }
404
405 case html.EndTagToken:
406 tag, _ := decoder.TagName()
407 if bytes.Equal(unsafeElements[len(unsafeElements)-1], tag) {
408 unsafeElements = unsafeElements[:len(unsafeElements)-1]
409 }
410 }
411 }
412 }
413}
414
415func sanitizeMetaAttrs(rc *RequestConfig, out io.Writer, attrs [][][]byte) {
416 var http_equiv []byte
417 var content []byte
418
419 for _, attr := range attrs {
420 attrName := attr[0]
421 attrValue := attr[1]
422 if bytes.Equal(attrName, []byte("http-equiv")) {
423 http_equiv = bytes.ToLower(attrValue)
424 }
425 if bytes.Equal(attrName, []byte("content")) {
426 content = attrValue
427 }
428 }
429
430 if bytes.Equal(http_equiv, []byte("refresh")) && bytes.Index(content, []byte(";url=")) != -1 {
431 parts := bytes.SplitN(content, []byte(";url="), 2)
432 if uri, err := proxifyURI(rc, string(parts[1])); err == nil {
433 fmt.Fprintf(out, ` http-equiv="refresh" content="%s;%s"`, parts[0], uri)
434 }
435 } else {
436 sanitizeAttrs(rc, out, attrs)
437 }
438
439}
440
441func sanitizeAttrs(rc *RequestConfig, out io.Writer, attrs [][][]byte) {
442 for _, attr := range attrs {
443 sanitizeAttr(rc, out, attr[0], attr[1])
444 }
445}
446
447func sanitizeAttr(rc *RequestConfig, out io.Writer, attrName, attrValue []byte) {
448 if inArray(attrName, SAFE_ATTRIBUTES) {
449 fmt.Fprintf(out, " %s=\"%s\"", attrName, attrValue)
450 return
451 }
452 switch string(attrName) {
453 case "src", "href", "action":
454 if uri, err := proxifyURI(rc, string(attrValue)); err == nil {
455 fmt.Fprintf(out, " %s=\"%s\"", attrName, uri)
456 } else {
457 log.Println("cannot proxify uri:", attrValue)
458 }
459 case "style":
460 fmt.Fprintf(out, " %s=\"", attrName)
461 sanitizeCSS(rc, out, attrValue)
462 out.Write([]byte("\""))
463 }
464}
465
// mergeURIs resolves the reference u2 against the base URL u1, modifying u2
// in place.
//
//   - Opaque references (mailto:, javascript:, ...) are left untouched.
//   - A missing scheme is taken from u1.
//   - A reference with its own host keeps its path.
//   - A host-less reference inherits u1's host, even when u1 has no path.
//   - An empty path means "the base document": u1's path is used, and its
//     query too when u2 has none.
//   - A relative path is joined onto the directory of u1's path, keeping a
//     trailing slash.
//
// Nil arguments are ignored.
func mergeURIs(u1, u2 *url.URL) {
	if u1 == nil || u2 == nil || u2.Opaque != "" {
		return
	}
	if u2.Scheme == "" {
		u2.Scheme = u1.Scheme
	}
	if u2.Host != "" {
		return
	}
	u2.Host = u1.Host

	if u2.Path == "" {
		u2.Path = u1.Path
		u2.RawPath = u1.RawPath
		if u2.RawQuery == "" {
			u2.RawQuery = u1.RawQuery
		}
		return
	}
	if u2.Path[0] == '/' {
		return
	}

	// Directory of the base document; "/" when the base path has no
	// directory part (empty path, "/file", or a path without any slash).
	baseDir := "/"
	if i := strings.LastIndexByte(u1.Path, '/'); i > 0 {
		baseDir = u1.Path[:i]
	}
	merged := path.Join(baseDir, u2.Path)
	// path.Join cleans the result and drops a trailing slash; put it back.
	if strings.HasSuffix(u2.Path, "/") && !strings.HasSuffix(merged, "/") {
		merged += "/"
	}
	u2.Path = merged
}
477
478func proxifyURI(rc *RequestConfig, uri string) (string, error) {
479 // TODO check malicious data: - e.g. data:script
480 if strings.HasPrefix(uri, "data:") {
481 return uri, nil
482 }
483
484 if len(uri) > 0 && uri[0] == '#' {
485 return uri, nil
486 }
487
488 u, err := url.Parse(uri)
489 if err != nil {
490 return "", err
491 }
492 mergeURIs(rc.baseURL, u)
493
494 uri = u.String()
495
496 if rc.Key == nil {
497 return fmt.Sprintf("./?mortyurl=%s", url.QueryEscape(uri)), nil
498 }
499 return fmt.Sprintf("./?mortyhash=%s&mortyurl=%s", hash(uri, rc.Key), url.QueryEscape(uri)), nil
500}
501
// inArray reports whether a contains an element whose bytes equal b.
func inArray(b []byte, a [][]byte) bool {
	for i := 0; i < len(a); i++ {
		if !bytes.Equal(a[i], b) {
			continue
		}
		return true
	}
	return false
}
510
// hash returns the hex-encoded HMAC-SHA256 of msg under key.
func hash(msg string, key []byte) string {
	signer := hmac.New(sha256.New, key)
	signer.Write([]byte(msg))
	digest := signer.Sum(nil)
	return hex.EncodeToString(digest)
}
516
// verifyRequestURI reports whether hashMsg is the hex-encoded HMAC-SHA256 of
// uri under key. A hashMsg that is not valid hex is logged and rejected.
func verifyRequestURI(uri, hashMsg, key []byte) bool {
	expected := make([]byte, hex.DecodedLen(len(hashMsg)))
	if _, err := hex.Decode(expected, hashMsg); err != nil {
		log.Println("hmac error:", err)
		return false
	}

	signer := hmac.New(sha256.New, key)
	signer.Write(uri)
	actual := signer.Sum(nil)

	// hmac.Equal compares without leaking timing information.
	return hmac.Equal(expected, actual)
}
528
529func (p *Proxy) breakOnError(ctx *fasthttp.RequestCtx, err error) bool {
530 if err == nil {
531 return false
532 }
533 log.Println("error:", err)
534 ctx.SetStatusCode(404)
535 ctx.SetContentType("text/html")
536 ctx.Write([]byte(`<!doctype html>
537<head>
538<title>MortyError</title>
539</head>
540<body><h2>Error!</h2>`))
541 ctx.Write([]byte("<h3>"))
542 ctx.Write([]byte(html.EscapeString(err.Error())))
543 ctx.Write([]byte("</h3>"))
544 if p.Key == nil {
545 ctx.Write([]byte(`
546<form action="post">
547 Visit url: <input placeholder="https://url.." name="mortyurl" />
548 <input type="submit" value="go" />
549</form>`))
550 }
551 ctx.Write([]byte(`
552</body>
553</html>`))
554 return true
555}
556
557func main() {
558
559 listen := flag.String("listen", "127.0.0.1:3000", "Listen address")
560 key := flag.String("key", "", "HMAC url validation key (hexadecimal encoded) - leave blank to disable")
561 requestTimeout := flag.Uint("timeout", 2, "Request timeout")
562 flag.Parse()
563
564 p := &Proxy{RequestTimeout: time.Duration(*requestTimeout) * time.Second}
565
566 if *key != "" {
567 p.Key = []byte(*key)
568 }
569
570 log.Println("listening on", *listen)
571
572 if err := fasthttp.ListenAndServe(*listen, p.RequestHandler); err != nil {
573 log.Fatal("Error in ListenAndServe:", err)
574 }
575}
Note: See TracBrowser for help on using the repository browser.