alps/plugins/viewhtml/sanitize.go

238 lines
5.1 KiB
Go

package alpsviewhtml
import (
"bytes"
"fmt"
"net/url"
"regexp"
"strings"
alpsbase "git.sr.ht/~emersion/alps/plugins/base"
"github.com/aymerick/douceur/css"
cssparser "github.com/chris-ramon/douceur/parser"
"github.com/microcosm-cc/bluemonday"
"golang.org/x/net/html"
)
// cssURLRegexp and cssExprRegexp locate url(...) and expression(...)
// functional notations inside a CSS declaration value.
//
// Both patterns are case-insensitive: CSS function names are matched
// ASCII case-insensitively by user agents, so "URL(...)" must be caught
// just like "url(...)".
//
// TODO: this doesn't accommodate for quoting
var (
	cssURLRegexp  = regexp.MustCompile(`(?i)url\([^)]*\)`)
	cssExprRegexp = regexp.MustCompile(`(?i)expression\([^)]*\)`)
)
// allowedStyles is the set of CSS property names that survive sanitization.
// Declarations whose property is not a key of this map are dropped.
var allowedStyles = map[string]bool{
	// Fonts and text.
	"direction":       true,
	"font":            true,
	"font-family":     true,
	"font-style":      true,
	"font-variant":    true,
	"font-size":       true,
	"font-weight":     true,
	"letter-spacing":  true,
	"line-height":     true,
	"text-align":      true,
	"text-decoration": true,
	"text-indent":     true,
	"text-overflow":   true,
	"text-shadow":     true,
	"text-transform":  true,
	"white-space":     true,
	"word-spacing":    true,
	"word-wrap":       true,
	"vertical-align":  true,

	// Colors and backgrounds.
	"color":             true,
	"background":        true,
	"background-color":  true,
	"background-image":  true,
	"background-repeat": true,

	// Box model.
	"border":        true,
	"border-color":  true,
	"border-radius": true,
	"height":        true,
	"margin":        true,
	"padding":       true,
	"width":         true,
	"max-width":     true,
	"min-width":     true,

	// Floats.
	"clear": true,
	"float": true,

	// Tables.
	"border-collapse": true,
	"border-spacing":  true,
	"caption-side":    true,
	"empty-cells":     true,
	"table-layout":    true,

	// Lists.
	"list-style-type":     true,
	"list-style-position": true,
}
// sanitizer carries the state used while sanitizing one HTML body.
type sanitizer struct {
// msg is the message whose parts "cid:" image URLs are looked up in
// (via PartByID). sanitizeImageURL blanks such URLs when msg is nil.
msg *alpsbase.IMAPMessage
// allowRemoteResources must be true for "https" image URLs to be
// rewritten to the proxy; otherwise they are replaced by "about:blank".
allowRemoteResources bool
// hasRemoteResources is set to true by sanitizeImageURL as soon as an
// "https" image URL is seen, whether or not it ends up being allowed.
hasRemoteResources bool
}
// sanitizeImageURL maps the src of an image to a URL that may be loaded,
// returning "about:blank" whenever the image must not be displayed.
//
// "cid" URLs are looked up in san.msg with PartByID and only accepted when
// the part's MIME type starts with "image/". "https" URLs are recorded in
// san.hasRemoteResources and, when both proxyEnabled and
// san.allowRemoteResources are set, rewritten to "/proxy?src=...". Every
// other scheme, as well as unparseable input, is blanked.
func (san *sanitizer) sanitizeImageURL(src string) string {
	const blank = "about:blank"

	parsed, err := url.Parse(src)
	if err != nil {
		return blank
	}

	// TODO: mid support?
	scheme := strings.ToLower(parsed.Scheme)
	if scheme == "cid" {
		if san.msg == nil {
			return blank
		}
		part := san.msg.PartByID(parsed.Opaque)
		if part != nil && strings.HasPrefix(part.MIMEType, "image/") {
			return part.URL(true).String()
		}
		return blank
	}
	if scheme == "https" {
		// Record the remote reference even when it ends up blocked.
		san.hasRemoteResources = true
		if proxyEnabled && san.allowRemoteResources {
			query := url.Values{}
			query.Set("src", parsed.String())
			proxied := url.URL{Path: "/proxy", RawQuery: query.Encode()}
			return proxied.String()
		}
		return blank
	}
	return blank
}
// sanitizeCSSDecls returns the declarations of decls that are safe to keep.
//
// A declaration is dropped when its property is not a key of allowedStyles
// or when its value matches cssExprRegexp. In every kept declaration, each
// match of cssURLRegexp is replaced by "url(about:blank)"; the declarations
// are modified in place and returned in their original order.
func (san *sanitizer) sanitizeCSSDecls(decls []*css.Declaration) []*css.Declaration {
	kept := make([]*css.Declaration, 0, len(decls))
	for _, d := range decls {
		allowed := allowedStyles[d.Property]
		if allowed && !cssExprRegexp.MatchString(d.Value) {
			// TODO: more robust CSS declaration parsing
			d.Value = cssURLRegexp.ReplaceAllString(d.Value, "url(about:blank)")
			kept = append(kept, d)
		}
	}
	return kept
}
// sanitizeCSSRule sanitizes rule in place: an @import at-rule gets its
// prelude replaced, the rule's declarations are filtered through
// sanitizeCSSDecls, and every nested rule is processed recursively.
func (san *sanitizer) sanitizeCSSRule(rule *css.Rule) {
	isImport := rule.Kind == css.AtRule && strings.EqualFold(rule.Name, "@import")
	if isImport {
		// Disallow @import by pointing it at a blank document.
		rule.Prelude = "url(about:blank)"
	}

	rule.Declarations = san.sanitizeCSSDecls(rule.Declarations)

	for i := range rule.Rules {
		san.sanitizeCSSRule(rule.Rules[i])
	}
}
// sanitizeNode sanitizes n and all of its descendants in place.
//
// For element nodes it:
//   - rewrites the src attribute of <img> through sanitizeImageURL;
//   - replaces the children of <style> with a single text node holding the
//     sanitized stylesheet, or an empty one when the CSS fails to parse;
//   - rewrites every style attribute so that it only contains the
//     declarations kept by sanitizeCSSDecls, or empties it when the CSS
//     fails to parse.
func (san *sanitizer) sanitizeNode(n *html.Node) {
	if n.Type == html.ElementNode {
		switch {
		case strings.EqualFold(n.Data, "img"):
			for i := range n.Attr {
				attr := &n.Attr[i]
				if strings.EqualFold(attr.Key, "src") {
					attr.Val = san.sanitizeImageURL(attr.Val)
				}
			}
		case strings.EqualFold(n.Data, "style"):
			// Gather the text content and detach every child; the
			// element is refilled with a single sanitized text node.
			var raw strings.Builder
			for c := n.FirstChild; c != nil; {
				// Grab the sibling first: RemoveChild unlinks c.
				next := c.NextSibling
				if c.Type == html.TextNode {
					raw.WriteString(c.Data)
				}
				n.RemoveChild(c)
				c = next
			}

			// An unparseable stylesheet is replaced by an empty one.
			var sanitized string
			if stylesheet, err := cssparser.Parse(raw.String()); err == nil {
				for _, rule := range stylesheet.Rules {
					san.sanitizeCSSRule(rule)
				}
				sanitized = stylesheet.String()
			}
			n.AppendChild(&html.Node{
				Type: html.TextNode,
				Data: sanitized,
			})
		}

		for i := range n.Attr {
			// Don't use `i, attr := range n.Attr` since `attr` would be a copy
			attr := &n.Attr[i]
			if !strings.EqualFold(attr.Key, "style") {
				continue
			}
			decls, err := cssparser.ParseDeclarations(attr.Val)
			if err != nil {
				attr.Val = ""
				continue
			}
			var val strings.Builder
			for _, d := range san.sanitizeCSSDecls(decls) {
				val.WriteString(d.String())
			}
			attr.Val = val.String()
		}
	}

	for c := n.FirstChild; c != nil; c = c.NextSibling {
		san.sanitizeNode(c)
	}
}
// sanitizeHTML parses b as an HTML document, sanitizes the resulting tree
// with sanitizeNode, renders it back and finally filters the output through
// a bluemonday UGC policy that additionally permits <style> elements and
// style attributes.
//
// An error is returned when the input cannot be parsed or the tree cannot be
// rendered.
func (san *sanitizer) sanitizeHTML(b []byte) ([]byte, error) {
	root, err := html.Parse(bytes.NewReader(b))
	if err != nil {
		return nil, fmt.Errorf("failed to parse HTML: %v", err)
	}

	san.sanitizeNode(root)

	rendered := new(bytes.Buffer)
	err = html.Render(rendered, root)
	if err != nil {
		return nil, fmt.Errorf("failed to render HTML: %v", err)
	}

	// bluemonday must always be run last
	policy := bluemonday.UGCPolicy()
	// TODO: use bluemonday's AllowStyles once it's released and
	// supports <style>
	policy.AllowElements("style")
	policy.AllowAttrs("style").Globally()
	policy.AddTargetBlankToFullyQualifiedLinks(true)
	policy.RequireNoFollowOnLinks(true)
	return policy.SanitizeBytes(rendered.Bytes()), nil
}