// Patterns used by HtmlStrip, compiled once at package initialization so an
// invalid pattern fails loudly at startup instead of yielding a nil *Regexp
// at call time.
var (
	// <style ...> ... </style> blocks, including their contents (any case).
	htmlStripStyleRe = regexp.MustCompile(`(?i)<style[\S\s]+?</style>`)
	// <script ...> ... </script> blocks, including their contents (any case).
	htmlStripScriptRe = regexp.MustCompile(`(?i)<script[\S\s]+?</script>`)
	// Any remaining <...> tag; lazy so each tag is matched on its own.
	htmlStripTagRe = regexp.MustCompile(`<[\S\s]+?>`)
	// Runs of ASCII whitespace (Go's \s is [\t\n\f\r ]).
	htmlStripSpaceRe = regexp.MustCompile(`\s+`)
)

// HtmlStrip removes HTML markup from src and returns the remaining text.
//
// Processing order:
//  1. <style>...</style> and <script>...</script> blocks are dropped together
//     with their contents (tag names are matched case-insensitively).
//  2. Every remaining <...> tag is removed.
//  3. All ASCII whitespace is removed. Note that this deletes whitespace
//     entirely rather than collapsing it, so words separated by spaces are
//     joined together.
//  4. Non-breaking-space entities ("&nbsp;", or a bare "nbsp;" whose ampersand
//     was already lost) and literal U+00A0 characters are removed.
//
// Numeric character references such as "&#12345;" are left untouched.
// The result is finally passed through strings.TrimSpace.
func HtmlStrip(src string) string {
	src = htmlStripStyleRe.ReplaceAllString(src, "")
	src = htmlStripScriptRe.ReplaceAllString(src, "")
	src = htmlStripTagRe.ReplaceAllString(src, "")
	src = htmlStripSpaceRe.ReplaceAllString(src, "")

	// "& nbsp;" needs no separate case: the whitespace pass above has already
	// turned it into "&nbsp;".
	src = strings.ReplaceAll(src, "&nbsp;", "")
	src = strings.ReplaceAll(src, "nbsp;", "")
	// A literal non-breaking space is not matched by \s, so strip it explicitly.
	src = strings.ReplaceAll(src, "\u00a0", "")

	return strings.TrimSpace(src)
}