纯文本中识别URI地址并转换成HTML

时间：2014-05-13 05:13:42 收藏：0 阅读：656

问题

有一段纯文本text, 欲将其插入DOM节点div中. text中可能有超链接, 邮件地址等. 如果有, 识别之.

分析

如果只是纯文本, 插入div中, 只要将div.innerText设置为text即可.
text中的URI地址可以用正则识别, 并将其替换为<a/>标签组成的字符串. 此时 text变成了HTML字符串html.
HTML字符串html可以赋值给div.innerHTML. 但如果原text中本身含有HTML语义的字符串(例如 <script> 或 <a> 标签)呢? 直接赋值给innerHTML会让浏览器把它们当作标记解析, 而不是原样显示. 因此, 在识别URI之前, 需要先将原text中的HTML特殊字符转义.

解决

uri-recognition.js

/**
 * URI recognition for plain text.
 *
 * Exposes two globals:
 *   - uriRecognition(text): finds http(s)/ftp/ssh/www URIs and mail
 *     addresses in a plain-text string and returns an HTML string in which
 *     they are wrapped in <a> tags and everything else is entity-escaped.
 *   - setContentWithURIRecognition(el, text): writes that result into a
 *     DOM element.
 */
(function () {
    'use strict';

    /*jslint maxlen: 100*/
    var URI_REG = /(?:https?:\/\/|www\.|ssh:\/\/|ftp:\/\/)[a-z0-9&_+\-?\/.=#]+/i,
        MAIL_REG = /[a-z0-9_+\-.]+@[a-z0-9_+\-.]+/i,
        // Characters that must never reach innerHTML unescaped.
        HTML_ENTITIES = {
            '&': '&amp;',
            '<': '&lt;',
            '>': '&gt;',
            '"': '&quot;',
            '\'': '&#39;'
        },
        // Export target: `window` in a browser, the global object elsewhere.
        root = typeof window !== 'undefined' ? window : globalThis,

        /**
         * Strips leading and trailing whitespace.
         * @param {*} s value to trim; null, undefined and '' yield ''
         * @returns {String}
         */
        trim = function (s) {
            /*jslint eqeq:true*/
            if (s == null || s === '') {
                return '';
            }
            // \s      whitespace
            // \t      tab
            // \xA0    non-breaking space
            // \u3000  ideographic (full-width CJK) space
            // The last three are listed explicitly for engines whose \s
            // does not cover them.
            return String(s).replace(/^[\s\t\xA0\u3000]+/, '').
                replace(/[\s\t\xA0\u3000]+$/, '');
        },

        /**
         * @param {*} s
         * @param {String} sub
         * @returns {Boolean} true when String(s) begins with sub
         */
        startsWith = function (s, sub) {
            return String(s).indexOf(sub) === 0;
        },

        /**
         * Escapes &, <, >, " and ' so the result is inert in both HTML text
         * and double-quoted attribute values.
         * @param {*} s
         * @returns {String}
         */
        escapeHtml = function (s) {
            return String(s).replace(/[&<>"']/g, function (ch) {
                return HTML_ENTITIES[ch];
            });
        },

        /**
         * Builds the anchor markup for one recognised token.
         * @param {String} match the matched URI or mail address
         * @param {Boolean} isMail true for a mail address
         * @returns {String} HTML of the form <a href="...">...</a>
         */
        buildLink = function (match, isMail) {
            var href;
            if (isMail) {
                href = 'mailto:' + match;
            } else if (startsWith(match.toLowerCase(), 'www.')) {
                // A bare "www." host needs a scheme to be a usable href.
                href = 'http://' + match;
            } else {
                href = match;
            }
            return '<a href="' + escapeHtml(href) + '">' +
                escapeHtml(match) + '</a>';
        },

        /**
         * Converts plain text into HTML with URIs and mail addresses linked.
         *
         * The raw text is tokenised in one pass. Matching happens before
         * escaping so that entities such as &quot; cannot be swallowed by
         * the URI pattern, and each segment is escaped on its own so that
         * no markup from the input survives.
         *
         * @param {*} text input; it is stringified and trimmed
         * @returns {{content: String, isPlainText: Boolean}}
         *     isPlainText true: content is the trimmed text, unescaped,
         *     to be assigned as text.
         *     isPlainText false: content is HTML safe to assign to innerHTML.
         */
        uriRecognition = function (text) {
            // Created per call: a /g regex carries lastIndex state.
            // Group 1 = URI, group 2 = mail address.
            var tokenReg = new RegExp(
                    '(' + URI_REG.source + ')|(' + MAIL_REG.source + ')',
                    'gi'
                ),
                html = '',
                last = 0,
                found = false,
                m;

            text = trim(String(text));

            m = tokenReg.exec(text);
            while (m !== null) {
                found = true;
                html += escapeHtml(text.slice(last, m.index));
                html += buildLink(m[0], m[2] !== undefined);
                last = m.index + m[0].length;
                m = tokenReg.exec(text);
            }

            if (!found) {
                return {
                    content: text,
                    isPlainText: true
                };
            }

            html += escapeHtml(text.slice(last));
            return {
                content: html,
                isPlainText: false
            };
        },

        /**
         * Writes text into el, linking any URIs or mail addresses.
         * Plain results are assigned as text (innerText when the element
         * has it, otherwise textContent); linked results go to innerHTML.
         * @param {Element} el target element
         * @param {*} text plain text to display
         */
        setContentWithURIRecognition = function (el, text) {
            var result = uriRecognition(text);
            /*jslint eqeq: true*/
            if (result.isPlainText) {
                if (el.innerText != null) {
                    el.innerText = result.content;
                } else if (el.textContent != null) {
                    el.textContent = result.content;
                }
            } else {
                el.innerHTML = result.content;
            }
        };

    root.uriRecognition = uriRecognition;
    root.setContentWithURIRecognition = setContentWithURIRecognition;

})();

test.html

<!DOCTYPE HTML>
<html>
    <head>
        <meta http-equiv="content-type" content="text/html; charset=utf-8">
        <title>uri regcognition</title>
    </head>
    <body>
        <script src="./uri-recognition.js" type="text/javascript"></script>
        <script type="text/javascript">
            // Demo input: plain text that itself contains anchor markup plus
            // a mail address. The markup should be displayed literally while
            // the URL and the address inside it become links.
            var text = '<a href="http://china.haiwainet.cn/n/2014/0509/c232587-20619235.html" ' +
                    'mon="ct=1&a=2&c=top&pn=8" target="_blank">' +
                    '纽约时报：阿里巴巴IPO将风险推向全新水平</a>' +
                    ' send to example@example.com xxxx',
                div = document.createElement('div');

            window.setContentWithURIRecognition(div, text);
            document.body.appendChild(div);
        </script>
    </body>
</html>

Chrome下测试OK.

纯文本中识别URI地址并转换成HTML,布布扣,bubuko.com