Jump to content

User:DavidBrooks/UndoRelinksModule

From Wikipedia, the free encyclopedia
The printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and please use the default browser print function instead.

This module should "unlink" any redlinks (wikilinks whose target does not exist) in the newly loaded article. It will only change pages in Article space, and will not change any link that points to another namespace. Let me know (via this talk page) if you have any problems or other suggestions.

To use:

  1. Before loading the first article, Tools Menu, click "Make Module"
  2. Paste the code below into the edit window
  3. Make sure "Enabled" is checked
  4. Click "Make Module". There should be a green-background "Module compiled and loaded" message
  5. Click "Close"

If you Save Settings, you won't need to repeat the above steps.

        // Holds the parsed XML of the most recent API response; ProcessArticle reloads it
        // once per batch of titles, so its contents are only valid for the current batch.
        private readonly System.Xml.XmlDocument xmlResult = new System.Xml.XmlDocument();

        // Don't inline these, to save pressure on the cache
        // (NOTE(review): presumably the Regex class's internal pattern cache -- confirm).
        // embeddedLinkRE matches text in which a second "[[" follows the first on the same line,
        // i.e. a link nested inside another link; ProcessArticle skips such links.
        private readonly Regex embeddedLinkRE = new Regex(@".*\[\[.*\[\[");
        // extractLinkRE captures the text after "[[" up to the first '#', '|' or ']';
        // replacing a wrapped link with "$1" therefore leaves only the link target.
        private readonly Regex extractLinkRE = new Regex(@".*\[\[(.*?)[#|\]].*");

        // Module entry point: unlinks redlinks (wikilinks whose target page does not exist).
        //
        // ArticleText   - wikitext of the loaded page.
        // ArticleTitle  - title of the loaded page (not used here).
        // wikiNamespace - namespace number of the page; only 0 (mainspace) is processed.
        // Summary       - set to "Redlink(s) removed" when at least one link was unlinked,
        //                 otherwise left empty.
        // Skip          - always set to false.
        //
        // Returns the wikitext with every redlink replaced by its display text. If an API
        // request fails, the fixes made so far are returned.
        public string ProcessArticle(string ArticleText, string ArticleTitle, int wikiNamespace, out string Summary, out bool Skip)
        {
            Summary = String.Empty;
            Skip = false;

            if (wikiNamespace != 0)
                return ArticleText; // Only edit Mainspace articles

            // Collect candidate link targets:
            //  - links containing ':' (other namespaces/interwiki) are left alone;
            //  - links with another link embedded in them are skipped (probably rare);
            //  - the surviving links are still wrapped in [[...]], so extract the bare target;
            //  - empty targets (e.g. from [[#Section]]) and duplicates are dropped so they
            //    do not waste slots in the API batches.
            string[] allWikiLinks = WikiFunctions.Parse.Parsers.GetAllWikiLinks(ArticleText).
                Where(l => !(l.Contains(':') || embeddedLinkRE.IsMatch(l))).
                Select(l => extractLinkRE.Replace(l, "$1").Trim()).
                Where(t => t.Length > 0).
                Distinct().
                ToArray();

            string newtext = ArticleText;

            // The API has a limit of 50 titles, but to avoid overlong URLs it does no harm to
            // chop the list up. It could also be useful to limit the number of characters, maybe.
            string apiAction = "https://" + awb.LangCode + "." + awb.Project +
                ".org/w/api.php?action=query&prop=pageprops&ppprop=displaytitle&format=xml&titles=";
            const int batchSize = 25;

            StringBuilder titleList = new StringBuilder(apiAction);
            for (int i = 0; i < allWikiLinks.Length; i++) {
                // Percent-encode each title: a raw '&', '+', '?' or '%' in a title would
                // otherwise corrupt the query string and make the API check a different title.
                titleList.Append(System.Uri.EscapeDataString(allWikiLinks[i]));

                bool batchComplete = ((i + 1) % batchSize) == 0 || i == allWikiLinks.Length - 1;
                if (!batchComplete) {
                    titleList.Append('|');
                    continue;
                }

                try {
                    System.Net.HttpWebRequest req =
                        (System.Net.HttpWebRequest)System.Net.WebRequest.Create(titleList.ToString());
                    req.UserAgent = "AWB redlink remover";
                    using (System.Net.WebResponse resp = req.GetResponse()) {
                        xmlResult.Load(resp.GetResponseStream());
                    }
                }
                catch {
                    // Deliberate best effort: on any network or XML failure, return the fixes so far.
                    // Other choices: return the original text, and/or pop up a warning.
                    return newtext;
                }

                foreach (System.Xml.XmlNode titleNode in xmlResult.GetElementsByTagName("page")) {

                    System.Xml.XmlElement pageElement = titleNode as System.Xml.XmlElement;

                    // Something is wrong if the node isn't an XmlElement, but check anyway.
                    if (pageElement == null || !pageElement.HasAttribute("missing"))
                        continue;

                    // GetAttribute doesn't distinguish nonexistent from a value of "",
                    // so an empty title also covers the "something wrong" case.
                    string redTitle = pageElement.GetAttribute("title");
                    if (redTitle.Length == 0)
                        continue;

                    // The title is normalized, so matches need a case-independent first letter.
                    // Both parts are escaped: an unescaped first character such as '(' or '*'
                    // would build a wrong or invalid pattern. Regex.Escape puts a \ before a
                    // space, which is turned into "space or underscore".
                    string firstChar = Regex.Escape(redTitle.Substring(0, 1));
                    string escapeTitle = Regex.Escape(redTitle.Substring(1)).Replace(@"\ ", "[_ ]");
                    Regex matchPattern = new Regex(@"\[\[ *((?i:" + firstChar + ")" + escapeTitle +
                        @") *(?:\|(.+? *))?]]");

                    // Only claim a removal in the summary when the text really changed; a link
                    // with a #section, for instance, is reported missing but is not matched here.
                    string replaced = matchPattern.Replace(newtext, Replacement);
                    if (replaced != newtext) {
                        newtext = replaced;
                        Summary = "Redlink(s) removed";
                    }
                }

                titleList.Length = apiAction.Length;    // truncate back to the bare API prefix
            }
            return newtext;
        }

        // Match evaluator for a matched wikilink: yields the piped display text (group 2)
        // when the link has one, otherwise the link target itself (group 1).
        private string Replacement(Match linkMatch)
        {
            GroupCollection captures = linkMatch.Groups;
            if (captures[2].Success)
                return captures[2].Value;

            return captures[1].Value;
        }