Navigates a MediaWiki-based site through its standard `api.php` endpoint, provided the site's API is publicly accessible.

MediaWikiApi.cs 5.8KB

    //
    // MediaWikiApi.cs
    //
    // Author:
    //       Bryan Allred <bryan.allred@gmail.com>
    //
    // Copyright (c) 2012 Bryan Allred
    //
    // Permission is hereby granted, free of charge, to any person obtaining a copy
    // of this software and associated documentation files (the "Software"), to deal
    // in the Software without restriction, including without limitation the rights
    // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    // copies of the Software, and to permit persons to whom the Software is
    // furnished to do so, subject to the following conditions:
    //
    // The above copyright notice and this permission notice shall be included in
    // all copies or substantial portions of the Software.
    //
    // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    // THE SOFTWARE.
    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Threading;
    using System.Xml.Linq;

    namespace MediaWikiCrawl
    {
        /// <summary>
        /// Thin client for a MediaWiki <c>api.php</c> endpoint. Every call issues an
        /// <c>action=query</c> request with <c>format=xml</c> through <c>Webbie.Get</c>
        /// and reads the result with LINQ to XML.
        /// </summary>
        public class MediaWikiApi
        {
            /// <summary>
            /// Path segment appended to the base address when it does not already contain it.
            /// </summary>
            private const string ApiPath = "/api.php";

            /// <summary>
            /// Seed for the pause between paged requests. It is replaced by the number of
            /// seconds of the most recent pause, so the sequence of delays is deterministic.
            /// The field is static and is updated without any synchronization.
            /// </summary>
            private static int seed = 42;

            /// <summary>
            /// Address of the wiki's <c>api.php</c> endpoint.
            /// </summary>
            private readonly Uri _baseUri;

            /// <summary>
            /// Initializes a new instance of the <see cref="MediaWikiCrawl.MediaWikiApi"/> class.
            /// </summary>
            /// <param name='baseUri'>
            /// Address of the wiki. "/api.php" is appended when the value does not already
            /// contain it (compared case-insensitively). NOTE(review): the default value is
            /// only a path, not an absolute URL, so callers are presumably expected to pass
            /// the site's full address - confirm.
            /// </param>
            /// <exception cref="System.ArgumentNullException">
            /// <paramref name="baseUri"/> is <c>null</c>.
            /// </exception>
            public MediaWikiApi(string baseUri = "/api.php")
            {
                if (baseUri == null)
                {
                    throw new ArgumentNullException("baseUri");
                }

                // Ordinal, case-insensitive search: ToLower() is culture-sensitive and
                // would miss "API.PHP" under cultures with special casing rules.
                if (baseUri.IndexOf(ApiPath, StringComparison.OrdinalIgnoreCase) < 0)
                {
                    // Drop trailing slashes so "http://host/w/" does not become "http://host/w//api.php".
                    baseUri = baseUri.TrimEnd('/') + ApiPath;
                }

                this._baseUri = new Uri(baseUri);
            }

            /// <summary>
            /// Lazily enumerates the wiki's images via <c>list=allimages</c>, following the
            /// <c>query-continue</c> marker from one result page to the next.
            /// </summary>
            /// <returns>
            /// One <see cref="WikiImage"/> per <c>img</c> element, in the order returned by the API.
            /// </returns>
            /// <param name='startsWith'>
            /// Value sent as the <c>aifrom</c> parameter of the first request.
            /// </param>
            public IEnumerable<WikiImage> AllImages(string startsWith = "0")
            {
                var from = startsWith;

                // A loop instead of a recursive iterator: nesting one iterator per result
                // page makes every MoveNext walk the whole chain of pages fetched so far.
                while (true)
                {
                    // Formulate the URI.
                    var uri = new Uri(string.Format(
                        "{0}?format={1}&action={2}&list={3}&aifrom={4}",
                        this._baseUri.ToString(),
                        Webbie.Sanitize("xml"),
                        Webbie.Sanitize("query"),
                        Webbie.Sanitize("allimages"),
                        Webbie.Sanitize(from)));

                    // Find the root of the API (it is re-used, so materialize it once).
                    var xmlApi = XDocument.Parse(Webbie.Get(uri)).Descendants("api").ToList();

                    // Navigate the XML to formulate some images.
                    var images = xmlApi
                        .Descendants("query")
                        .Descendants("allimages")
                        .Descendants("img")
                        .Select(x => new WikiImage
                        {
                            Name = x.Attribute("name").Value,
                            Url = new Uri(x.Attribute("url").Value),
                            DescriptionUrl = new Uri(x.Attribute("descriptionurl").Value)
                        })
                        .ToList();

                    // Return what we can when we can.
                    foreach (var img in images)
                    {
                        yield return img;
                    }

                    // Determine what to continue with. The string conversion yields null for
                    // a missing attribute and FirstOrDefault yields null when there is no
                    // continuation element, so a partial marker ends the enumeration.
                    var continueWith = xmlApi
                        .Descendants("query-continue")
                        .Descendants("allimages")
                        .Select(x => (string)x.Attribute("aifrom"))
                        .FirstOrDefault();

                    if (string.IsNullOrWhiteSpace(continueWith))
                    {
                        yield break;
                    }

                    // Give a moment to reflect: pause 0-99 seconds before the next page.
                    // (Waiting on a freshly created, already-signaled AutoResetEvent returns
                    // immediately, so it cannot be used for the delay.)
                    seed = new Random(seed).Next() % 100;
                    Thread.Sleep(TimeSpan.FromSeconds(seed));

                    from = continueWith;
                }
            }

            /// <summary>
            /// Builds a <see cref="WikiPage"/> for the given title, populated with the titles
            /// returned by <see cref="PageImages"/>.
            /// </summary>
            /// <returns>
            /// A page whose <c>Title</c> is <paramref name="title"/> and whose
            /// <c>ImageTitles</c> is the result of <see cref="PageImages"/>.
            /// </returns>
            /// <param name='title'>
            /// Title of the page.
            /// </param>
            /// <exception cref="System.ArgumentNullException">
            /// <paramref name="title"/> is <c>null</c>, empty or white space.
            /// </exception>
            public WikiPage Page(string title)
            {
                if (string.IsNullOrWhiteSpace(title))
                {
                    throw new ArgumentNullException("title");
                }

                return new WikiPage
                {
                    Title = title,
                    ImageTitles = this.PageImages(title)
                };
            }

            /// <summary>
            /// Requests <c>prop=imageinfo</c> with <c>iiprop=url</c> for the given title and
            /// returns the first <c>ii</c> element found.
            /// </summary>
            /// <returns>
            /// A <see cref="WikiImage"/> with <c>Url</c> and <c>DescriptionUrl</c> set; either
            /// is <c>null</c> when the corresponding attribute is missing or empty.
            /// </returns>
            /// <param name='title'>
            /// Title to query.
            /// </param>
            /// <exception cref="System.ArgumentNullException">
            /// <paramref name="title"/> is <c>null</c>, empty or white space.
            /// </exception>
            /// <exception cref="System.InvalidOperationException">
            /// The response contains no <c>ii</c> element.
            /// </exception>
            public WikiImage PageImage(string title)
            {
                if (string.IsNullOrWhiteSpace(title))
                {
                    throw new ArgumentNullException("title");
                }

                var uri = new Uri(string.Format(
                    "{0}?format={1}&action={2}&titles={3}&prop={4}&iiprop={5}",
                    this._baseUri.ToString(),
                    Webbie.Sanitize("xml"),
                    Webbie.Sanitize("query"),
                    Webbie.Sanitize(title),
                    Webbie.Sanitize("imageinfo"),
                    Webbie.Sanitize("url")));

                return XDocument.Parse(Webbie.Get(uri))
                    .Descendants("api")
                    .Descendants("query")
                    .Descendants("pages")
                    .Descendants("page")
                    .Descendants("imageinfo")
                    .Descendants("ii")
                    .Select(x => new WikiImage
                    {
                        Url = ToUriOrNull(x.Attribute("url")),
                        DescriptionUrl = ToUriOrNull(x.Attribute("descriptionurl"))
                    })
                    .First();
            }

            /// <summary>
            /// Requests <c>prop=images</c> for the given title and collects the
            /// <c>title</c> attribute of every <c>im</c> element in the response.
            /// </summary>
            /// <returns>
            /// The image titles, in document order; empty when no <c>im</c> element is present.
            /// </returns>
            /// <param name='title'>
            /// Title of the page to query.
            /// </param>
            /// <exception cref="System.ArgumentNullException">
            /// <paramref name="title"/> is <c>null</c>, empty or white space.
            /// </exception>
            public ICollection<string> PageImages(string title)
            {
                if (string.IsNullOrWhiteSpace(title))
                {
                    throw new ArgumentNullException("title");
                }

                var uri = new Uri(string.Format(
                    "{0}?format={1}&action={2}&titles={3}&prop={4}",
                    this._baseUri.ToString(),
                    Webbie.Sanitize("xml"),
                    Webbie.Sanitize("query"),
                    Webbie.Sanitize(title),
                    Webbie.Sanitize("images")));

                return XDocument.Parse(Webbie.Get(uri))
                    .Descendants("api")
                    .Descendants("query")
                    .Descendants("pages")
                    .Descendants("page")
                    .Descendants("images")
                    .Descendants("im")
                    .Select(x => x.Attribute("title").Value)
                    .ToList();
            }

            /// <summary>
            /// Converts an attribute value to a <see cref="Uri"/>, returning <c>null</c>
            /// when the attribute is absent or its value is empty.
            /// </summary>
            private static Uri ToUriOrNull(XAttribute attribute)
            {
                if (attribute == null || string.IsNullOrEmpty(attribute.Value))
                {
                    return null;
                }

                return new Uri(attribute.Value);
            }
        }
    }