Thursday, July 24, 2014

Scrape Hidden Public APIs using C#

So you found a hidden unprotected API of a website you like.
Let's use dynamic objects to represent the JSON objects and save them in our database for later processing. There are legitimate reasons to do this — say, a crappy built-in search system on the website.



Please note that the following code depends entirely on the website's backend implementation. Do your research and change it accordingly.

This post might feel out of sequence; that is intentional.
Let's begin:

// Base URL of the JSON endpoint; it ends in "page=" so a page number can be appended.
const string ajaxUrl = "http://coolsite.com/some-stuff?page=";

Let's create a separate method to issue the HTTP request.

/// <summary>
/// Requests one page of the JSON API, hands every entry of its <c>stuff</c> array to
/// <c>AddToOurList</c> and, for page 1 only, derives the total number of pages from
/// <c>total_stuff</c>.
/// </summary>
/// <param name="client">HTTP client used to issue the GET request.</param>
/// <param name="url">Endpoint URL; the page number is appended to it.</param>
/// <param name="page">Page number to request; page 1 is treated as the first page.</param>
/// <returns>The total page count when <paramref name="page"/> is 1 and the response
/// carries <c>total_stuff</c>; otherwise 0.</returns>
/// <exception cref="HttpRequestException">The server answered with a non-success status code.</exception>
private async static Task<int> GetResponse(HttpClient client, string url, int page)
    {
        // I found out the page size is 50
        const double pageSize = 50.0;

        string responseBody;

        // Dispose the response as soon as the body has been read so that the
        // underlying connection is released promptly.
        using (var response = await client.GetAsync(url + page))
        {
            response.EnsureSuccessStatusCode();
            responseBody = await response.Content.ReadAsStringAsync();
        }

        var serializer = new JavaScriptSerializer();

        // Here we need to register our own dynamic JSON converter (DynamicJsonConverter).
        serializer.RegisterConverters(new[] { new DynamicJsonConverter() });

        dynamic obj = serializer.Deserialize(responseBody, typeof(object));
        if ((object)obj == null)
        {
            // The body was the JSON literal null: nothing to count, nothing to queue.
            return 0;
        }

        // Great! Now that we have a dynamic object, look at the JSON response
        // in the browser and call it like obj.some_json_property.
        // The converter yields null for properties that are absent, so both
        // properties are checked before they are used.
        int pageCount = 0;
        object total = obj.total_stuff;
        if (page == 1 && total != null)
        {
            pageCount = Convert.ToInt32(Math.Ceiling(Convert.ToDouble(total) / pageSize));
        }

        dynamic items = obj.stuff;
        if ((object)items != null)
        {
            foreach (var item in items)
            {
                AddToOurList(item);
            }
        }

        return pageCount;
    }

Now the DynamicJsonConverter.
NOTE: I found this on Stack Overflow (http://stackoverflow.com/a/3806407/959245). Creator, you are a hero.
using System;
    using System.Collections;
    using System.Collections.Generic;
    using System.Collections.ObjectModel;
    using System.Dynamic;
    using System.Linq;
    using System.Text;
    using System.Web.Script.Serialization;

/// <summary>
/// <see cref="JavaScriptConverter"/> that wraps a deserialized JSON object in a
/// <see cref="DynamicObject"/>, so that its properties can be read with member
/// syntax (<c>obj.some_property</c>) or index syntax (<c>obj["some_property"]</c>).
/// Only deserialization is supported.
/// </summary>
public sealed class DynamicJsonConverter : JavaScriptConverter
{
    /// <summary>
    /// Wraps <paramref name="dictionary"/> in a dynamic object when the requested
    /// <paramref name="type"/> is <see cref="object"/>; returns null for any other type.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="dictionary"/> is null.</exception>
    public override object Deserialize(IDictionary<string, object> dictionary, Type type, JavaScriptSerializer serializer)
    {
        if (dictionary == null)
            throw new ArgumentNullException("dictionary");

        return type == typeof(object) ? new DynamicJsonObject(dictionary) : null;
    }

    /// <summary>Serialization is not implemented; always throws.</summary>
    /// <exception cref="NotImplementedException">Always.</exception>
    public override IDictionary<string, object> Serialize(object obj, JavaScriptSerializer serializer)
    {
        throw new NotImplementedException();
    }

    /// <summary>The converter registers itself for <see cref="object"/> only.</summary>
    public override IEnumerable<Type> SupportedTypes
    {
        get { return new ReadOnlyCollection<Type>(new List<Type>(new[] { typeof(object) })); }
    }

    #region Nested type: DynamicJsonObject

    /// <summary>
    /// Dynamic view over one JSON object. Lookups of names that do not exist yield
    /// null instead of throwing, so callers can test for null.
    /// </summary>
    private sealed class DynamicJsonObject : DynamicObject
    {
        private readonly IDictionary<string, object> _dictionary;

        public DynamicJsonObject(IDictionary<string, object> dictionary)
        {
            if (dictionary == null)
                throw new ArgumentNullException("dictionary");
            _dictionary = dictionary;
        }

        /// <summary>
        /// Renders the object as JSON-like text: <c>{name:"text",other:1,...}</c>.
        /// Property names are written without quotes and strings are not escaped.
        /// </summary>
        public override string ToString()
        {
            var sb = new StringBuilder();
            AppendObject(sb, _dictionary);
            return sb.ToString();
        }

        // Writes "{name:value,...}" for one object, including both braces, so that
        // nested objects stay balanced wherever they appear.
        private static void AppendObject(StringBuilder sb, IDictionary<string, object> dictionary)
        {
            sb.Append("{");
            var first = true;
            foreach (var pair in dictionary)
            {
                if (!first)
                {
                    sb.Append(",");
                }
                first = false;

                sb.Append(pair.Key).Append(":");
                AppendValue(sb, pair.Value);
            }
            sb.Append("}");
        }

        // Writes a single value: quoted string, nested object, array, or the
        // value's default formatting for everything else.
        private static void AppendValue(StringBuilder sb, object value)
        {
            var text = value as string;
            if (text != null)
            {
                sb.Append("\"").Append(text).Append("\"");
                return;
            }

            var nested = value as IDictionary<string, object>;
            if (nested != null)
            {
                AppendObject(sb, nested);
                return;
            }

            var list = value as ArrayList;
            if (list != null)
            {
                sb.Append("[");
                var first = true;
                foreach (var element in list)
                {
                    if (!first)
                    {
                        sb.Append(",");
                    }
                    first = false;
                    AppendValue(sb, element);
                }
                sb.Append("]");
                return;
            }

            sb.AppendFormat("{0}", value);
        }

        /// <summary>Resolves <c>obj.name</c>; a missing name yields null.</summary>
        public override bool TryGetMember(GetMemberBinder binder, out object result)
        {
            result = Lookup(binder.Name);
            return true;
        }

        /// <summary>
        /// Resolves <c>obj[key]</c> for a single non-null index, using the index's
        /// string form as the property name; a missing name yields null. Any other
        /// index shape is passed to the base implementation.
        /// </summary>
        public override bool TryGetIndex(GetIndexBinder binder, object[] indexes, out object result)
        {
            if (indexes.Length == 1 && indexes[0] != null)
            {
                result = Lookup(indexes[0].ToString());
                return true;
            }

            return base.TryGetIndex(binder, indexes, out result);
        }

        // Returns the wrapped value stored under key, or null when the key is absent
        // (null rather than an exception, so the caller can check for null).
        private object Lookup(string key)
        {
            object raw;
            return _dictionary.TryGetValue(key, out raw) ? WrapResultObject(raw) : null;
        }

        // Makes a raw deserialized value usable through dynamic access: objects become
        // DynamicJsonObject, non-empty arrays become List<object> with every element
        // wrapped individually, anything else is returned unchanged.
        private static object WrapResultObject(object result)
        {
            var dictionary = result as IDictionary<string, object>;
            if (dictionary != null)
                return new DynamicJsonObject(dictionary);

            var arrayList = result as ArrayList;
            if (arrayList != null && arrayList.Count > 0)
            {
                // Decide per element instead of by the first element only: a mixed
                // array must neither fail the cast nor leave its objects unwrapped.
                return arrayList.Cast<object>().Select(x => WrapResultObject(x)).ToList();
            }

            return result;
        }
    }

    #endregion
}

We will be using the Task Parallel Library to issue requests,
so we need storage that can be modified safely by many threads. I'm going to use a concurrent queue.
 // Thread-safe store for the scraped items; filled from several threads at once.
 private static ConcurrentQueue<Item> _items = new ConcurrentQueue<Item>();

/// <summary>
/// Converts one dynamic JSON entry into an <c>Item</c> and appends it to the
/// shared <c>_items</c> queue.
/// </summary>
/// <param name="item">Dynamic JSON entry; its <c>created_at</c> property is parsed as a date.</param>
private static void AddToOurList(dynamic item)
    {
        Item theItem = new Item
        {
            CreatedAt = DateTime.Parse(item.created_at),
            ScrapedAt = DateTime.Now
        };

        // Enqueue into the queue declared above (_items).
        _items.Enqueue(theItem);
    }

Let's move on to setting up the requests.

Let's be polite to the server and try to mimic human behavior, assuming they don't really check the headers when it comes to the JSON API.

var client = new HttpClient();

// Indicate that this is an AJAX request.
client.DefaultRequestHeaders.Add("X-Requested-With", "XMLHttpRequest");

// Let's pretend we are "Chrome". The value is concatenated so that it stays on a
// single line: a multi-line verbatim string would embed its line breaks and
// indentation in the header value, which header validation may reject.
client.DefaultRequestHeaders.Add("User-Agent",
     "Mozilla/5.0 (Windows NT 6.1; WOW64) " +
     "AppleWebKit/537.36 (KHTML, like Gecko) " +
     "Chrome/35.0.1916.153 Safari/537.36");

// We want only JSON. Keep the HTML to yourself, stupid server!
client.DefaultRequestHeaders.Accept.Add(
       new MediaTypeWithQualityHeaderValue("application/json"));

Let's imagine that every response includes the total number of records available, and that you already know the page size from looking at the site's JavaScript sources.

 // Fetch page 1 first and block on it: its result is the total number of pages.
 // The endpoint constant is declared as ajaxUrl.
 int pageCount = GetResponse(client, ajaxUrl, 1).Result;
We need a way to cancel the tasks.
var cts = new CancellationTokenSource();
        // The options object carries both the cancellation token and the
        // parallelism cap (one worker per processor) into the parallel loop.
        var po = new ParallelOptions
        {
            CancellationToken = cts.Token,
            MaxDegreeOfParallelism = System.Environment.ProcessorCount
        };

Let's cancel the tasks when we press 'X'.
// Run a task so that we can cancel from another thread.
        Task.Factory.StartNew(() =>
        {
            // Keep reading keys until 'X' is pressed. Reading only one key would end
            // the listener on any other key press and leave no way to cancel later.
            while (Console.ReadKey().Key != ConsoleKey.X)
            {
            }

            cts.Cancel();
        });

Run it using Parallel.For.
try
{
    // The upper bound of Parallel.For is exclusive, so pageCount + 1 is needed for
    // the last page (pageCount) to be requested as well.
    Parallel.For(2, pageCount + 1, po, i =>
            {
                // Lets sleep for a while to mimic a human
                // who is still reading thru the data
                if (i % 10 == 0)
                {
                    var n = new Random().Next(20000);
                    Thread.Sleep(n);
                }

                try
                {
                    // Issue the request. Wait (unlike WaitAny) rethrows a failure of the
                    // request, and the token lets a pending wait end on cancellation.
                    GetResponse(client, ajaxUrl, i).Wait(po.CancellationToken);
                }
                catch (AggregateException e)
                {
                    // One failed page should not stop the others: report it and go on.
                    Console.Error.WriteLine("Page {0} failed: {1}", i,
                        e.InnerException != null ? e.InnerException.Message : e.Message);
                }
            });
}
catch (OperationCanceledException)
{
    // The token in po was cancelled: Parallel.For stops scheduling pages and
    // reports the cancellation with this exception. Items queued so far remain.
}

If you want to save these to a database, do it once the tasks have been cancelled.