/** *************************************************** **/ /** **/ /** WORDPRESS EBOOK SCRAPER (FOR WEB SERIALS) **/ /** **/ /** *************************************************** **/ const string BASE_DIR_STASH = @"F:\Stash\eBook_scraper\"; const string BASE_DIR_OUT = @"F:\Home\Cloud\Dokumente\E-Books\Scraper\"; const string COMPARE_PROG = @"C:\Program Files\Beyond Compare 4\BCompare.exe"; //----------------------------------------------------------------------------------------------------// static readonly EpubParameter PH1 = new EpubParameter(Site.WP, "Parahumans", 1, "Worm", "John McCrae", "2011-06-11", "en", @"https://parahumans.wordpress.com/2011/06/11/1-1/"); static readonly EpubParameter PH2 = new EpubParameter(Site.WP, "Parahumans", 2, "Ward", "John McCrae", "2017-10-21", "en", @"https://www.parahumans.net/2017/10/21/glow-worm-0-1/"); static readonly EpubParameter PACT = new EpubParameter(Site.WP, "Pact", "John McCrae", "2013-12-17", "en", @"https://pactwebserial.wordpress.com/2013/12/17/bonds-1-1/"); static readonly EpubParameter TWIG = new EpubParameter(Site.WP, "Twig", "John McCrae", "2014-12-24", "en", @"https://twigserial.wordpress.com/2014/12/24/taking-root-1-1/"); static readonly EpubParameter PALE = new EpubParameter(Site.WP, "Pale", "John McCrae", "2020-05-05", "en", @"https://palewebserial.wordpress.com/2020/05/05/blood-run-cold-0-0/"); static readonly EpubParameter APGTE1 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 1, "A Practical Guide to Evil I", "David Verburg", "2015-03-24", "en", @"https://practicalguidetoevil.wordpress.com/2015/03/25/prologue/"); static readonly EpubParameter APGTE2 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 2, "A Practical Guide to Evil II", "David Verburg", "2015-11-04", "en", @"https://practicalguidetoevil.wordpress.com/2015/11/04/prologue-2/"); static readonly EpubParameter APGTE3 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 3, "A Practical Guide to Evil III", "David 
Verburg", "2017-02-08", "en", @"https://practicalguidetoevil.wordpress.com/2017/02/08/prologue-3/"); static readonly EpubParameter APGTE4 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 4, "A Practical Guide to Evil IV", "David Verburg", "2018-04-09", "en", @"https://practicalguidetoevil.wordpress.com/2018/04/09/prologue-4/"); static readonly EpubParameter APGTE5 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 5, "A Practical Guide to Evil V", "David Verburg", "2019-01-05", "en", @"https://practicalguidetoevil.wordpress.com/2019/01/14/prologue-5/"); static readonly EpubParameter APGTE6 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 6, "A Practical Guide to Evil VI", "David Verburg", "2020-01-06", "en", @"https://practicalguidetoevil.wordpress.com/2020/01/06/prologue-6/"); static readonly EpubParameter APGTE7 = new EpubParameter(Site.WP, "A Practical Guide to Evil", 7, "A Practical Guide to Evil VII", "David Verburg", "2021-03-02", "en", @"https://practicalguidetoevil.wordpress.com/2021/03/02/prologue-7/"); static readonly EpubParameter TDE1 = new EpubParameter(Site.WW, "The Divine Elements", 1, "The Blood Legacy", "Daman Dasi", "2016-04-06", "en", @"http://www.wuxiaworld.com/tde-index/tde-chapter-0/"); static readonly EpubParameter TDE2 = new EpubParameter(Site.WW, "The Divine Elements", 2, "The Desolate Mountains", "Daman Dasi", "2016-07-09", "en", @"http://www.wuxiaworld.com/tde-index/tde-chapter-61/"); static readonly EpubParameter TDE3 = new EpubParameter(Site.WW, "The Divine Elements", 3, "Scion of Raizel", "Daman Dasi", "2017-06-15", "en", @"http://www.wuxiaworld.com/tde-index/tde-chapter-115/"); static readonly EpubParameter TDE4 = new EpubParameter(Site.WW, "The Divine Elements", 4, "The Seventh Tower", "Daman Dasi", "2017-08-07", "en", @"http://www.wuxiaworld.com/tde-index/tde-chapter-179/"); static readonly EpubParameter SOTL = new EpubParameter(Site.WP, "Shadows of the Limelight", "Alexander Wales", "2015-04-18", "en", 
@"http://alexanderwales.com/shadows-of-the-limelight-ch-1-the-rooftop-races/"); static readonly EpubParameter UNSONG = new EpubParameter(Site.WP, "Unsong", "Scott Alexander", "2015-12-08", "en", @"http://unsongbook.com/prologue-2/"); static readonly EpubParameter TGAB1_1 = new EpubParameter(Site.WP, "The Gods are Bastards", 1, "What Fresh Hell", "D. D. Webb", "2014-08-20", "en", @"https://tiraas.net/2014/08/20/book-1-prologue/"); static readonly EpubParameter TGAB1_2 = new EpubParameter(Site.WP, "The Gods are Bastards", 2, "Spacious Skies, Amber Waves", "D. D. Webb", "2014-10-10", "en", @"https://tiraas.net/2014/10/10/2-1/"); static readonly EpubParameter TGAB1_3 = new EpubParameter(Site.WP, "The Gods are Bastards", 3, "A Fistful of Blood", "D. D. Webb", "2014-12-01", "en", @"https://tiraas.net/2014/12/01/3-1/"); static readonly EpubParameter TGAB1_4 = new EpubParameter(Site.WP, "The Gods are Bastards", 4, "This Town Ain't Big Enough", "D. D. Webb", "2014-12-24", "en", @"https://tiraas.net/2014/12/24/4-1/"); static readonly EpubParameter TGAB2_1 = new EpubParameter(Site.WP, "The Gods are Bastards", 5, "The Streets Where You Live", "D. D. Webb", "2015-02-24", "en", @"https://tiraas.net/2015/02/24/volume-2-prologue/"); static readonly EpubParameter TGAB2_2 = new EpubParameter(Site.WP, "The Gods are Bastards", 6, "Crawling Chaos", "D. D. Webb", "2015-05-20", "en", @"https://tiraas.net/2015/05/20/6-1/"); static readonly EpubParameter TGAB2_3 = new EpubParameter(Site.WP, "The Gods are Bastards", 7, "Hath No Fury", "D. D. Webb", "2015-08-03", "en", @"https://tiraas.net/2015/08/03/7-1/"); static readonly EpubParameter TGAB3_1 = new EpubParameter(Site.WP, "The Gods are Bastards", 8, "The Mind and the Sword", "D. D. Webb", "2015-09-14", "en", @"https://tiraas.net/2015/09/14/prologue-volume-3/"); static readonly EpubParameter TGAB3_2 = new EpubParameter(Site.WP, "The Gods are Bastards", 9, "Draw", "D. D. 
Webb", "2015-11-23", "en", @"https://tiraas.net/2015/11/23/9-1/"); static readonly EpubParameter TGAB3_3 = new EpubParameter(Site.WP, "The Gods are Bastards", 10, "And Justice for All", "D. D. Webb", "2016-02-29", "en", @"https://tiraas.net/2016/02/29/10-1/"); static readonly EpubParameter TGAB4_1 = new EpubParameter(Site.WP, "The Gods are Bastards", 11, "If You Can Make It Here", "D. D. Webb", "2016-07-29", "en", @"https://tiraas.net/2016/07/29/prologue-volume-4/"); static readonly EpubParameter TGAB4_2 = new EpubParameter(Site.WP, "The Gods are Bastards", 12, "Sleeper", "D. D. Webb", "2016-11-18", "en", @"https://tiraas.net/2016/11/18/12-1/"); static readonly EpubParameter TGAB4_3 = new EpubParameter(Site.WP, "The Gods are Bastards", 13, "From Sea to Stormy Sea", "D. D. Webb", "2017-08-07", "en", @"https://tiraas.net/2017/08/07/13-1/"); static readonly EpubParameter TGAB5_1 = new EpubParameter(Site.WP, "The Gods are Bastards", 14, "Themselves Contend", "D. D. Webb", "2018-04-16", "en", @"https://tiraas.net/2018/04/16/prologue-volume-5/"); static readonly EpubParameter TGAB5_2 = new EpubParameter(Site.WP, "The Gods are Bastards", 15, "The Fae, the Fell, and the Holy", "D. D. Webb", "2018-12-14", "en", @"https://tiraas.net/2018/12/14/15-1/"); static readonly EpubParameter NSTAR_1 = new EpubParameter(Site.WP, "Netherstar", 1, "Awakening", "D. D. 
Webb", "2019-01-26", "en", @"https://netherstar.net/2019/01/26/chapter-1-i-meant-to-do-that/"); static readonly EpubParameter CHESTS = new EpubParameter(Site.RR, "Everybody Loves Large Chests", "Neven Iliev", "2016-10-27", "en", @"https://www.royalroad.com/fiction/8894/everybody-loves-large-chests/chapter/99919/prologue"); static readonly EpubParameter MWC = new EpubParameter(Site.RR, "Metaworld Chronicles", "Wutosama", "2018-09-22", "en", @"https://www.royalroad.com/fiction/14167/metaworld-chronicles/chapter/163574/chapter-1-some-things-begin-something-ends"); static readonly EpubParameter WTC = new EpubParameter(Site.RR, "Worth the Candle", "Alexander Wales", "2017-07-14", "en", @"https://www.royalroad.com/fiction/25137/worth-the-candle/chapter/366577/taking-the-fall"); static readonly EpubParameter WLD = new EpubParameter(Site.WP, "What Lies Dreaming", "Eneasz Brodski", "2018-11-11", "en", @"http://whatliesdreaming.com/1-joah/"); static readonly EpubParameter WI = new EpubParameter(Site.WP, "The Wandering Inn", "pirateaba", "2016-06-27", "en", @"https://wanderinginn.com/2016/07/27/1-00/"); static readonly EpubParameter RTW = new EpubParameter(Site.WW, "Release that Witch", "Er Mu", "2019-09-02", "en", @"https://www.wuxiaworld.co/Release-that-Witch/1235444.html"); static readonly EpubParameter MOL = new EpubParameter(Site.RR, "Mother of Learning", "Domagoj Kurmaic", "2019-11-03", "en", @"https://www.royalroad.com/fiction/21220/mother-of-learning/chapter/301778/1-good-morning-brother"); static readonly EpubParameter TML = new EpubParameter(Site.RR, "The Menocht Loop", "caerulex", "2020-04-10", "en", @"https://www.royalroad.com/fiction/31514/the-menocht-loop/chapter/479082/1-yet-again"); static readonly EpubParameter TPR = new EpubParameter(Site.RR, "The Perfect Run", "Maxime J. 
Durand", "2020-10-14", "en", @"https://www.royalroad.com/fiction/36735/the-perfect-run/chapter/569225/1-quicksave"); //----------------------------------------------------------------------------------------------------// readonly EpubParameter[] BOOKS = new[] { TPR }; readonly bool USE_WEBCACHE = true; readonly bool DO_LIVE_RELOAD_OF_LAST = true; readonly bool CONVERT_MOBI = true; readonly MainMode MODE = MainMode.Generate; //----------------------------------------------------------------------------------------------------// static EpubParameter ACTIVE_BOOK = null; const int LIMIT = 1500; readonly Regex REX_NUMSTART = new Regex(@"^\s*(?[0-9]+)\s*\-.*$", RegexOptions.Compiled); Dictionary webCache = new Dictionary(); string STASH_FOLDER => BASE_DIR_STASH + ACTIVE_BOOK.Foldername + @"\"; string WCACHE_FILE => BASE_DIR_OUT + @"_cache\" + ACTIVE_BOOK.Foldername + @".xml"; string HTML_FILE_OUT => BASE_DIR_OUT + @"html\" + ACTIVE_BOOK.Foldername + @".html"; string EPUB_FILE_OUT => BASE_DIR_OUT + @"epub\" + ACTIVE_BOOK.Foldername + @".epub"; string MOBI_FILE_OUT => BASE_DIR_OUT + @"mobi\" + ACTIVE_BOOK.Foldername + @".mobi"; string HTML_FILE_STASH => STASH_FOLDER + @"book.html"; string ZIP_FILE_STASH => STASH_FOLDER + @"book.zip"; string EPUB_FILE_STASH => STASH_FOLDER + @"book.epub"; string MOBI_FILE_STASH => STASH_FOLDER + @"book.mobi"; string QUERY_FOLDER => STASH_FOLDER + @"query\"; // full query result string HTML_FOLDER => STASH_FOLDER + @"html\"; // unprocessed chapter code string EPUB_FOLDER => STASH_FOLDER + @"epub\"; // processed epub chapter code //----------------------------------------------------------------------------------------------------// public enum MainMode { Generate, Verify, } public enum ProcessResult { SuccessNormal, ReachedEnd, SkipChapter, } public enum Site { Wordpress, WuxiaWorld, Royalroad, WP = Wordpress, WW = WuxiaWorld, RR = Royalroad, } public class Chapter { public string url; public string title; public string next; public 
GZippedString queryResult;       // full page source, as downloaded or as read from the web cache
    public GZippedString sourcecode; // outer HTML of the content node located by ProcessChapter
    public GZippedString chapter;    // cleaned-up inner HTML of the chapter node
    public bool isPrologue;
    public bool isEpilogue;
    public bool isBonus;

    // True when the chapter is flagged as prologue, epilogue or bonus.
    public bool isSpecial => isPrologue || isEpilogue || isBonus;
}

/// <summary>
/// One URL/content pair of the web cache in the shape that is written to the cache XML file.
/// </summary>
public class SerializableCacheEntry
{
    public string URL;
    public GZippedString Content;
}

/// <summary>
/// String wrapper that is stored gzip-compressed when XML-serialized.
/// Wire format: Base64( 4-byte length of the UTF-8 payload + gzip stream of that payload ).
/// </summary>
public class GZippedString : IXmlSerializable
{
    public string Value { get; set; }

    public System.Xml.Schema.XmlSchema GetSchema() { return null; }

    /// <summary>Reads the element text and decompresses it into <see cref="Value"/>.</summary>
    public void ReadXml(System.Xml.XmlReader reader)
    {
        Value = DecompressString(reader.ReadString());
        reader.ReadEndElement();
    }

    /// <summary>Writes <see cref="Value"/> as compressed, Base64-encoded element text.</summary>
    public void WriteXml(System.Xml.XmlWriter writer)
    {
        writer.WriteString(CompressString(Value));
    }

    /// <summary>
    /// Compresses <paramref name="text"/> into the length-prefixed Base64 format.
    /// A null text is stored as an empty string (it previously crashed the whole serialization).
    /// </summary>
    private string CompressString(string text)
    {
        byte[] buffer = Encoding.UTF8.GetBytes(text ?? string.Empty);

        byte[] compressedData;
        using (var memoryStream = new MemoryStream())
        {
            // leaveOpen: the gzip stream has to be disposed (flushing its footer) before the bytes are taken
            using (var gZipStream = new GZipStream(memoryStream, CompressionMode.Compress, true))
                gZipStream.Write(buffer, 0, buffer.Length);

            compressedData = memoryStream.ToArray();
        }

        var gZipBuffer = new byte[compressedData.Length + 4];
        Buffer.BlockCopy(BitConverter.GetBytes(buffer.Length), 0, gZipBuffer, 0, 4);
        Buffer.BlockCopy(compressedData, 0, gZipBuffer, 4, compressedData.Length);
        return Convert.ToBase64String(gZipBuffer);
    }

    /// <summary>
    /// Reverses <see cref="CompressString"/>.
    /// </summary>
    /// <exception cref="InvalidDataException">The gzip stream ends before the announced length is reached.</exception>
    private string DecompressString(string compressedText)
    {
        byte[] gZipBuffer = Convert.FromBase64String(compressedText);
        int dataLength = BitConverter.ToInt32(gZipBuffer, 0);
        var buffer = new byte[dataLength];

        using (var memoryStream = new MemoryStream(gZipBuffer, 4, gZipBuffer.Length - 4))
        using (var gZipStream = new GZipStream(memoryStream, CompressionMode.Decompress))
        {
            // Stream.Read may return fewer bytes than requested, so keep reading until the
            // buffer is full; a single call can leave the tail of the text as NUL characters.
            int offset = 0;
            while (offset < buffer.Length)
            {
                int read = gZipStream.Read(buffer, offset, buffer.Length - offset);
                if (read == 0)
                    throw new InvalidDataException($"Compressed string is truncated: expected {buffer.Length} bytes but got {offset}.");
                offset += read;
            }
        }

        return Encoding.UTF8.GetString(buffer);
    }

    public static implicit operator GZippedString(string v) => new GZippedString { Value = v };

    // A null wrapper converts to a null string instead of throwing.
    public static implicit operator string (GZippedString v) => v?.Value;
}

/// <summary>
/// StringWriter whose <see cref="Encoding"/> property reports UTF-8.
/// </summary>
public class Utf8StringWriter : StringWriter
{
    public override Encoding Encoding { get { return Encoding.UTF8; } }
}

/// <summary>
/// Immutable description of one book: metadata, start URL, site type and the folder name
/// derived from series and title.
/// </summary>
public class EpubParameter
{
    public readonly string Series;      // null for books that are not part of a series
    public readonly int SeriesIndex;    // -1 for books that are not part of a series
    public readonly Guid ID_OPF;
    public readonly Guid ID_CAL;
    public readonly string Title;
    public readonly string Author;
    public readonly DateTime Release;
    public readonly string Language;
    public readonly string StartURL;
    public readonly string Foldername;
    public readonly Site SiteType;

    // The space-separated parts of Author in reverse order, joined with ", " ("John McCrae" -> "McCrae, John").
    public string AuthorSort { get { return Author.Split(' ').Aggregate((a, b) => b + ", " + a); } }

    /// <summary>
    /// Creates a standalone book (no series, index -1).
    /// </summary>
    public EpubParameter(Site st, string t, string a, string r, string l, string s) : this(st, null, -1, t, a, r, l, s) { }

    /// <summary>
    /// Creates a book.
    /// </summary>
    /// <param name="st">Site type of the source.</param>
    /// <param name="z">Series name, or null.</param>
    /// <param name="i">Index inside the series.</param>
    /// <param name="t">Title.</param>
    /// <param name="a">Author.</param>
    /// <param name="r">Release date in the format yyyy-MM-dd.</param>
    /// <param name="l">Language code.</param>
    /// <param name="s">URL of the first chapter.</param>
    public EpubParameter(Site st, string z, int i, string t, string a, string r, string l, string s)
    {
        SiteType = st;
        Series = z;
        SeriesIndex = i;
        Title = t;
        Author = a;
        Release = DateTime.ParseExact(r, "yyyy-MM-dd", CultureInfo.InvariantCulture);
        Language = l;
        StartURL = s;

        // Folder name is either "<title>" or "<series> <index> - <title>", reduced to filename-safe characters.
        if (z == null)
            Foldername = Filenamify(t);
        else
            Foldername = string.Format("{0} {1} - {2}", Filenamify(z), i, Filenamify(t));

        // Both IDs come from a Random seeded with the title and author hash codes.
        // NOTE(review): string.GetHashCode() is not guaranteed to be stable across runtimes or
        // processes - confirm the IDs really stay the same between runs on the runtime in use.
        var u = new Random(Title.GetHashCode() ^ Author.GetHashCode());
        var g = new byte[16];
        u.NextBytes(g);
        ID_OPF = new Guid(g);
        u.NextBytes(g);
        ID_CAL = new Guid(g);
    }

    // Display name: the title alone, or "<series> <index> - <title>" for series books.
    public String DisplayStr => (Series == null) ?
$"{Title}" : $"{Series} {SeriesIndex} - {Title}";
}

//----------------------------------------------------------------------------------------------------//

/// <summary>
/// Script entry point: runs the operation selected by MODE.
/// </summary>
void Main()
{
    Util.AutoScrollResults = true;

    if (MODE == MainMode.Generate) Generate();
    if (MODE == MainMode.Verify) Verify();
}

/// <summary>
/// Scrapes every book in BOOKS and writes the HTML and EPUB output (plus MOBI when CONVERT_MOBI is set).
/// </summary>
void Generate()
{
    foreach (var bb in BOOKS)
    {
        ACTIVE_BOOK = bb;

        var banner = $" [PROCESSING BOOK] {bb.DisplayStr} ";
        var rule = new string('=', banner.Length);

        $"".Dump();
        $"".Dump();
        $"".Dump();
        rule.Dump();
        banner.Dump();
        rule.Dump();
        $"".Dump();
        $"".Dump();
        $"".Dump();

        Init();

        var chapters = FindChapters();

        WriteBookHTML(chapters);
        WriteEpub(chapters);
        if (CONVERT_MOBI) GenerateMobi();
    }
}

/// <summary>
/// Compares the cached pages of every book in BOOKS against the live pages.
/// </summary>
void Verify()
{
    foreach (var bb in BOOKS)
    {
        ACTIVE_BOOK = bb;

        var banner = $" [VERIFYING BOOK] {bb.DisplayStr} ";
        var rule = new string('=', banner.Length);

        $"".Dump();
        $"".Dump();
        $"".Dump();
        rule.Dump();
        banner.Dump();
        rule.Dump();
        $"".Dump();
        $"".Dump();
        $"".Dump();

        // LoadWebCache() keeps the current dictionary when the book has no cache file yet,
        // so drop the previous book's entries first.
        webCache.Clear();
        LoadWebCache();

        VerifyChapters();
    }
}

/// <summary>
/// Prepares the working folders of the active book: empties the stash sub folders, deletes
/// the stashed book files, creates all stash/output directories and loads the web cache.
/// </summary>
void Init()
{
    if (Directory.Exists(STASH_FOLDER))
    {
        // Only the files directly inside the first-level sub folders are deleted.
        foreach (var dir in Directory.EnumerateDirectories(STASH_FOLDER).ToList())
        {
            foreach (var file in Directory.EnumerateFiles(dir).ToList()) File.Delete(file);
        }

        if (File.Exists(HTML_FILE_STASH)) File.Delete(HTML_FILE_STASH);
        if (File.Exists(ZIP_FILE_STASH)) File.Delete(ZIP_FILE_STASH);
        if (File.Exists(EPUB_FILE_STASH)) File.Delete(EPUB_FILE_STASH);
        if (File.Exists(MOBI_FILE_STASH)) File.Delete(MOBI_FILE_STASH);
    }

    Directory.CreateDirectory(STASH_FOLDER);
    Directory.CreateDirectory(QUERY_FOLDER);
    Directory.CreateDirectory(HTML_FOLDER);
    Directory.CreateDirectory(EPUB_FOLDER);

    Directory.CreateDirectory(BASE_DIR_OUT + @"_cache\");
    Directory.CreateDirectory(BASE_DIR_OUT + @"html\");
    Directory.CreateDirectory(BASE_DIR_OUT + @"epub\");
    Directory.CreateDirectory(BASE_DIR_OUT + @"mobi\");

    // Start every book with an empty cache. Otherwise pages of the previously processed book
    // stay in memory and SaveCache() writes them into this book's cache file.
    webCache.Clear();
    if (USE_WEBCACHE) LoadWebCache();
}

void WriteBookHTML(List<Chapter> chapters) { StringBuilder b = new
StringBuilder(); b.AppendLine(""); b.AppendLine(""); b.AppendLine(""); foreach (var currChapter in chapters) { b.AppendLine(); b.AppendLine("

" + HtmlEntity.Entitize(currChapter.title) + "

"); b.AppendLine(); b.AppendLine(currChapter.chapter); } b.AppendLine(""); b.AppendLine(""); File.WriteAllText(HTML_FILE_STASH, b.ToString(), Encoding.UTF8); File.Copy(HTML_FILE_STASH, HTML_FILE_OUT, true); } void SaveCache() { var xs = new XmlSerializer(typeof(List)); using (var writer = new System.IO.StreamWriter(WCACHE_FILE)) { xs.Serialize(writer, webCache.Select(p => new SerializableCacheEntry { URL = p.Key, Content = new GZippedString{ Value = p.Value } }).ToList()); } } void LoadWebCache() { if (!File.Exists(WCACHE_FILE)) return; XmlSerializer deserializer = new XmlSerializer(typeof(List)); using (TextReader reader = new StreamReader(WCACHE_FILE)) { var result = new List(); var l = (List)deserializer.Deserialize(reader); webCache = l.ToDictionary(p => p.URL, p => p.Content.Value); } } List FindChapters() { List result = new List(); using (WebClient client = new WebClient()) { client.Encoding = Encoding.UTF8; Stack buffer = new Stack(); buffer.Push(ACTIVE_BOOK.StartURL); while (buffer.Any() && result.Count < LIMIT) { var url = buffer.Pop(); Chapter curr = new Chapter() { url = url }; var buffered = webCache.ContainsKey(url.ToLower()); if (buffered) { curr.queryResult = webCache[url.ToLower()]; "*(loaded from webcache)*".Dump(); } else { curr.queryResult = client.DownloadString(Uri.UnescapeDataString(url)); webCache[url.ToLower()] = curr.queryResult; SaveCache(); } var r = ProcessChapter(curr, result, s=>s.Dump(), out var next_url); if (next_url != null) buffer.Push(next_url); if (buffered && buffer.Count == 0 && DO_LIVE_RELOAD_OF_LAST) { "".Dump(); "//==> *(auto-reload from live)*".Dump(); "".Dump(); curr.queryResult = client.DownloadString(Uri.UnescapeDataString(url)); webCache[url.ToLower()] = curr.queryResult; SaveCache(); r = ProcessChapter(curr, result, s=>s.Dump(), out var next_url_inner); if (next_url_inner != null) buffer.Push(next_url_inner); } if (r == ProcessResult.SuccessNormal) { " ==> Chapter processed".Dump(); result.Add(curr); 
OutputChapter(curr, result.Count);
            }
            else if (r == ProcessResult.SkipChapter)
            {
                " ==> Skip this chapter".Dump();
            }
            else if (r == ProcessResult.ReachedEnd)
            {
                " ==> End reached".Dump();
            }

            "".Dump();
        }
    }

    return result;
}

/// <summary>
/// Walks the chapter chain of the active book, starting at its StartURL, and compares the
/// cached version of every page with a freshly downloaded one. Process result, next URL,
/// title and chapter content are compared; every difference is dumped, together with links
/// to open the compare tool or to overwrite the cache entry. The walk stops at the first URL
/// that is not in the web cache.
/// </summary>
void VerifyChapters()
{
    // Successfully processed chapters. ProcessChapter needs them as back buffer for its
    // loop / end-of-book detection; without them a "next" link from the last chapter back
    // to the first one makes the verification run forever.
    var result = new List<Chapter>();

    // Writes both texts to temp files and opens them in the compare tool.
    void OpenCompare(string left, string right)
    {
        var fa = Path.Combine(Path.GetTempPath(), "buffer_" + Guid.NewGuid() + ".txt");
        var fb = Path.Combine(Path.GetTempPath(), "live___" + Guid.NewGuid() + ".txt");
        File.WriteAllText(fa, left);
        File.WriteAllText(fb, right);
        Process.Start(COMPARE_PROG, $"\"{fa}\" \"{fb}\"");
    }

    using (WebClient client = new WebClient())
    {
        client.Encoding = Encoding.UTF8;

        var buffer = new Stack<string>();
        buffer.Push(ACTIVE_BOOK.StartURL);

        while (buffer.Any() && result.Count < LIMIT)
        {
            var url = buffer.Pop();
            var cacheKey = url.ToLower();

            Chapter curr_buffer = new Chapter() { url = url };
            Chapter curr_live = new Chapter() { url = url };

            // Only pages that are already cached can be verified.
            if (!webCache.TryGetValue(cacheKey, out var cached)) continue;

            try
            {
                curr_buffer.queryResult = cached;
                curr_live.queryResult = client.DownloadString(Uri.UnescapeDataString(url));
            }
            catch (Exception e)
            {
                $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Live reload resulted in exception: {e.Message}".Dump();
                continue;
            }

            var is_diff = false;

            var r_buffer = ProcessChapter(curr_buffer, result, _ => {}, out var next_buffer);
            var r_live = ProcessChapter(curr_live, result, _ => {}, out var next_live);

            // The walk always follows the cached version.
            if (next_buffer != null) buffer.Push(next_buffer);

            if (r_buffer != r_live)
            {
                $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different Process result: {r_buffer} <> {r_live}".Dump();
                is_diff = true;
            }

            // Was a copy of the process-result condition above and therefore never compared the URLs.
            if (!Relaxedurleq(next_buffer, next_live))
            {
                $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different push URL: {next_buffer} <> {next_live}".Dump();
                is_diff = true;
            }

            if (!Relaxedurleq(curr_buffer.next, curr_live.next))
            {
                $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different next chapter: {curr_buffer.next} <> {curr_live.next}".Dump();
                is_diff = true;
            }

            if (curr_buffer.title != curr_live.title)
            {
                $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different title: {curr_buffer.title} <> {curr_live.title}".Dump();
                is_diff = true;
            }

            // chapter stays null when ProcessChapter stops early with ReachedEnd.
            var raw_buffer = curr_buffer.chapter?.Value;
            var raw_live = curr_live.chapter?.Value;

            if (raw_buffer != raw_live)
            {
                var clean_buffer = GetChapterText(curr_buffer);
                var clean_live = GetChapterText(curr_live);

                // Markup-only changes are ignored, only the visible text counts.
                if (clean_buffer.Trim() != clean_live.Trim())
                {
                    $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] Different content: ".Dump();

                    new Hyperlinq(() => OpenCompare(raw_buffer ?? string.Empty, raw_live ?? string.Empty), "[Compare Raw]").Dump();
                    new Hyperlinq(() => OpenCompare(clean_buffer, clean_live), "[Compare Text]").Dump();
                    new Hyperlinq(() =>
                    {
                        webCache[cacheKey] = curr_live.queryResult;
                        SaveCache();
                    }, "[Save new version to webcache]").Dump();

                    is_diff = true;
                }
            }

            if (!is_diff) $"[{ACTIVE_BOOK.DisplayStr} | {curr_buffer.title ?? url}] OK - No differences".Dump();
            if (is_diff) "".Dump();

            // Same bookkeeping as in FindChapters: only normally processed chapters count.
            if (r_buffer == ProcessResult.SuccessNormal) result.Add(curr_buffer);
        }
    }
}

/// <summary>
/// Compares two URLs while ignoring a leading "https://" or "http://".
/// Two nulls are equal; a null never equals a non-null URL.
/// </summary>
bool Relaxedurleq(string a, string b)
{
    if (a == b) return true;
    if (a == null || b == null) return false; // a chapter without next link has no URL

    if (a.StartsWith("https://", StringComparison.Ordinal)) a = a.Substring("https://".Length);
    if (a.StartsWith("http://", StringComparison.Ordinal)) a = a.Substring("http://".Length);

    if (b.StartsWith("https://", StringComparison.Ordinal)) b = b.Substring("https://".Length);
    if (b.StartsWith("http://", StringComparison.Ordinal)) b = b.Substring("http://".Length);

    return (a == b);
}

/// <summary>
/// Returns the chapter content converted to plain text, trimmed and with every whitespace
/// run collapsed to a single space. Missing or blank content yields an empty string.
/// </summary>
string GetChapterText(Chapter c)
{
    var html = c.chapter?.Value;
    if (string.IsNullOrWhiteSpace(html)) return string.Empty;

    var clean = HTMLToText.ConvertHtml(html);
    clean = clean.Trim();
    clean = Regex.Replace(clean, @"\s+", " ");
    return clean;
}

ProcessResult ProcessChapter(Chapter curr, IReadOnlyList<Chapter> backBuffer, Action<string> prt, out string forwardQueue_next)
{
    forwardQueue_next = null;

    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(curr.queryResult);

    #region Base

    var nodeContent = doc.DocumentNode.SelectSingleNode(@"//article[contains(@class,'post') and contains(@class ,'type-post')]");
    if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//article[contains(@id,'post') and contains(@class ,'post')]");
    if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@id,'post') and contains(@class ,'post')]");
    if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@class ,'chapter')]//div[contains(@class ,'portlet-body')]");
    if (nodeContent == null && ACTIVE_BOOK.SiteType == Site.WW) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@class,'box_con')]");

    var nodeNav = doc.DocumentNode.SelectSingleNode(@"//nav[contains(@class,'post-navigation') and @role='navigation']");
    if (nodeNav == null) nodeNav = doc.DocumentNode.SelectSingleNode(@"//div[contains(@class,'pjgm-navigation')]");
    if (nodeNav == null) nodeNav = nodeContent.SelectSingleNode(@"//div[contains(@class,'nav-buttons')]");
    if (nodeNav == null) nodeNav = nodeContent;

    var nodeChapter =
nodeContent.SelectSingleNode(@"//div[contains(@class, 'entry-content') or contains(@class, 'postcontent') or contains(@class, 'post-content') or contains(@class, 'chapter-content')]"); if (nodeChapter == null && ACTIVE_BOOK.SiteType == Site.WW) nodeChapter = nodeContent.SelectSingleNode(@"//div[contains(@id, 'content')]"); #endregion #region Title var titleNode = nodeContent.SelectSingleNode(@"//header[@class='entry-header']//h1[@class='entry-title']"); if (titleNode == null) titleNode = nodeContent.SelectSingleNode(@"//h1[contains(@class, 'posttitle')]"); if (titleNode == null) titleNode = nodeContent.SelectSingleNode(@"//div[contains(@class, 'fic-header')]//h1"); if (titleNode == null && ACTIVE_BOOK.SiteType == Site.WP) titleNode = nodeContent.SelectSingleNode(@"//div[contains(@class, 'entry-content')]//strong"); if (titleNode == null && ACTIVE_BOOK.SiteType == Site.WW) titleNode = nodeContent.SelectSingleNode(@"//div[contains(@class, 'bookname')]/h1"); curr.title = TitleFmt(HtmlEntity.DeEntitize(titleNode.InnerText)); var titles = new List(); titles.Add(curr.title); if (string.IsNullOrWhiteSpace(curr.title) || Regex.IsMatch(curr.title.ToLower(), @"^chapter [0-9]+.*")) { var baseTitle = curr.title; var suffix = TitleFmt(Regex.Match(curr.title.ToLower(), @"^chapter [0-9]+(.*)$").Groups[1].Value); var prefix1 = Regex.Match(curr.title.ToLower(), @"^(chapter) ([0-9]+)").Groups[0].Value; var prefix2 = "chapter " + int.Parse(Regex.Match(curr.title.ToLower(), @"^(chapter) ([0-9]+)").Groups[2].Value); titles.Add(prefix1); titles.Add(prefix2); var altTitleNode1 = nodeChapter.Descendants().LastOrDefault(p => !p.HasChildNodes && p.InnerText.Trim().ToLower().StartsWith(prefix1) && p.InnerText.Trim().Length - prefix1.Length > 2); var altTitleNode2 = nodeChapter.Descendants().LastOrDefault(p => !p.HasChildNodes && p.InnerText.Trim().ToLower().StartsWith(prefix2) && p.InnerText.Trim().Length - prefix2.Length > 2); var altTitleNode3 = nodeChapter.Descendants().FirstOrDefault(p 
=> p.HasChildNodes && p.InnerText.Trim().ToLower().StartsWith(prefix1) && p.InnerText.Trim().Length - prefix1.Length > 2 && !(p.InnerHtml.Contains("

") || p.InnerHtml.Contains(" p.HasChildNodes && p.InnerText.Trim().ToLower().StartsWith(prefix2) && p.InnerText.Trim().Length - prefix2.Length > 2 && !(p.InnerHtml.Contains("

") || p.InnerHtml.Contains(" title node removed"); } else if (altTitleNode4 != null) { var newtitle = TitleFmt(altTitleNode4.InnerText.Trim().Substring(prefix2.Length)); titles.Add(newtitle); curr.title = newtitle; titles.Add(prefix1 + newtitle); titles.Add(prefix2 + newtitle); titles.Add(prefix1 + " - " + newtitle); titles.Add(prefix2 + " - " + newtitle); altTitleNode4.Remove(); prt(" > title node removed"); } else if (suffix.Length > 2) { curr.title = suffix; titles.Add(suffix); } else { prt(" [!!] Warning cannot parse title"); } if (suffix.Length > 2) { curr.title = baseTitle; titles.Add(baseTitle); } } if (curr.title.ToLower().StartsWith(ACTIVE_BOOK.Foldername.ToLower())) { var tit_alt = curr.title.Substring(ACTIVE_BOOK.Foldername.Length); while (tit_alt.Length > 0 && new[] {' ', '\t', '-', ',', ':', '.', '_', ';'}.Contains(tit_alt[0])) tit_alt = tit_alt.Substring(1); tit_alt = tit_alt.Trim(); if (tit_alt.Length>2) curr.title = tit_alt; } #endregion curr.sourcecode = "\r\n\r\n\r\n" + nodeContent.OuterHtml + "\r\n\r\n\r\n"; if (backBuffer.Any() && backBuffer.First().title == curr.title) { prt("[!] Book loop found - skipping entry"); return ProcessResult.ReachedEnd; // prevent book II loop } curr.isEpilogue = (titles.Any(t => t.ToLower().Contains("epilogue") || t.ToLower().Contains("epilog"))) && (ACTIVE_BOOK.SiteType!=Site.Royalroad); curr.isPrologue = (titles.Any(t => t.ToLower().Contains("prologue") || t.ToLower().Contains("prolog"))); curr.isBonus = (titles.Any(t => t.ToLower().Trim().StartsWith("bonus"))); if (ACTIVE_BOOK == APGTE7) curr.isEpilogue = titles.Any(t => t.ToLower() == "epilogue II"); if (backBuffer.Skip(1).Any(bb => bb.isEpilogue) && !curr.isBonus) { prt("[!] 
Epilogue found - skipping entry"); return ProcessResult.ReachedEnd; // Book finished - it was the Epilogue } prt(curr.title + " (" + curr.url + ")"); #region Next string[] title_spec_words = new string[] {"prologue", "epilogue", "bonus" }; if (backBuffer.Where(b => !b.isSpecial).Count() > 4 && backBuffer.Where(b => !b.isSpecial).Select(bb => { var r = REX_NUMSTART.Match(bb.title); return r.Success ? r.Groups["n"].Value : null; }).Distinct().Count() == 1 && REX_NUMSTART.Match(backBuffer.Where(b => !b.isSpecial).First().title).Success && REX_NUMSTART.Match(curr.title).Success && REX_NUMSTART.Match(backBuffer.Where(b => !b.isSpecial).First().title).Groups["n"].Value != REX_NUMSTART.Match(curr.title).Groups["n"].Value) { prt("[!] Book jump found - skipping entry"); return ProcessResult.ReachedEnd; } var next = nodeContent.SelectSingleNode(@"//div[@class='entry-content']//a[normalize-space(@title)='Next Chapter' or normalize-space(text())='Next Chapter']"); if (next == null) next = nodeContent.Descendants() .Where(p => p.Name.ToLower() == "a") .Where(p => Striptease(p) == "next chapter" || Striptease(p) == "next") .Where(p => p.Attributes.Contains("href")) .FirstOrDefault(); var x = nodeContent.Descendants().Where(p => p.Name.ToLower() == "a"); if (next == null) next = nodeNav.Descendants() .Where(p => p.Name.ToLower() == "a") .Where(p => p.Attributes.Any(q => q.Name == "rel" && q.Value == "next")) .FirstOrDefault(); if (next != null) { var next_url = next.Attributes["href"].Value.Trim(); if (next_url == "." || next_url == "/" || next_url == "./") { next=null; } else { if (next_url.StartsWith("//")) next_url = "http:" + next_url; if (next_url.StartsWith("/")) next_url = combineAuthority(curr.url, next_url); if (!next_url.Contains("://") && ACTIVE_BOOK.SiteType == Site.WW) next_url = CombineUri(curr.url, next_url); curr.next = next_url; if (!backBuffer.Any(p => p.url.ToLower() == next_url.ToLower())) { forwardQueue_next = next_url; } } } if (next == null) prt(" > (!) 
No next URL found"); #endregion #region Chapter marker var cpMarkerIdentities = new List { "previousnext", "previouschapternextchapter", "firstnext", "firstchapternextchapter", "firstchapter", "previouslast", "previouschapterlastchapter", "previouschapter", "nextchapter", "lastchapter", "first", "previous", "next", "last" }; foreach (var node in nodeChapter.ChildNodes.Where(p =>p.InnerText.Trim().Length < 24 && (p.InnerText.ToLower().Contains("previous chapter") || p.InnerText.ToLower().Contains("next chapter") || p.InnerText.ToLower().Contains("last chapter") || p.InnerText.ToLower().Contains("first chapter"))).ToList()) { nodeChapter.RemoveChild(node); prt(" > Chapter marker removed"); } foreach (var node in nodeChapter.ChildNodes.Where(p => cpMarkerIdentities.Any(m => NakedIdentity(p) == m)).ToList()) { nodeChapter.RemoveChild(node); prt(" > Chapter marker removed"); } var alist = nodeChapter.SelectNodes("//a"); if (alist != null) { foreach (var node in alist.Where(p => cpMarkerIdentities.Any(m => NakedIdentity(p) == m)).ToList()) { node.Remove(); prt(" > Chapter marker removed"); } } var plist = nodeChapter.SelectNodes("//p"); if (plist != null) { foreach (var node in plist.Where(p => cpMarkerIdentities.Any(m => NakedIdentity(p) == m)).ToList()) { node.Remove(); prt(" > Chapter marker removed"); } } #endregion #region Share Div var shareNodes = nodeChapter.SelectNodes(@"div[@id='jp-post-flair' or contains(@class, 'sharedaddy') or contains(@class, 'sharing') or contains(@class, 'social')]"); if (shareNodes != null) { foreach (var node in shareNodes) { if (nodeChapter.ChildNodes.Contains(node)) { nodeChapter.RemoveChild(node); prt(" > share div removed"); } else { prt(" > share div cannot be removed - skipping"); } } } #endregion #region Meta Div var metaNodes = nodeChapter.SelectNodes(@"div[contains(@class, 'entry-meta')]"); if (metaNodes != null) { foreach (var node in metaNodes) { if (nodeChapter.ChildNodes.Contains(node)) { nodeChapter.RemoveChild(node); 
prt(" > meta div removed"); } else { prt(" > meta div cannot be removed - skipping"); } } } #endregion #region Ad Blocking var adNodes1 = nodeChapter.SelectNodes(@"div[contains(@class,'wpcnt')]/div[contains(@class,'wpa')]/.."); if (adNodes1 != null) { foreach (var node in adNodes1) { if (nodeChapter.ChildNodes.Contains(node)) { nodeChapter.RemoveChild(node); prt(" > ad div removed"); } else { prt(" > ad div cannot be removed - skipping"); } } } var adNodes2 = nodeChapter.SelectNodes(@"div[contains(@class,'code-block') or contains(@class,'ai-desktop-tablet')]/script/.."); if (adNodes2 != null) { foreach (var node in adNodes2) { if (nodeChapter.ChildNodes.Contains(node)) { nodeChapter.RemoveChild(node); prt(" > ad div removed"); } else { prt(" > ad div cannot be removed - skipping"); } } } var adNodes3 = nodeChapter.SelectNodes(@"div[contains(@class,'code-block')]"); if (adNodes3 != null) { foreach (var node in adNodes3.Where(n => Striptease(n) == "advertisement")) { if (nodeChapter.ChildNodes.Contains(node)) { nodeChapter.RemoveChild(node); prt(" > ad div removed"); } else { prt(" > ad div cannot be removed - skipping"); } } } #endregion #region Title Paragraphs var titleNodes1 = nodeChapter.SelectNodes(@"p"); if (titleNodes1 != null && titleNodes1.Any() && titles.Any(t => t.ToLower() == TitleFmt(titleNodes1.First().InnerText).ToLower()) && nodeChapter.ChildNodes.Contains(titleNodes1.First())) { nodeChapter.RemoveChild(titleNodes1.First()); prt(" > title node removed"); } for (int hval = 1; hval <= 5; hval++) { var titleNodes2 = nodeChapter.SelectNodes(@"h" + hval); if (titleNodes2 != null) { foreach (var node in titleNodes2.Where(node => titles.Any(t => t.ToLower() == TitleFmt(node.InnerText).ToLower()))) { if (nodeChapter.ChildNodes.Contains(node)) { nodeChapter.RemoveChild(node); prt(" > title node removed"); } } } } var titleNodes3 = nodeChapter.SelectNodes(@"//u"); if (titleNodes3 != null && titleNodes3.Any()) { var xTitleNodes3 = titleNodes3.Where(n => 
titles.Any(t => CouldBeTitle(n, t))); foreach (var t in xTitleNodes3) { t.Remove(); prt(" > title node removed"); } } var titleNodes4 = nodeChapter.SelectNodes(@"//span"); if (titleNodes4 != null && titleNodes4.Any()) { var xTitleNodes4 = titleNodes4.Where(n => titles.Any(t => CouldBeTitle(n, t))); foreach (var t in xTitleNodes4) { t.Remove(); prt(" > title node removed"); } } var titleNodes5 = nodeChapter.SelectNodes(@"//strong"); if (titleNodes5 != null && titleNodes5.Any()) { var xTitleNodes5 = titleNodes5.Where(n => titles.Any(t => CouldBeTitle(n, t))); foreach (var t in xTitleNodes5) { t.Remove(); prt(" > title node removed"); } } #endregion #region Remove


's while (nodeChapter.ChildNodes.Where(p => p.NodeType == HtmlNodeType.Element).First().Name.ToLower() == "hr") { nodeChapter.RemoveChild(nodeChapter.ChildNodes.Where(p => p.NodeType == HtmlNodeType.Element).First()); prt(" > header hr removed"); } while (nodeChapter.ChildNodes.Where(p => p.NodeType == HtmlNodeType.Element).Last().Name.ToLower() == "hr") { nodeChapter.RemoveChild(nodeChapter.ChildNodes.Where(p => p.NodeType == HtmlNodeType.Element).Last()); prt(" > footer hr removed"); } #endregion #region Other (Author's Node) foreach (var node in nodeChapter.ChildNodes.Where(p => p.InnerText.ToLower().Contains("note from the author")).ToList()) { nodeChapter.RemoveChild(node); prt(" > authors note removed"); } #endregion var chap_html = nodeChapter.InnerHtml.Trim(); #region Fix raw
// KOReader doesn't like
chap_html = chap_html.Replace("
", "
"); #endregion curr.chapter = chap_html; if (curr.title.ToLower().StartsWith("not a chapter - ")) return ProcessResult.SkipChapter; return ProcessResult.SuccessNormal; }

/// <summary>
/// Joins the scheme-and-authority part of <paramref name="url"/> with
/// <paramref name="suffix"/>, separated by a single '/'.
/// </summary>
string combineAuthority(string url, string suffix)
{
    var authority = new Uri(url).GetLeftPart(UriPartial.Authority);
    if (!authority.EndsWith("/")) authority += "/";

    // TrimStart is a no-op when there is no leading slash.
    return authority + suffix.TrimStart('/');
}

/// <summary>
/// Drops everything after the last '/' of <paramref name="uri1"/> and appends
/// <paramref name="uri2"/>, separated by a single '/'.
/// </summary>
string CombineUri(string uri1, string uri2)
{
    if (uri1.Contains("/")) uri1 = uri1.Substring(0, uri1.LastIndexOf("/"));

    var left = uri1.TrimEnd('/');
    var right = uri2.TrimStart('/');
    return string.Format("{0}/{1}", left, right);
}

/// <summary>
/// Writes the query result, the page source and the wrapped chapter text of
/// <paramref name="curr"/> into QUERY_FOLDER, HTML_FOLDER and EPUB_FOLDER.
/// </summary>
void OutputChapter(Chapter curr, int index)
{
    // Same file name for the query dump and the source dump.
    var dumpName = string.Format("{0:000}", index) + "_" + Filenamify(curr.title) + ".html";

    File.WriteAllText(QUERY_FOLDER + dumpName, curr.queryResult);
    File.WriteAllText(HTML_FOLDER + dumpName, curr.sourcecode, Encoding.UTF8);

    // NOTE(review): several literals below are empty or contain raw line breaks in this
    // copy of the file; markup may have been lost - confirm against the original script.
    StringBuilder b = new StringBuilder();
    {
        b.AppendLine("");
        b.AppendLine("");
        b.AppendLine("");
        b.AppendLine();
        b.AppendLine("

" + HtmlEntity.Entitize(curr.title) + "

");
        b.AppendLine();
        b.AppendLine(curr.chapter);
        b.AppendLine("");
        b.AppendLine("");
    }

    var epubName = Filenamify(string.Format("{0:000}_{1}.html", index, curr.title));
    File.WriteAllText(Path.Combine(EPUB_FOLDER, epubName), b.ToString(), Encoding.UTF8);
}

/// <summary>
/// Reduces <paramref name="v"/> to ASCII letters, digits, space and the characters
/// <c>. - * _ ,</c>; a non-breaking space (char 160) is treated as a space.
/// When <paramref name="repl"/> is true, spaces become underscores.
/// </summary>
static string Filenamify(string v, bool repl = false)
{
    // NOTE(review): '*' is kept although it is not a legal Windows file-name character;
    // confirm whether titles can contain it.
    Func<char, bool> allowed = p =>
        (p >= '0' && p <= '9') ||
        (p >= 'A' && p <= 'Z') ||
        (p >= 'a' && p <= 'z') ||
        p == ' ' || p == '.' || p == '-' || p == '*' || p == '_' || p == ',';

    var s = new String(v.Replace((char)160, ' ').ToCharArray().Where(allowed).ToArray());
    if (repl) s = s.Replace(' ', '_');
    return s;
}

/// <summary>
/// Normalizes a raw title: decodes entities, maps the en dash to '-', maps char 160
/// to a space, strips surrounding whitespace and the characters <c>- : _ #</c>,
/// removes a leading "tde" (case-insensitive) and upper-cases the first character
/// of results with at least two characters.
/// </summary>
string TitleFmt(string raw)
{
    raw = HtmlEntity.DeEntitize(raw);
    raw = raw.Replace('–', '-');
    raw = raw.Replace((char)160, ' ');

    raw = raw.Trim().Trim('-', ':', '_', '#').Trim();
    if (raw.ToLower().StartsWith("tde")) raw = raw.Substring(3);
    raw = raw.Trim().Trim('-', ':', '_', '#').Trim();

    if (raw.Length >= 2) raw = char.ToUpper(raw[0]) + raw.Substring(1);
    return raw;
}

/// <summary>
/// Returns the stripped text of <paramref name="raw"/> (see the string overload) after
/// removing script and meta elements. Removal is done on a copy of the node.
/// </summary>
string Striptease(HtmlNode raw)
{
    foreach (var tag in new[] { "script", "meta" })
    {
        var rm = raw.SelectNodes(@"//" + tag);
        if (rm != null && rm.Any())
        {
            // Work on a copy so the caller's node is not modified.
            var copy = HtmlNode.CreateNode($"<{raw.Name}>");
            copy.CopyFrom(raw);
            raw = copy;

            rm = raw.SelectNodes(@"//" + tag);
            if (rm != null) foreach (var e in rm) e.Remove();
        }
    }

    return Striptease(HtmlEntity.DeEntitize(raw.InnerText));
}

/// <summary>
/// Lower-cases <paramref name="raw"/>, turns every whitespace character into a plain
/// space, drops everything that is neither letter, digit nor whitespace, and trims.
/// </summary>
string Striptease(string raw)
{
    var kept = raw
        .ToCharArray()
        .Select(c => char.IsWhiteSpace(c) ? ' ' : c)
        .Where(c => char.IsLetterOrDigit(c) ||char.IsWhiteSpace(c))
        .Select(c => char.ToLower(c));

    var r = string.Join(string.Empty, kept).Trim();
    return r;
}

/// <summary>
/// Returns the lower-cased letters and digits of the node's inner text.
/// </summary>
string NakedIdentity(HtmlNode raw)
{
    // NOTE(review): the Replace arguments below look damaged in this copy of the file
    // (one of them is not a well-formed literal) - confirm against the original script.
    return string.Join(string.Empty, raw
        .InnerText
        .ToLower()
        .Replace(">", "")
        .Replace("<", "")
        .Replace("&", "")
        .Replace(""", "")
        .Replace(" ", "")
        .ToCharArray()
        .Where(c => char.IsLetterOrDigit(c))
        .Select(c => char.ToLower(c))).Trim()
        .ToLower();
}

/// <summary>
/// True when the stripped text of <paramref name="n"/> equals the stripped
/// <paramref name="title"/> after the normalization below.
/// </summary>
bool CouldBeTitle(HtmlNode n, string title)
{
    var t0 = Striptease(n);
    var t1 = Striptease(title);

    t0 = t0.ToLower();
    t1 = t1.ToLower();

    t0 = t0.Replace(":", "").Replace("-", "").Replace("(", "").Replace(")", "");
    t1 = t1.Replace(":", "").Replace("-", "").Replace("(", "").Replace(")", "");

    // NOTE(review): runs of two or more whitespace characters are removed entirely,
    // not collapsed to one space - confirm this is intended.
    t0 = Regex.Replace(t0, @"\s\s+", "");
    t1 = Regex.Replace(t1, @"\s\s+", "");

    return t0 == t1;
}

/// <summary>
/// Builds the EPUB archive at ZIP_FILE_STASH (mimetype entry first), then copies it to
/// EPUB_FILE_STASH and EPUB_FILE_OUT. Existing stash files are deleted first.
/// </summary>
void WriteEpub(List chapters)
{
    if (File.Exists(EPUB_FILE_STASH)) File.Delete(EPUB_FILE_STASH);
    if (File.Exists(ZIP_FILE_STASH)) File.Delete(ZIP_FILE_STASH);

    Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

    using (FileStream fs = File.Open(ZIP_FILE_STASH, FileMode.Create, FileAccess.ReadWrite))
    {
        using (var zipbook = new ZipOutputStream(fs))
        {
            WritePubString(zipbook, @"mimetype", GetEpubMimetype());
            WritePubString(zipbook, @"META-INF\container.xml", GetEpubContainerXML());
            WritePubString(zipbook, @"OEBPS\content.opf", GetEpubContentOPF(chapters));
            WritePubString(zipbook, @"OEBPS\toc.ncx", GetEpubTOC(chapters));

            for (int i = 0; i < chapters.Count; i++)
            {
                var entryName = string.Format(@"OEBPS\Text\{0:000}_{1}.html", i + 1, Filenamify(chapters[i].title, true));
                WritePubString(zipbook, entryName, GetEpubChapterFile(chapters[i], i));
            }
        }
    }

    File.Copy(ZIP_FILE_STASH, EPUB_FILE_STASH);
    File.Copy(EPUB_FILE_STASH, EPUB_FILE_OUT, true);
}

/// <summary>
/// Runs <c>ebook-convert</c> on EPUB_FILE_STASH to produce MOBI_FILE_STASH and copies the
/// result to MOBI_FILE_OUT. Throws when the tool exits with a non-zero code.
/// </summary>
void GenerateMobi()
{
    if (File.Exists(MOBI_FILE_STASH)) File.Delete(MOBI_FILE_STASH);

    "Running ebook-convert for MOBI output".Dump();

    var pout = ProcessHelper.ProcExecute("ebook-convert", $"\"{EPUB_FILE_STASH}\" \"{MOBI_FILE_STASH}\" --use-auto-toc --level1-toc=\"//h:h1\" --max-toc-links=0 --toc-threshold=9999");

    $"ebook-convert returned: {pout.ExitCode}".Dump();

    if (pout.ExitCode != 0) throw new Exception(pout.ExitCode + "\n\n\n\n" + pout.StdCombined);

    File.Copy(MOBI_FILE_STASH, MOBI_FILE_OUT, true);
}

/// <summary>
/// Adds entry <paramref name="n"/> with content <paramref name="c"/> to the archive,
/// uncompressed, encoded with <paramref name="e"/> (UTF-8 when null).
/// </summary>
void WritePubString(ZipOutputStream z, string n, string c, Encoding e = null)
{
    e = e ?? Encoding.UTF8;

    var f = z.PutNextEntry(n);
    f.CompressionLevel = Ionic.Zlib.CompressionLevel.None;

    byte[] buffer = e.GetBytes(c);
    z.Write(buffer, 0, buffer.Length);
}

/// <summary>Content of the EPUB <c>mimetype</c> entry.</summary>
string GetEpubMimetype()
{
    return "application/epub+zip";
}

/// <summary>
/// Builds META-INF/container.xml, pointing at <c>OEBPS/content.opf</c>.
/// </summary>
string GetEpubContainerXML()
{
    XNamespace container = "urn:oasis:names:tc:opendocument:xmlns:container";

    var doc = new XDocument(new XDeclaration("1.0", "UTF-8", null),
        new XElement(container + "container",
            new XAttribute("version", "1.0"),
            new XElement(container + "rootfiles",
                new XElement(container + "rootfile",
                    new XAttribute("full-path", "OEBPS/content.opf"),
                    new XAttribute("media-type", "application/oebps-package+xml")))));

    using (Utf8StringWriter writer = new Utf8StringWriter())
    {
        doc.Save(writer);

        var r = writer.ToString();
        // Force the upper-case spelling of the encoding name in the XML declaration.
        r = r.Replace("encoding=\"utf-8\"", "encoding=\"UTF-8\"");
        return r.Trim() + "\r\n";
    }
}

/// <summary>
/// Builds OEBPS/content.opf: metadata from ACTIVE_BOOK, one manifest item and one spine
/// itemref per chapter, plus the toc.ncx manifest item and an empty guide.
/// </summary>
string GetEpubContentOPF(List chapters)
{
    XNamespace dc = "http://purl.org/dc/elements/1.1/";
    XNamespace opf = "http://www.idpf.org/2007/opf";

    var doc = new XDocument(new XDeclaration("1.0", "UTF-8", null));

    var package = new XElement(opf + "package",
        new XAttribute("unique-identifier", "BookId"),
        new XAttribute("version", "2.0"));
    doc.Add(package);

    var meta = new XElement(opf + "metadata",
        new XAttribute(XNamespace.Xmlns + "dc", dc),
        new XAttribute(XNamespace.Xmlns + "opf", opf),
        new XElement(dc + "title", ACTIVE_BOOK.Title),
        new XElement(dc + "creator", ACTIVE_BOOK.Author),
        new XElement(dc + "identifier",
            new XAttribute("id", "BookId"),
            new XAttribute(opf + "scheme", "UUID"),
            "urn:uuid:" + ACTIVE_BOOK.ID_OPF.ToString("D")),
        new XElement(dc + "date",
            new XAttribute(opf + "event", "publication"),
            ACTIVE_BOOK.Release.ToString("yyyy'-'MM'-'dd")),
        new XElement(dc + "date",
            new XAttribute(opf + "event", "modification"),
            DateTime.Now.ToString("yyyy'-'MM'-'dd")),
        new XElement(dc + "date",
            new XAttribute(opf + "event", "creation"),
            DateTime.Now.ToString("yyyy'-'MM'-'dd")),
        new XElement(dc + "language", ACTIVE_BOOK.Language),
        new XElement(dc + "identifier",
            new XAttribute(opf + "scheme", "UUID"),
            ACTIVE_BOOK.ID_CAL.ToString("D")),
        new XElement(opf + "meta",
            new XAttribute("content", "1.0"),
            new XAttribute("name", "Wordpress_eBook_scraper_version")),
        new XElement(opf + "meta",
            new XAttribute("content", DateTime.Now.ToString("yyyy-MM-dd")),
            new XAttribute("name", "Wordpress_eBook_scraper_creation_time")));

    if (ACTIVE_BOOK.Series != null)
    {
        meta.Add(new XElement(opf + "meta",
            new XAttribute("content", ACTIVE_BOOK.Series),
            new XAttribute("name", "calibre:series")));
        meta.Add(new XElement(opf + "meta",
            new XAttribute("content", string.Format("{0}.0", ACTIVE_BOOK.SeriesIndex)),
            new XAttribute("name", "calibre:series_index")));
    }

    package.Add(meta);

    var manifest = new XElement(opf + "manifest");
    for(int i = 0; i < chapters.Count; i++)
    {
        manifest.Add(new XElement(opf + "item",
            new XAttribute("href", string.Format("Text/{0:000}_{1}.html", i+1, Uri.EscapeUriString(Filenamify(chapters[i].title, true)))),
            new XAttribute("id", string.Format("x{0:000}_{1}.html", i+1, Filenamify(chapters[i].title, true))),
            new XAttribute("media-type", "application/xhtml+xml")));
    }
    manifest.Add(new XElement(opf + "item",
        new XAttribute("href", "toc.ncx"),
        new XAttribute("id", "ncx"),
        new XAttribute("media-type", "application/x-dtbncx+xml")));
    package.Add(manifest);

    var spine = new XElement(opf + "spine", new XAttribute("toc", "ncx"));
    for (int i = 0; i < chapters.Count; i++)
    {
        // idref must match the manifest item id built above.
        spine.Add(new XElement(opf + "itemref",
            new XAttribute("idref", string.Format("x{0:000}_{1}.html", i+1, Filenamify(chapters[i].title, true)))));
    }
    package.Add(spine);

    package.Add(new XElement(opf + "guide"));

    using (Utf8StringWriter writer = new Utf8StringWriter())
    {
        doc.Save(writer);
        return writer.ToString();
    }
}

/// <summary>
/// Builds OEBPS/toc.ncx with one navPoint per chapter, in chapter order.
/// </summary>
string GetEpubTOC(List chapters)
{
    // All NCX elements belong to the DAISY NCX namespace. The previous code built them
    // in the OPF namespace because the two namespace constants were swapped.
    XNamespace ncx = "http://www.daisy.org/z3986/2005/ncx/";

    var doc = new XDocument(
        new XDeclaration("1.0", "UTF-8", null),
        new XDocumentType("ncx", "-//NISO//DTD ncx 2005-1//EN", "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd", null));

    var root = new XElement(ncx + "ncx",
        new XAttribute("version", "2005-1"),
        new XElement(ncx + "head",
            new XElement(ncx + "meta",
                // Same identifier string as the BookId identifier written by GetEpubContentOPF.
                new XAttribute("content", "urn:uuid:" + ACTIVE_BOOK.ID_OPF.ToString("D")),
                new XAttribute("name", "dtb:uid")),
            new XElement(ncx + "meta",
                new XAttribute("content", 1),
                new XAttribute("name", "dtb:depth")),
            new XElement(ncx + "meta",
                new XAttribute("content", 0),
                new XAttribute("name", "dtb:totalPageCount")),
            new XElement(ncx + "meta",
                new XAttribute("content", 0),
                new XAttribute("name", "dtb:maxPageNumber"))));
    doc.Add(root);

    root.Add(new XElement(ncx + "docTitle", new XElement(ncx + "text", "Unknown")));

    var nav = new XElement(ncx + "navMap");
    for (int i = 0; i < chapters.Count; i++)
    {
        nav.Add(new XElement(ncx + "navPoint",
            new XAttribute("id", "navPoint-" + (i + 1)),
            new XAttribute("playOrder", i + 1),
            new XElement(ncx + "navLabel", new XElement(ncx + "text", chapters[i].title)),
            new XElement(ncx + "content",
                new XAttribute("src", string.Format("Text/{0:000}_{1}.html", i+1, Filenamify(chapters[i].title, true))))));
    }
    root.Add(nav);

    using (Utf8StringWriter writer = new Utf8StringWriter())
    {
        doc.Save(writer);
        return writer.ToString();
    }
}

/// <summary>
/// Builds the text of one chapter file: the entitized title followed by the chapter body.
/// <paramref name="idx"/> is not used.
/// </summary>
string GetEpubChapterFile(Chapter chapter, int idx)
{
    // NOTE(review): several literals below are empty or contain raw line breaks in this
    // copy of the file; markup may have been lost - confirm against the original script.
    StringBuilder xml = new StringBuilder();

    xml.AppendLine(@"");
    xml.AppendLine(@" ");
    xml.AppendLine(@"");
    xml.AppendLine(@"");
    xml.AppendLine("" + HtmlEntity.Entitize(chapter.title) + "");
    xml.AppendLine(@"");
    xml.AppendLine(@"");
    xml.AppendLine("

" + HtmlEntity.Entitize(chapter.title) + "

");
    xml.AppendLine(chapter.chapter);
    xml.AppendLine(@"");
    xml.AppendLine(@"");

    return xml.ToString();
}

/// <summary>Captured result of one external process run.</summary>
public struct ProcessOutput
{
    public readonly string Command;
    public readonly int ExitCode;
    public readonly string StdOut;
    public readonly string StdErr;
    public readonly string StdCombined;

    public ProcessOutput(string cmd, int ex, string stdout, string stderr, string stdcom)
    {
        Command = cmd;
        ExitCode = ex;
        StdOut = stdout;
        StdErr = stderr;
        StdCombined = stdcom;
    }

    public override string ToString() => $"{Command}\n=> {ExitCode}\n\n[stdout]\n{StdOut}\n\n[stderr]\n{StdErr}";
}

/// <summary>Helper for running an external program and capturing its output.</summary>
public static class ProcessHelper
{
    /// <summary>
    /// Starts <paramref name="command"/> with <paramref name="arguments"/> without a window,
    /// waits for it to exit and returns exit code, stdout, stderr and both interleaved.
    /// Lines are joined with '\n'.
    /// </summary>
    public static ProcessOutput ProcExecute(string command, string arguments, string workingDirectory = null)
    {
        // The process handle is released once the results have been read.
        using (var process = new Process
        {
            StartInfo =
            {
                FileName = command,
                Arguments = arguments,
                WorkingDirectory = workingDirectory ?? string.Empty,
                UseShellExecute = false,
                RedirectStandardOutput = true,
                RedirectStandardError = true,
                CreateNoWindow = true,
                ErrorDialog = false,
            }
        })
        {
            var builderOut = new StringBuilder();
            var builderErr = new StringBuilder();
            var builderBoth = new StringBuilder();

            // Both handlers append to builderBoth and may run on different threads,
            // so every append is done under one lock.
            var gate = new object();

            process.OutputDataReceived += (sender, args) =>
            {
                if (args.Data == null) return;

                lock (gate)
                {
                    AppendOutputLine(builderOut, args.Data);
                    AppendOutputLine(builderBoth, args.Data);
                }
            };

            process.ErrorDataReceived += (sender, args) =>
            {
                if (args.Data == null) return;

                lock (gate)
                {
                    AppendOutputLine(builderErr, args.Data);
                    AppendOutputLine(builderBoth, args.Data);
                }
            };

            process.Start();

            process.BeginOutputReadLine();
            process.BeginErrorReadLine();

            process.WaitForExit();

            lock (gate)
            {
                return new ProcessOutput($"{command} {arguments.Replace("\r", "\\r").Replace("\n", "\\n")}", process.ExitCode, builderOut.ToString(), builderErr.ToString(), builderBoth.ToString());
            }
        }
    }

    // Appends a line, preceded by '\n' unless the builder is still empty.
    private static void AppendOutputLine(StringBuilder target, string line)
    {
        if (target.Length == 0) target.Append(line);
        else target.Append("\n" + line);
    }
}

public static class
HTMLToText
{
    // Converts an HTML node tree into plain text: block-level elements produce line
    // breaks, whitespace runs inside text are collapsed, and comments as well as
    // script/style content are left out.

    // Matches a self-closing link, style or script tag.
    private static Regex REX_TAG1 = new Regex("<\\s*(link|style|script)[^>]*?/>", RegexOptions.Compiled);

    // Matches an opening link/style/script tag, content without angle brackets, and the
    // closing tag of the same name.
    private static Regex REX_TAG2 = new Regex("<\\s*(link|style|script)[^>]*?>[^<>]*?<\\/\\s*\\1\\s*>", RegexOptions.Compiled);

    // Whitespace state carried along while walking the tree.
    private class PreceedingDomTextInfo
    {
        public PreceedingDomTextInfo(BoolWrapper isFirstTextOfDocWritten)
        {
            IsFirstTextOfDocWritten = isFirstTextOfDocWritten;
        }

        public bool WritePrecedingWhiteSpace { get; set; }
        public bool LastCharWasSpace { get; set; }
        // Passed on to the instances created for non-inline elements, so they share one flag object.
        public readonly BoolWrapper IsFirstTextOfDocWritten;
        public int ListIndex { get; set; }
    }

    // Reference wrapper around a bool, implicitly convertible in both directions.
    private class BoolWrapper
    {
        public BoolWrapper() { }

        public bool Value { get; set; }

        public static implicit operator bool(BoolWrapper boolWrapper)
        {
            return boolWrapper.Value;
        }

        public static implicit operator BoolWrapper(bool boolWrapper)
        {
            return new BoolWrapper { Value = boolWrapper };
        }
    }

    // Loads the HTML file at 'path' and returns its text.
    public static string Convert(string path)
    {
        HtmlDocument doc = new HtmlDocument();
        doc.Load(path);
        return ConvertDoc(doc);
    }

    // Replaces every REX_TAG1 / REX_TAG2 match in 'html' with a space, parses the rest
    // and returns its text.
    public static string ConvertHtml(string html)
    {
        HtmlDocument doc = new HtmlDocument();

        html = REX_TAG1.Replace(html, " ");
        html = REX_TAG2.Replace(html, " ");

        doc.LoadHtml(html);
        return ConvertDoc(doc);
    }

    // Returns the text of an already parsed document.
    public static string ConvertDoc(HtmlDocument doc)
    {
        using (StringWriter sw = new StringWriter())
        {
            ConvertTo(doc.DocumentNode, sw);
            sw.Flush();
            return sw.ToString();
        }
    }

    // Converts every child of 'node' in order, sharing the same whitespace state.
    private static void ConvertContentTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
    {
        foreach (HtmlNode subnode in node.ChildNodes)
        {
            ConvertTo(subnode, outText, textInfo);
        }
    }

    // Writes the text of 'node' to 'outText', starting with fresh whitespace state.
    public static void ConvertTo(HtmlNode node, TextWriter outText)
    {
        ConvertTo(node, outText, new PreceedingDomTextInfo(false));
    }

    // Recursive worker: writes 'node' to 'outText' and updates 'textInfo' as text is emitted.
    private static void ConvertTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
    {
        string html;
        switch (node.NodeType)
        {
            case HtmlNodeType.Comment:
                // don't output comments
                break;

            case HtmlNodeType.Document:
                ConvertContentTo(node, outText, textInfo);
                break;

            case HtmlNodeType.Text:
                // script and style must not be output
                string parentName = node.ParentNode.Name;
                if ((parentName == "script") || (parentName == "style"))
                {
                    break;
                }

                // get text
                html = ((HtmlTextNode)node).Text;

                // is it in fact a special closing node output as text?
                if (HtmlNode.IsOverlappedClosingElement(html))
                    break;

                // check the text is meaningful and not a bunch of whitespaces
                if (html.Length == 0)
                    break;

                // NOTE(review): the prefix argument is an empty string in this copy of the
                // file, which makes StartsWith true for every input, so this case always
                // exits here. A markup prefix may have been lost - confirm against the
                // original script.
                if (html.Trim().ToLower().StartsWith(""))
                    break;

                // Leading whitespace is dropped unless preceding whitespace is enabled and
                // the last written character was not a space.
                if (!textInfo.WritePrecedingWhiteSpace || textInfo.LastCharWasSpace)
                {
                    html = html.TrimStart();
                    if (html.Length == 0) { break; }
                    textInfo.IsFirstTextOfDocWritten.Value = textInfo.WritePrecedingWhiteSpace = true;
                }

                // Collapse whitespace runs to one space and decode entities before writing.
                outText.Write(HtmlEntity.DeEntitize(Regex.Replace(html.TrimEnd(), @"\s{2,}", " ")));

                // Trailing whitespace was trimmed above; emit a single space in its place
                // and remember it.
                if (textInfo.LastCharWasSpace = char.IsWhiteSpace(html[html.Length - 1]))
                {
                    outText.Write(' ');
                }
                break;

            case HtmlNodeType.Element:
                string endElementString = null; // written after the element's children when set
                bool isInline;                  // inline elements keep the parent's whitespace state
                bool skip = false;              // true: children are not converted
                int listIndex = 0;
                switch (node.Name)
                {
                    case "nav":
                        skip = true;
                        isInline = false;
                        break;

                    case "body":
                    case "section":
                    case "article":
                    case "aside":
                    case "h1":
                    case "h2":
                    case "header":
                    case "footer":
                    case "address":
                    case "main":
                    case "div":
                    case "span":
                    case "p":
                        // stylistic - adjust as you tend to use
                        if (textInfo.IsFirstTextOfDocWritten)
                            outText.Write("\r\n");
                        endElementString = "\r\n";
                        isInline = false;
                        break;

                    case "br":
                        outText.Write("\r\n");
                        skip = true;
                        textInfo.WritePrecedingWhiteSpace = false;
                        isInline = true;
                        break;

                    case "a":
                        isInline = true;
                        break;

                    case "li":
                        isInline = false;
                        break;

                    case "ol":
                        listIndex = 1;
                        goto case "ul";

                    case "ul":
                        //not handling nested lists any differently at this stage - that is getting close to rendering problems
                        endElementString = "\r\n";
                        isInline = false;
                        break;

                    case "img":
                        //inline-block in reality
                        isInline = true;
                        break;

                    default:
                        isInline = true;
                        break;
                }

                // Non-inline elements get fresh whitespace state that shares the
                // first-text flag of the current state.
                if (!skip && node.HasChildNodes)
                {
                    ConvertContentTo(node, outText, isInline ? textInfo : new PreceedingDomTextInfo(textInfo.IsFirstTextOfDocWritten) { ListIndex = listIndex });
                }

                if (endElementString != null)
                {
                    outText.Write(endElementString);
                }
                break;
        }
    }
}