1
0

Quickpush-commit from 2023-11-12 22:25:52

This commit is contained in:
2023-11-12 22:25:52 +01:00
parent ff68d714ee
commit 4253113c39
3 changed files with 65 additions and 30 deletions

View File

@@ -354,7 +354,9 @@ public class Scraper
#region Base
var nodeContent = doc.DocumentNode.SelectSingleNode(@"//article[contains(@class,'post') and contains(@class ,'type-post')]");
HtmlNode nodeContent = null;
if (nodeContent == null && ACTIVE_BOOK.SiteType == Site.AO3) nodeContent = doc.DocumentNode.SelectSingleNode(@"//*[@id = 'workskin']");
if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//article[contains(@class,'post') and contains(@class ,'type-post')]");
if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//article[contains(@id,'post') and contains(@class ,'post')]");
if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@id,'post') and contains(@class ,'post')]");
if (nodeContent == null) nodeContent = doc.DocumentNode.SelectSingleNode(@"//div[contains(@class ,'chapter') and not(contains(@class ,'chapter-page'))]//div[contains(@class ,'portlet-body')]");
@@ -367,12 +369,16 @@ public class Scraper
var nodeChapter = nodeContent.SelectSingleNode(@"//div[contains(@class, 'entry-content') or contains(@class, 'postcontent') or contains(@class, 'post-content') or contains(@class, 'chapter-content')]");
if (nodeChapter == null && ACTIVE_BOOK.SiteType == Site.WW) nodeChapter = nodeContent.SelectSingleNode(@"//div[contains(@id, 'content')]");
if (nodeChapter == null && ACTIVE_BOOK.SiteType == Site.AO3) nodeChapter = nodeContent.SelectSingleNode(@"//*[@id = 'chapters']");
#endregion
#region Title
var titleNode = nodeContent.SelectSingleNode(@"//header[@class='entry-header']//h1[@class='entry-title']");
HtmlNode titleNode = null;
if (titleNode == null && ACTIVE_BOOK.SiteType == Site.AO3) titleNode = nodeContent.SelectSingleNode(@"//h3[contains(@class, 'title')]");
if (titleNode == null && ACTIVE_BOOK.SiteType == Site.AO3) titleNode = nodeContent.SelectSingleNode(@"//h2[contains(@class, 'title')]");
if (titleNode == null) titleNode = nodeContent.SelectSingleNode(@"//header[@class='entry-header']//h1[@class='entry-title']");
if (titleNode == null) titleNode = nodeContent.SelectSingleNode(@"//h1[contains(@class, 'posttitle')]");
if (titleNode == null) titleNode = nodeContent.SelectSingleNode(@"//div[contains(@class, 'fic-header')]//h1");
if (titleNode == null && ACTIVE_BOOK.SiteType == Site.WP) titleNode = nodeContent.SelectSingleNode(@"//div[contains(@class, 'entry-content')]//strong");
@@ -454,12 +460,6 @@ public class Scraper
{
prt(" [!!] Warning cannot parse title");
}
if (suffix.Length > 2)
{
curr.title = baseTitle;
titles.Add(baseTitle);
}
}
if (curr.title.ToLower().StartsWith(ACTIVE_BOOK.Foldername.ToLower())) {
@@ -479,7 +479,7 @@ public class Scraper
return ProcessResult.ReachedEnd; // prevent book II loop
}
curr.isEpilogue = (titles.Any(t => t.ToLower().Contains("epilogue") || t.ToLower().Contains("epilog"))) && (ACTIVE_BOOK.SiteType!=Site.Royalroad);
curr.isEpilogue = (titles.Any(t => t.ToLower().Contains("epilogue") || t.ToLower().Contains("epilog"))) && (ACTIVE_BOOK.SiteType!=Site.Royalroad) && (ACTIVE_BOOK!=Config.WI);
curr.isPrologue = (titles.Any(t => t.ToLower().Contains("prologue") || t.ToLower().Contains("prolog")));
curr.isBonus = (titles.Any(t => t.ToLower().Trim().StartsWith("bonus")));
@@ -533,8 +533,9 @@ public class Scraper
.Where(p => p.Attributes.Any(q => q.Name == "rel" && q.Value == "next"))
.FirstOrDefault();
if (next == null && ACTIVE_BOOK.Title == "Pale")
if (next == null && ACTIVE_BOOK == Config.PALE)
{
// Some chapters in Pale are missing the anchor tags on the next-chapter element.
var nextLS = Helper.RecursiveDescendants(doc.DocumentNode)
.Where(p => p.Name.ToLower() == "a")
.Where(p => p.Attributes.Any(q => q.Name == "rel" && q.Value == "next"))
@@ -543,7 +544,7 @@ public class Scraper
if (nextLS.Count == 1) next = nextLS.Single().FirstOrDefault();
}
if (next != null && next.Attributes["href"].Value.Trim() == "(https://palewebserial.wordpress.com/2023/10/10/end/") next = null; // do not process author-notes from Pale
if (next != null && next.Attributes["href"].Value.Trim() == "https://palewebserial.wordpress.com/2023/10/10/end/") next = null; // do not process author-notes from Pale
if (next != null)
{
@@ -623,6 +624,27 @@ public class Scraper
#endregion
#region A03 Stuff
// Removes elements with id 'work' that are direct children of nodeChapter;
// matches that are not direct children are left in place and only logged.
// NOTE(review): the region label spells "A03" with a zero, while the site is
// referred to as "AO3" elsewhere (Site.AO3) - consider renaming the region.
// NOTE(review): HtmlNode/SelectNodes look like HtmlAgilityPack, where an XPath
// starting with `//` is evaluated against the whole document rather than
// relative to nodeChapter - confirm. That would explain why the matches are
// filtered down to direct children below.
var ao3workNodes = nodeChapter.SelectNodes(@"//*[@id = 'work']");
// Skip the whole step when the query result is null.
if (ao3workNodes != null)
{
foreach (var node in ao3workNodes)
{
// RemoveChild is only attempted when the match is a direct child of nodeChapter.
if (nodeChapter.ChildNodes.Contains(node))
{
nodeChapter.RemoveChild(node);
prt(" > ao3 work-div removed");
}
else
{
// Not a direct child of nodeChapter: nothing is modified, only a log line is emitted.
prt(" > ao3 work-div cannot be removed - skipping");
}
}
}
#endregion
#region Share Div
var shareNodes = nodeChapter.SelectNodes(@"div[@id='jp-post-flair' or contains(@class, 'sharedaddy') or contains(@class, 'sharing') or contains(@class, 'social')]");

View File

@@ -5,8 +5,10 @@ public enum Site
Wordpress,
WuxiaWorld,
Royalroad,
ArchiveOfOurOwn,
WP = Wordpress,
WW = WuxiaWorld,
RR = Royalroad,
WP = Wordpress,
WW = WuxiaWorld,
RR = Royalroad,
AO3 = ArchiveOfOurOwn,
}