From a75c8ee05a4eb0dfc5c3afda33ebd8db81a59da0 Mon Sep 17 00:00:00 2001 From: n0099 Date: Fri, 13 Jan 2023 04:17:21 +0800 Subject: [PATCH] * opt out of entity splitting because it currently supports only `one-one` relationships instead of the desired `one-zeroOrOne` (optional entity), which forces inserting many placeholder records whose fields, except the primary key, are all NULL just to satisfy the `one-one` relation, so without re-introducing something like `IRevision.NullFieldsBitMask`, we can't distinguish between an original literal NULL value and these empty records: https://github.com/dotnet/efcore/issues/27974 https://github.com/dotnet/efcore/issues/29113 @ `TbmDbContext.OnModelCreating()` + abstract class `RevisionWithSplitting` as the base class of all classes implementing `IRevision` * insert all entities returned from `RevisionWithSplitting.GetSplitEntities()` into the DB @ `CommonInSavers.SavePostsOrUsers()` * change `abstract class BaseRevision` to `interface IRevision` to comply with single inheritance @ crawler --- crawler/src/Db/Revision/BaseRevision.cs | 10 ----- crawler/src/Db/Revision/IRevision.cs | 9 ++++ crawler/src/Db/Revision/ReplyRevision.cs | 41 +++++++++++++++--- .../src/Db/Revision/RevisionWithSplitting.cs | 27 ++++++++++++ crawler/src/Db/Revision/SubReplyRevision.cs | 30 +++++++++++-- crawler/src/Db/Revision/ThreadRevision.cs | 19 ++++++-- crawler/src/Db/Revision/UserRevision.cs | 43 ++++++++++++++++--- crawler/src/Db/TbmDbContext.cs | 42 ++++++------------ crawler/src/Tieba/Crawl/Saver/BaseSaver.cs | 2 +- .../src/Tieba/Crawl/Saver/CommonInSavers.cs | 9 +++- 10 files changed, 172 insertions(+), 60 deletions(-) delete mode 100644 crawler/src/Db/Revision/BaseRevision.cs create mode 100644 crawler/src/Db/Revision/IRevision.cs create mode 100644 crawler/src/Db/Revision/RevisionWithSplitting.cs diff --git a/crawler/src/Db/Revision/BaseRevision.cs b/crawler/src/Db/Revision/BaseRevision.cs deleted file mode 100644 index 13360241..00000000 --- 
a/crawler/src/Db/Revision/BaseRevision.cs +++ /dev/null @@ -1,10 +0,0 @@ -// ReSharper disable UnusedAutoPropertyAccessor.Global -// ReSharper disable PropertyCanBeMadeInitOnly.Global -namespace tbm.Crawler.Db.Revision -{ - public abstract class BaseRevision - { - public uint TakenAt { get; set; } - public ushort? NullFieldsBitMask { get; set; } - } -} diff --git a/crawler/src/Db/Revision/IRevision.cs b/crawler/src/Db/Revision/IRevision.cs new file mode 100644 index 00000000..f0996d2b --- /dev/null +++ b/crawler/src/Db/Revision/IRevision.cs @@ -0,0 +1,9 @@ +// ReSharper disable UnusedMember.Global +namespace tbm.Crawler.Db.Revision +{ + public interface IRevision + { + public uint TakenAt { get; set; } + public ushort? NullFieldsBitMask { get; set; } + } +} diff --git a/crawler/src/Db/Revision/ReplyRevision.cs b/crawler/src/Db/Revision/ReplyRevision.cs index 8c7249c6..791bdaa5 100644 --- a/crawler/src/Db/Revision/ReplyRevision.cs +++ b/crawler/src/Db/Revision/ReplyRevision.cs @@ -1,14 +1,45 @@ // ReSharper disable PropertyCanBeMadeInitOnly.Global namespace tbm.Crawler.Db.Revision { - public class ReplyRevision : BaseRevision + public class ReplyRevision : ReplyRevision.BaseReplyRevision { - public ulong Pid { get; set; } - public uint Floor { get; set; } - public uint SubReplyCount { get; set; } + public abstract class BaseReplyRevision : RevisionWithSplitting + { + public ulong Pid { get; set; } + } + [NotMapped] public uint Floor + { + get => GetSplitEntityValue(r => r.Floor); + set => SetSplitEntityValue(value, (r, v) => r.Floor = v, + () => new() {TakenAt = TakenAt, Pid = Pid, Floor = value}); + } + [NotMapped] public uint SubReplyCount + { + get => GetSplitEntityValue(r => r.SubReplyCount); + set => SetSplitEntityValue(value, (r, v) => r.SubReplyCount = v, + () => new() {TakenAt = TakenAt, Pid = Pid, SubReplyCount = value}); + } public ushort? 
IsFold { get; set; } - public int AgreeCount { get; set; } + [NotMapped] public int AgreeCount + { + get => GetSplitEntityValue(r => r.AgreeCount); + set => SetSplitEntityValue(value, (r, v) => r.AgreeCount = v, + () => new() {TakenAt = TakenAt, Pid = Pid, AgreeCount = value}); + } public int? DisagreeCount { get; set; } public byte[]? Geolocation { get; set; } + + public class SplitFloor : BaseReplyRevision + { + public uint Floor { get; set; } + } + public class SplitSubReplyCount : BaseReplyRevision + { + public uint SubReplyCount { get; set; } + } + public class SplitAgreeCount : BaseReplyRevision + { + public int AgreeCount { get; set; } + } } } diff --git a/crawler/src/Db/Revision/RevisionWithSplitting.cs b/crawler/src/Db/Revision/RevisionWithSplitting.cs new file mode 100644 index 00000000..eba9cf28 --- /dev/null +++ b/crawler/src/Db/Revision/RevisionWithSplitting.cs @@ -0,0 +1,27 @@ +namespace tbm.Crawler.Db.Revision +{ + public abstract class RevisionWithSplitting : IRevision + { + public uint TakenAt { get; set; } + public ushort? NullFieldsBitMask { get; set; } + + private Dictionary SplitEntities { get; } = new(); + public IEnumerable GetSplitEntities() => SplitEntities.Values; + + protected TValue? GetSplitEntityValue(Func valueSelector) + where TSplitEntity : class, TSplitEntities => + SplitEntities.ContainsKey(typeof(TSplitEntity)) + ? valueSelector((TSplitEntity)SplitEntities[typeof(TSplitEntity)]!) + : default; + + protected void SetSplitEntityValue(TValue? 
value, + Action valueSetter, Func entityFactory) + where TSplitEntity : class, TSplitEntities + { + if (SplitEntities.ContainsKey(typeof(TSplitEntity))) + valueSetter((TSplitEntity)SplitEntities[typeof(TSplitEntity)]!, value); + else + SplitEntities[typeof(TSplitEntity)] = entityFactory(); + } + } +} diff --git a/crawler/src/Db/Revision/SubReplyRevision.cs b/crawler/src/Db/Revision/SubReplyRevision.cs index 07a97e73..03e054ea 100644 --- a/crawler/src/Db/Revision/SubReplyRevision.cs +++ b/crawler/src/Db/Revision/SubReplyRevision.cs @@ -1,10 +1,32 @@ // ReSharper disable PropertyCanBeMadeInitOnly.Global namespace tbm.Crawler.Db.Revision { - public class SubReplyRevision : BaseRevision + public class SubReplyRevision : SubReplyRevision.BaseSubReplyRevision { - public ulong Spid { get; set; } - public int AgreeCount { get; set; } - public int DisagreeCount { get; set; } + public abstract class BaseSubReplyRevision : RevisionWithSplitting + { + public ulong Spid { get; set; } + } + [NotMapped] public int AgreeCount + { + get => GetSplitEntityValue(r => r.AgreeCount); + set => SetSplitEntityValue(value, (r, v) => r.AgreeCount = v, + () => new() {TakenAt = TakenAt, Spid = Spid, AgreeCount = value}); + } + [NotMapped] public int DisagreeCount + { + get => GetSplitEntityValue(r => r.DisagreeCount); + set => SetSplitEntityValue(value, (r, v) => r.DisagreeCount = v, + () => new() {TakenAt = TakenAt, Spid = Spid, DisagreeCount = value}); + } + + public class SplitAgreeCount : BaseSubReplyRevision + { + public int AgreeCount { get; set; } + } + public class SplitDisagreeCount : BaseSubReplyRevision + { + public int DisagreeCount { get; set; } + } } } diff --git a/crawler/src/Db/Revision/ThreadRevision.cs b/crawler/src/Db/Revision/ThreadRevision.cs index 44733426..0725c0b9 100644 --- a/crawler/src/Db/Revision/ThreadRevision.cs +++ b/crawler/src/Db/Revision/ThreadRevision.cs @@ -1,9 +1,12 @@ // ReSharper disable PropertyCanBeMadeInitOnly.Global namespace tbm.Crawler.Db.Revision { 
- public class ThreadRevision : BaseRevision + public class ThreadRevision : ThreadRevision.BaseThreadRevision { - public ulong Tid { get; set; } + public abstract class BaseThreadRevision : RevisionWithSplitting + { + public ulong Tid { get; set; } + } public ulong? ThreadType { get; set; } public string? StickyType { get; set; } public string? TopicType { get; set; } @@ -11,10 +14,20 @@ public class ThreadRevision : BaseRevision public uint? LatestReplyPostedAt { get; set; } public long? LatestReplierUid { get; set; } public uint? ReplyCount { get; set; } - public uint ViewCount { get; set; } + [NotMapped] public uint ViewCount + { + get => GetSplitEntityValue(r => r.ViewCount); + set => SetSplitEntityValue(value, (r, v) => r.ViewCount = v, + () => new() {TakenAt = TakenAt, Tid = Tid, ViewCount = value}); + } public uint? ShareCount { get; set; } public int? AgreeCount { get; set; } public int? DisagreeCount { get; set; } public byte[]? Geolocation { get; set; } + + public class SplitViewCount : BaseThreadRevision + { + public uint ViewCount { get; set; } + } } } diff --git a/crawler/src/Db/Revision/UserRevision.cs b/crawler/src/Db/Revision/UserRevision.cs index 4da0a30e..7ef56503 100644 --- a/crawler/src/Db/Revision/UserRevision.cs +++ b/crawler/src/Db/Revision/UserRevision.cs @@ -2,16 +2,47 @@ // ReSharper disable PropertyCanBeMadeInitOnly.Global namespace tbm.Crawler.Db.Revision { - public class UserRevision : BaseRevision + public class UserRevision : UserRevision.BaseUserRevision { - public long Uid { get; set; } - public string TriggeredBy { get; set; } = ""; + public abstract class BaseUserRevision : RevisionWithSplitting + { + public long Uid { get; set; } + public string TriggeredBy { get; set; } = ""; + } public string? Name { get; set; } - public string? DisplayName { get; set; } + [NotMapped] public string? 
DisplayName + { + get => GetSplitEntityValue(r => r.DisplayName); + set => SetSplitEntityValue(value, (r, v) => r.DisplayName = v, + () => new() {TakenAt = TakenAt, Uid = Uid, TriggeredBy = TriggeredBy, DisplayName = value}); + } public string? Portrait { get; set; } - public uint? PortraitUpdatedAt { get; set; } + [NotMapped] public uint? PortraitUpdatedAt + { + get => GetSplitEntityValue(r => r.PortraitUpdatedAt); + set => SetSplitEntityValue(value, (r, v) => r.PortraitUpdatedAt = v, + () => new() {TakenAt = TakenAt, Uid = Uid, TriggeredBy = TriggeredBy, PortraitUpdatedAt = value}); + } public ushort? Gender { get; set; } public byte[]? Icon { get; set; } - public string? IpGeolocation { get; set; } + [NotMapped] public string? IpGeolocation + { + get => GetSplitEntityValue(r => r.IpGeolocation); + set => SetSplitEntityValue(value, (r, v) => r.IpGeolocation = v, + () => new() {TakenAt = TakenAt, Uid = Uid, TriggeredBy = TriggeredBy, IpGeolocation = value}); + } + + public class SplitDisplayName : BaseUserRevision + { + public string? DisplayName { get; set; } + } + public class SplitPortraitUpdatedAt : BaseUserRevision + { + public uint? PortraitUpdatedAt { get; set; } + } + public class SplitIpGeolocation : BaseUserRevision + { + public string? 
IpGeolocation { get; set; } + } } } diff --git a/crawler/src/Db/TbmDbContext.cs b/crawler/src/Db/TbmDbContext.cs index a5290ff8..1eb44006 100644 --- a/crawler/src/Db/TbmDbContext.cs +++ b/crawler/src/Db/TbmDbContext.cs @@ -40,35 +40,19 @@ protected override void OnModelCreating(ModelBuilder b) b.Entity().ToTable($"tbmc_f{Fid}_reply_content"); b.Entity().ToTable($"tbmc_f{Fid}_subReply"); b.Entity().ToTable($"tbmc_f{Fid}_subReply_content"); - b.Entity() - .SplitToTable("tbmc_revision_thread_viewCount", tb => tb.Property(e => e.ViewCount)) - .ToTable("tbmc_revision_thread").HasKey(e => new {e.Tid, e.TakenAt}); - b.Entity() - .SplitToTable("tbmc_revision_reply_agreeCount", tb => tb.Property(e => e.AgreeCount)) - .SplitToTable("tbmc_revision_reply_subReplyCount", tb => tb.Property(e => e.SubReplyCount)) - .SplitToTable("tbmc_revision_reply_floor", tb => tb.Property(e => e.Floor)) - .ToTable("tbmc_revision_reply").HasKey(e => new {e.Pid, e.TakenAt}); - b.Entity() - .SplitToTable("tbmc_revision_subReply_agreeCount", tb => tb.Property(e => e.AgreeCount)) - .SplitToTable("tbmc_revision_subReply_disagreeCount", tb => tb.Property(e => e.DisagreeCount)) - .ToTable("tbmc_revision_subReply").HasKey(e => new {e.Spid, e.TakenAt}); - b.Entity() - .SplitToTable("tbmc_revision_user_ipGeolocation", tb => - { - tb.Property(e => e.TriggeredBy); - tb.Property(e => e.IpGeolocation); - }) - .SplitToTable("tbmc_revision_user_portraitUpdatedAt", tb => - { - tb.Property(e => e.TriggeredBy); - tb.Property(e => e.PortraitUpdatedAt); - }) - .SplitToTable("tbmc_revision_user_displayName", tb => - { - tb.Property(e => e.TriggeredBy); - tb.Property(e => e.DisplayName); - }) - .ToTable("tbmc_revision_user").HasKey(e => new {e.Uid, e.TakenAt}); + b.Entity().ToTable("tbmc_revision_thread").HasKey(e => new {e.Tid, e.TakenAt}); + b.Entity().ToTable("tbmc_revision_thread_viewCount").HasKey(e => new {e.Tid, e.TakenAt}); + b.Entity().ToTable("tbmc_revision_reply").HasKey(e => new {e.Pid, e.TakenAt}); + 
b.Entity().ToTable("tbmc_revision_reply_agreeCount").HasKey(e => new {e.Pid, e.TakenAt}); + b.Entity().ToTable("tbmc_revision_reply_subReplyCount").HasKey(e => new {e.Pid, e.TakenAt}); + b.Entity().ToTable("tbmc_revision_reply_floor").HasKey(e => new {e.Pid, e.TakenAt}); + b.Entity().ToTable("tbmc_revision_subReply").HasKey(e => new {e.Spid, e.TakenAt}); + b.Entity().ToTable("tbmc_revision_subReply_agreeCount").HasKey(e => new {e.Spid, e.TakenAt}); + b.Entity().ToTable("tbmc_revision_subReply_disagreeCount").HasKey(e => new {e.Spid, e.TakenAt}); + b.Entity().ToTable("tbmc_revision_user").HasKey(e => new {e.Uid, e.TakenAt}); + b.Entity().ToTable("tbmc_revision_user_ipGeolocation").HasKey(e => new {e.Uid, e.TakenAt}); + b.Entity().ToTable("tbmc_revision_user_portraitUpdatedAt").HasKey(e => new {e.Uid, e.TakenAt}); + b.Entity().ToTable("tbmc_revision_user_displayName").HasKey(e => new {e.Uid, e.TakenAt}); b.Entity().ToTable("tbmc_revision_authorExpGrade").HasKey(e => new {e.Fid, e.Uid, e.DiscoveredAt}); b.Entity().ToTable("tbmc_revision_forumModerator"); b.Entity().ToTable("tbm_forum"); diff --git a/crawler/src/Tieba/Crawl/Saver/BaseSaver.cs b/crawler/src/Tieba/Crawl/Saver/BaseSaver.cs index 23f4049d..9a274e0d 100644 --- a/crawler/src/Tieba/Crawl/Saver/BaseSaver.cs +++ b/crawler/src/Tieba/Crawl/Saver/BaseSaver.cs @@ -33,7 +33,7 @@ protected SaverChangeSet SavePosts(TbmDbContext db, ExpressionStarter existingPostPredicate, Func, Expression>> existingRevisionPredicate, Expression> revisionKeySelector) - where TRevision : BaseRevision, new() + where TRevision : class, IRevision, new() { var dbSet = db.Set().TagWith("ForUpdate"); if (dbSet == null) throw new ArgumentException( diff --git a/crawler/src/Tieba/Crawl/Saver/CommonInSavers.cs b/crawler/src/Tieba/Crawl/Saver/CommonInSavers.cs index 8a12e0d7..c763226b 100644 --- a/crawler/src/Tieba/Crawl/Saver/CommonInSavers.cs +++ b/crawler/src/Tieba/Crawl/Saver/CommonInSavers.cs @@ -19,9 +19,10 @@ protected void 
SavePostsOrUsers( Func revisionPostOrUserIdSelector, Func, Expression>> existingRevisionPredicate, Expression> revisionKeySelector) - where TPostOrUser : class where TRevision : BaseRevision, new() + where TPostOrUser : class where TRevision : class, IRevision, new() { db.Set().AddRange(existingOrNewLookup[false]); // newly added + db.TimestampingEntities(); var newRevisions = existingOrNewLookup[true].Select(newPostOrUser => { var postOrUserInTracking = existingSelector(newPostOrUser); @@ -101,7 +102,11 @@ or nameof(ITimestampingEntity.CreatedAt) if (revision != null) revision.NullFieldsBitMask = (ushort?)revisionNullFieldsBitMask; return revision; }).OfType().ToList(); - db.TimestampingEntities(); + + db.AddRange(newRevisions.OfType().Select(r => r.GetSplitEntities())); + db.AddRange(newRevisions.OfType().Select(r => r.GetSplitEntities())); + db.AddRange(newRevisions.OfType().Select(r => r.GetSplitEntities())); + db.AddRange(newRevisions.OfType().Select(r => r.GetSplitEntities())); if (!newRevisions.Any()) return; // quick exit to prevent execute sql with WHERE FALSE clause var existingRevisions = db.Set()