Skip to content

Commit d2f5fee

Browse files
committed
+ const fields `(Legacy)ClientApiDomain` for classes `EntryPoint`, `ThreadCrawler` and `ThreadLateCrawlerAndSaver` @ ClientRequester.cs * change the `HttpClient.BaseAddress` from `http://c.tieba.baidu.com` to `http://tiebac.baidu.com` as suggested in lumina37/aiotieba#123 (comment) @ `EntryPoint.ConfigureServices()` + const field `LegacyEndPointUrl` to allow method `GetRequestsForPage()` and its override in derived class `ThreadArchiveCrawler` to keep using the original domain @ ThreadCrawler.cs @ crawler + private field `_logger` and static partial method `ExtractMalformedExifDateTimeRegex()` + method `ParseExifDateTimeOrNull()` to handle malformed EXIF date time strings @ MetadataConsumer.cs * change the type of properties `Exif.(Create|Modify)Date` from `string?` to `DateTime?` @ ImageMetadata.cs * now logs a different message when `ImageInReply.ExpectedByteSize == 0` @ `ImageRequester.GetImageBytes()` @ imagePipeline @ c#
1 parent 692b9a6 commit d2f5fee

File tree

8 files changed

+75
-19
lines changed

8 files changed

+75
-19
lines changed

c#/crawler/src/EntryPoint.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ protected override void ConfigureServices(HostBuilderContext context, IServiceCo
2121
var clientRequesterConfig = context.Configuration.GetSection("ClientRequester");
2222
service.AddHttpClient("tbClient", client =>
2323
{
24-
client.BaseAddress = new("http://c.tieba.baidu.com");
24+
client.BaseAddress = new(ClientRequester.ClientApiDomain);
2525
client.Timeout = TimeSpan.FromMilliseconds(clientRequesterConfig.GetValue("TimeoutMs", 3000));
2626
})
2727
.SetHandlerLifetime(TimeSpan.FromSeconds(clientRequesterConfig.GetValue("HandlerLifetimeSec", 600))) // 10 mins

c#/crawler/src/Tieba/ClientRequester.cs

+4
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@ namespace tbm.Crawler.Tieba;
55

66
public class ClientRequester
77
{
8+
// https://github.com/Starry-OvO/aiotieba/issues/123#issuecomment-1563314122
9+
public const string LegacyClientApiDomain = "http://c.tieba.baidu.com";
10+
public const string ClientApiDomain = "http://tiebac.baidu.com";
11+
812
private readonly ILogger<ClientRequester> _logger;
913
private readonly IConfigurationSection _config;
1014
private readonly IHttpClientFactory _httpFactory;

c#/crawler/src/Tieba/Crawl/Crawler/ThreadArchiveCrawler.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ public ThreadArchiveCrawler(ClientRequester requester, string forumName) : base(
88

99
protected override IEnumerable<Request> GetRequestsForPage(Page page, CancellationToken stoppingToken = default)
1010
{
11-
var response = Requester.RequestProtoBuf(EndPointUrl, "6.0.2",
11+
var response = Requester.RequestProtoBuf(LegacyEndPointUrl, "6.0.2",
1212
new ThreadRequest {Data = GetRequestDataForClientVersion602(page)},
1313
(req, common) => req.Data.Common = common,
1414
() => new ThreadResponse(), stoppingToken);

c#/crawler/src/Tieba/Crawl/Crawler/ThreadCrawler.cs

+3-2
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@ public override Exception FillExceptionData(Exception e)
2020
public override TbClient.Page? GetResponsePage(ThreadResponse response) =>
2121
response.Data?.Page; // response.Data.Page will be null when it's requested with CrawlRequestFlag.ThreadClientVersion8888
2222

23-
protected const string EndPointUrl = "c/f/frs/page?cmd=301001";
23+
private const string EndPointUrl = "c/f/frs/page?cmd=301001";
24+
protected const string LegacyEndPointUrl = $"{ClientRequester.LegacyClientApiDomain}/{EndPointUrl}";
2425

2526
protected ThreadRequest.Types.Data GetRequestDataForClientVersion602(Page page) =>
2627
new()
@@ -47,7 +48,7 @@ protected override IEnumerable<Request> GetRequestsForPage(Page page, Cancellati
4748
new ThreadRequest {Data = data},
4849
(req, common) => req.Data.Common = common,
4950
() => new ThreadResponse(), stoppingToken)),
50-
new Request(Requester.RequestProtoBuf(EndPointUrl, "6.0.2",
51+
new Request(Requester.RequestProtoBuf(LegacyEndPointUrl, "6.0.2",
5152
new ThreadRequest {Data = data602},
5253
(req, common) => req.Data.Common = common,
5354
() => new ThreadResponse(), stoppingToken), CrawlRequestFlag.ThreadClientVersion602)

c#/crawler/src/Tieba/Crawl/ThreadLateCrawlerAndSaver.cs

+11-8
Original file line numberDiff line numberDiff line change
@@ -33,24 +33,27 @@ public async Task Crawl(Dictionary<Tid, FailureCount> failureCountsKeyByTid, Can
3333
if (!_locks.AcquireRange(crawlerLockId, new[] {(Page)1}).Any()) return null;
3434
try
3535
{
36-
var json = await _requester.RequestJson("c/f/pb/page", "8.8.8.8", new()
37-
{
38-
{"kz", tid.ToString()},
39-
{"pn", "1"},
40-
{"rn", "2"} // have to be at least 2, since response will always be error code 29 and msg "这个楼层可能已被删除啦,去看看其他贴子吧" with rn=1
41-
}, stoppingToken);
36+
var json = await _requester.RequestJson(
37+
$"{ClientRequester.LegacyClientApiDomain}/c/f/pb/page", "8.8.8.8", new()
38+
{
39+
{"kz", tid.ToString()},
40+
{"pn", "1"},
41+
// rn have to be at least 2
42+
// since response will always be error code 29 and msg "这个楼层可能已被删除啦,去看看其他贴子吧" with rn=1
43+
{"rn", "2"}
44+
}, stoppingToken);
4245
try
4346
{
4447
var errorCodeProp = json.GetProperty("error_code");
4548
Func<(int ErrorCode, bool IsErrorCodeParsed)> tryGetErrorCode = errorCodeProp.ValueKind switch
4649
{ // https://github.com/MoeNetwork/Tieba-Cloud-Sign/pull/220#issuecomment-1367570540
4750
JsonValueKind.Number => () =>
48-
{ // https://stackoverflow.com/questions/62100000/why-doesnt-system-text-json-jsonelement-have-trygetstring-or-trygetboolean/62100246#62100246
51+
{
4952
var r = errorCodeProp.TryGetInt32(out var p);
5053
return (p, r);
5154
},
5255
JsonValueKind.String => () =>
53-
{
56+
{ // https://stackoverflow.com/questions/62100000/why-doesnt-system-text-json-jsonelement-have-trygetstring-or-trygetboolean/62100246#62100246
5457
var r = int.TryParse(errorCodeProp.GetString(), out var p);
5558
return (p, r);
5659
},

c#/imagePipeline/src/Consumer/MetadataConsumer.cs

+47-4
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
1+
using System.Globalization;
12
using System.IO.Hashing;
23
using System.Text.Json;
4+
using System.Text.RegularExpressions;
35
using SixLabors.ImageSharp.Formats.Bmp;
46
using SixLabors.ImageSharp.Formats.Gif;
57
using SixLabors.ImageSharp.Formats.Jpeg;
@@ -8,13 +10,23 @@
810

911
namespace tbm.ImagePipeline.Consumer;
1012

11-
public class MetadataConsumer
13+
public partial class MetadataConsumer
1214
{
15+
16+
// Tolerant pattern for EXIF date time text whose ':' separators are partly missing,
// e.g. "2018:09:08 15:288". Every ':' is optional and each part after the year is one or two digits,
// so the captured parts must still be validated by a real date time parse afterwards.
// The day group accepts a leading digit of 0-3: restricting it to 0-1 made days 20-31 unmatchable.
[GeneratedRegex(
    "^(?<year>(?:19|20|21)[0-9]{2}):?(?<month>[0-1]?[0-9]):?(?<day>[0-3]?[0-9]) (?<hour>[0-2]?[0-9]):?(?<minute>[0-5]?[0-9]):?(?<second>[0-5]?[0-9])$",
    RegexOptions.Compiled, matchTimeoutMilliseconds: 100)]
private static partial Regex ExtractMalformedExifDateTimeRegex();
20+
21+
private readonly ILogger<MetadataConsumer> _logger;
1322
private readonly ulong[] _commonIccProfilesXxHash3ToIgnore;
1423

15-
public MetadataConsumer(IConfiguration config) =>
24+
public MetadataConsumer(ILogger<MetadataConsumer> logger, IConfiguration config)
25+
{
26+
_logger = logger;
1627
_commonIccProfilesXxHash3ToIgnore = config.GetSection("MetadataConsumer")
1728
.GetSection("CommonIccProfilesXxHash3ToIgnore").Get<ulong[]>() ?? Array.Empty<ulong>();
29+
}
1830

1931
public void Consume(
2032
ImagePipelineDbContext db,
@@ -63,8 +75,8 @@ public void Consume(
6375
: null,
6476
Make = GetExifTagValueOrNull(ExifTag.Make).NullIfEmpty(),
6577
Model = GetExifTagValueOrNull(ExifTag.Model).NullIfEmpty(),
66-
CreateDate = GetExifTagValueOrNull(ExifTag.DateTimeDigitized).NullIfEmpty(),
67-
ModifyDate = GetExifTagValueOrNull(ExifTag.DateTime).NullIfEmpty(),
78+
CreateDate = ParseExifDateTimeOrNull(GetExifTagValueOrNull(ExifTag.DateTimeDigitized)),
79+
ModifyDate = ParseExifDateTimeOrNull(GetExifTagValueOrNull(ExifTag.DateTime)),
6880
TagNames = JsonSerializer.Serialize(meta.ExifProfile.Values.Select(i => i.Tag.ToString())),
6981
RawBytes = meta.ExifProfile.ToByteArray() ?? throw new NullReferenceException()
7082
},
@@ -79,4 +91,35 @@ public void Consume(
7991
XxHash3 = XxHash3.HashToUInt64(imageBytes)
8092
};
8193
}));
94+
95+
/// <summary>
/// Parses an EXIF date time string, falling back to a best-effort repair of malformed values.
/// </summary>
/// <param name="exifDateTime">Raw EXIF date time text, expected in the form <c>yyyy:M:d H:m:s</c>.</param>
/// <returns>
/// The parsed <see cref="DateTime"/>, or <c>null</c> when the input is null, empty,
/// or cannot be parsed even after extracting its parts.
/// </returns>
private DateTime? ParseExifDateTimeOrNull(string? exifDateTime)
{
    // single source of truth for the accepted format, used for both the raw and the repaired string
    static DateTime? ParseDateTimeWithFormatOrNull(string? dateTime) =>
        DateTime.TryParseExact(dateTime, "yyyy:M:d H:m:s", CultureInfo.InvariantCulture,
            DateTimeStyles.None, out var ret) ? ret : null;

    if (string.IsNullOrEmpty(exifDateTime)) return null;

    var originalDateTime = ParseDateTimeWithFormatOrNull(exifDateTime);
    if (originalDateTime != null) return originalDateTime;

    // The raw text is malformed: extract its parts with the tolerant regex,
    // re-compose them into a well-formed string and parse that instead,
    // e.g. 2018:09:08 15:288 -> 2018:09:08 15:28:8
    // The goal is to recover as many date time values from the raw EXIF bytes as possible,
    // since each image is usually only processed once.
    var match = ExtractMalformedExifDateTimeRegex().Match(exifDateTime);
    if (!match.Success)
    {
        _logger.LogWarning("Unable to extract parts from malformed exif date time {}", exifDateTime);
        return null;
    }

    var ret = ParseDateTimeWithFormatOrNull( // keep in sync with format "yyyy:M:d H:m:s"
        $"{match.Groups["year"]}:{match.Groups["month"]}:{match.Groups["day"]} "
        + $"{match.Groups["hour"]}:{match.Groups["minute"]}:{match.Groups["second"]}");
    if (ret == null)
    {
        // the parts were extracted but do not form a valid date time, e.g. month 13
        _logger.LogWarning("Unable to extract parts from malformed exif date time {}", exifDateTime);
    }
    else
    {
        // "dd" is the day-of-month specifier; a "D" in a custom format string would be emitted literally
        _logger.LogWarning("Converted malformed exif date time {} to {:yyyy:MM:dd HH:mm:ss}",
            exifDateTime, ret);
    }
    return ret;
}
82125
}

c#/imagePipeline/src/Db/ImageMetadata.cs

+2-2
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,8 @@ public class Exif : IImageMetadata
4949
public string? Orientation { get; set; }
5050
public string? Make { get; set; }
5151
public string? Model { get; set; }
52-
public string? CreateDate { get; set; }
53-
public string? ModifyDate { get; set; }
52+
public DateTime? CreateDate { get; set; }
53+
public DateTime? ModifyDate { get; set; }
5454
public required string TagNames { get; set; }
5555
public required byte[] RawBytes { get; set; }
5656

c#/imagePipeline/src/ImageRequester.cs

+6-1
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,13 @@ public async Task<byte[]> GetImageBytes(ImageInReply imageInReply, CancellationT
2424
var expectedByteSize = imageInReply.ExpectedByteSize;
2525
var http = _httpFactory.CreateClient("tbImage");
2626
if (_config.GetValue("LogTrace", false))
27-
_logger.LogTrace("Requesting image {} and expecting {} bytes of file size",
27+
{
28+
if (expectedByteSize == 0)
29+
_logger.LogTrace("Requesting image {} and not expecting determined byte size", urlFilename);
30+
else
31+
_logger.LogTrace("Requesting image {} and expecting {} bytes of file size",
2832
urlFilename, expectedByteSize);
33+
}
2934

3035
Context CreatePollyContext() => new() {{"ILogger<ImageRequester>", _logger}, {"imageUrlFilename", urlFilename}};
3136
Task<T> ExecuteByPolly<T>(Func<Task<T>> action) =>

0 commit comments

Comments
 (0)