Skip to content

Commit e6f94bf

Browse files
committed
Add photos_count field and optional extra-photos extraction
- Add PhotosCount field to Entry struct, extracted from darray[37][1]
- Add Date field to Image struct for photo publication dates
- Add -extra-photos CLI flag to enable enhanced photo extraction
- When enabled, extracts dates for category images and individual photos
- Wire ExtractExtraPhotos through job chain (GmapJob → PlaceJob)
- Update all runners (file, database, web, lambda) to support new flag

Photo dates are extracted from [21][6][8] as [year, month, day] and formatted as YYYY-MM-DD.
1 parent 65e8896 commit e6f94bf

11 files changed

Lines changed: 118 additions & 3 deletions

File tree

gmaps/entry.go

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import (
1515
// Image describes one photo attached to an entry.
type Image struct {
1616
// Title is the display title of the photo. AddExtraPhotos fills it with the
// photo label, or "Photo N" when the label is empty.
Title string `json:"title"`
1717
// Image is the photo URL.
Image string `json:"image"`
18+
// Date is the photo date formatted as YYYY-MM-DD (see formatPhotoDate).
// It is left out of the JSON output when empty.
Date string `json:"date,omitempty"`
1819
}
1920

2021
type LinkSource struct {
@@ -247,6 +248,93 @@ func (e *Entry) AddExtraReviews(pages [][]byte) {
247248
}
248249
}
249250

251+
// AddExtraPhotos extracts additional photo data including dates for existing images
252+
// and individual photos from the raw JSON data.
253+
func (e *Entry) AddExtraPhotos(raw []byte) {
254+
var jd []any
255+
if err := json.Unmarshal(raw, &jd); err != nil {
256+
return
257+
}
258+
259+
if len(jd) < 7 {
260+
return
261+
}
262+
263+
darray, ok := jd[6].([]any)
264+
if !ok {
265+
return
266+
}
267+
268+
// Extract dates for existing category images from darray[171][0]
269+
catImages := getNthElementAndCast[[]any](darray, 171, 0)
270+
for i := range e.Images {
271+
if i < len(catImages) {
272+
cat := getNthElementAndCast[[]any](catImages, i)
273+
// Date is at cat[3][0][21][6][8]
274+
dateArr := getNthElementAndCast[[]any](cat, 3, 0, 21, 6, 8)
275+
e.Images[i].Date = formatPhotoDate(dateArr)
276+
}
277+
}
278+
279+
// Extract individual photos from darray[37][0]
280+
photoObjects := getNthElementAndCast[[]any](darray, 37, 0)
281+
for i := range photoObjects {
282+
photo := getNthElementAndCast[[]any](photoObjects, i)
283+
if len(photo) == 0 {
284+
continue
285+
}
286+
287+
photoID := getNthElementAndCast[string](photo, 0)
288+
photoURL := getNthElementAndCast[string](photo, 6, 0)
289+
photoLabel := getNthElementAndCast[string](photo, 20)
290+
dateArr := getNthElementAndCast[[]any](photo, 21, 6, 8)
291+
292+
if photoURL == "" {
293+
continue
294+
}
295+
296+
// Use label as title, fallback to "Photo"
297+
title := photoLabel
298+
if title == "" {
299+
title = fmt.Sprintf("Photo %d", i+1)
300+
}
301+
302+
// Check if this photo is already in Images (by URL or ID)
303+
alreadyExists := false
304+
for _, img := range e.Images {
305+
if strings.Contains(img.Image, photoID) {
306+
alreadyExists = true
307+
break
308+
}
309+
}
310+
311+
if !alreadyExists {
312+
e.Images = append(e.Images, Image{
313+
Title: title,
314+
Image: photoURL,
315+
Date: formatPhotoDate(dateArr),
316+
})
317+
}
318+
}
319+
}
320+
321+
// formatPhotoDate converts a date array [year, month, day, hour] to "YYYY-MM-DD" format.
322+
func formatPhotoDate(dateArr []any) string {
323+
if len(dateArr) < 3 {
324+
return ""
325+
}
326+
327+
year := int(getNthElementAndCast[float64](dateArr, 0))
328+
month := int(getNthElementAndCast[float64](dateArr, 1))
329+
day := int(getNthElementAndCast[float64](dateArr, 2))
330+
331+
if year == 0 || month == 0 || day == 0 {
332+
return ""
333+
}
334+
335+
return fmt.Sprintf("%04d-%02d-%02d", year, month, day)
336+
}
337+
250338
func extractReviews(data []byte) []Review {
251339
// Skip the security prefix
252340
prefix := ")]}'\n"

gmaps/job.go

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ type GmapJob struct {
2929
Deduper deduper.Deduper
3030
ExitMonitor exiter.Exiter
3131
ExtractExtraReviews bool
32+
ExtractExtraPhotos bool
3233
}
3334

3435
func NewGmapJob(
@@ -97,6 +98,12 @@ func WithExtraReviews() GmapJobOptions {
9798
}
9899
}
99100

101+
func WithExtraPhotos() GmapJobOptions {
102+
return func(j *GmapJob) {
103+
j.ExtractExtraPhotos = true
104+
}
105+
}
106+
100107
func (j *GmapJob) UseInResults() bool {
101108
return false
102109
}
@@ -122,7 +129,7 @@ func (j *GmapJob) Process(ctx context.Context, resp *scrapemate.Response) (any,
122129
jopts = append(jopts, WithPlaceJobExitMonitor(j.ExitMonitor))
123130
}
124131

125-
placeJob := NewPlaceJob(j.ID, j.LangCode, resp.URL, j.ExtractEmail, j.ExtractExtraReviews, jopts...)
132+
placeJob := NewPlaceJob(j.ID, j.LangCode, resp.URL, j.ExtractEmail, j.ExtractExtraReviews, j.ExtractExtraPhotos, jopts...)
126133

127134
next = append(next, placeJob)
128135
} else {
@@ -133,7 +140,7 @@ func (j *GmapJob) Process(ctx context.Context, resp *scrapemate.Response) (any,
133140
jopts = append(jopts, WithPlaceJobExitMonitor(j.ExitMonitor))
134141
}
135142

136-
nextJob := NewPlaceJob(j.ID, j.LangCode, href, j.ExtractEmail, j.ExtractExtraReviews, jopts...)
143+
nextJob := NewPlaceJob(j.ID, j.LangCode, href, j.ExtractEmail, j.ExtractExtraReviews, j.ExtractExtraPhotos, jopts...)
137144

138145
if j.Deduper == nil || j.Deduper.AddIfNotExists(ctx, href) {
139146
next = append(next, nextJob)

gmaps/place.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,10 @@ type PlaceJob struct {
2323
ExtractEmail bool
2424
ExitMonitor exiter.Exiter
2525
ExtractExtraReviews bool
26+
ExtractExtraPhotos bool
2627
}
2728

28-
func NewPlaceJob(parentID, langCode, u string, extractEmail, extraExtraReviews bool, opts ...PlaceJobOptions) *PlaceJob {
29+
func NewPlaceJob(parentID, langCode, u string, extractEmail, extraExtraReviews, extraPhotos bool, opts ...PlaceJobOptions) *PlaceJob {
2930
const (
3031
defaultPrio = scrapemate.PriorityMedium
3132
defaultMaxRetries = 3
@@ -46,6 +47,7 @@ func NewPlaceJob(parentID, langCode, u string, extractEmail, extraExtraReviews b
4647
job.UsageInResultststs = true
4748
job.ExtractEmail = extractEmail
4849
job.ExtractExtraReviews = extraExtraReviews
50+
job.ExtractExtraPhotos = extraPhotos
4951

5052
for _, opt := range opts {
5153
opt(&job)
@@ -77,6 +79,10 @@ func (j *PlaceJob) Process(_ context.Context, resp *scrapemate.Response) (any, [
7779
return nil, nil, err
7880
}
7981

82+
if j.ExtractExtraPhotos {
83+
entry.AddExtraPhotos(raw)
84+
}
85+
8086
entry.ID = j.ParentID
8187

8288
if entry.Link == "" {

runner/databaserunner/databaserunner.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,7 @@ func (d *dbrunner) produceSeedJobs(ctx context.Context) error {
152152
nil,
153153
nil,
154154
d.cfg.ExtraReviews,
155+
d.cfg.ExtraPhotos,
155156
)
156157
if err != nil {
157158
return err

runner/filerunner/filerunner.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ func (r *fileRunner) Run(ctx context.Context) (err error) {
8888
dedup,
8989
exitMonitor,
9090
r.cfg.ExtraReviews,
91+
r.cfg.ExtraPhotos,
9192
)
9293
if err != nil {
9394
return err

runner/jobs.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ func CreateSeedJobs(
2828
dedup deduper.Deduper,
2929
exitMonitor exiter.Exiter,
3030
extraReviews bool,
31+
extraPhotos bool,
3132
) (jobs []scrapemate.IJob, err error) {
3233
var lat, lon float64
3334

@@ -100,6 +101,10 @@ func CreateSeedJobs(
100101
opts = append(opts, gmaps.WithExtraReviews())
101102
}
102103

104+
if extraPhotos {
105+
opts = append(opts, gmaps.WithExtraPhotos())
106+
}
107+
103108
job = gmaps.NewGmapJob(id, langCode, query, maxDepth, email, geoCoordinates, zoom, opts...)
104109
} else {
105110
jparams := gmaps.MapSearchParams{

runner/lambdaaws/invoker.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,7 @@ func (i *invoker) setPayloads(cfg *runner.Config) error {
129129
Language: cfg.LangCode,
130130
FunctionName: cfg.FunctionName,
131131
ExtraReviews: cfg.ExtraReviews,
132+
ExtraPhotos: cfg.ExtraPhotos,
132133
}
133134
i.payloads = append(i.payloads, payload)
134135

@@ -148,6 +149,7 @@ func (i *invoker) setPayloads(cfg *runner.Config) error {
148149
Language: cfg.LangCode,
149150
FunctionName: cfg.FunctionName,
150151
ExtraReviews: cfg.ExtraReviews,
152+
ExtraPhotos: cfg.ExtraPhotos,
151153
}
152154
i.payloads = append(i.payloads, payload)
153155
}

runner/lambdaaws/io.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,5 @@ type lInput struct {
1111
FunctionName string `json:"function_name"`
1212
DisablePageReuse bool `json:"disable_page_reuse"`
1313
ExtraReviews bool `json:"extra_reviews"`
14+
ExtraPhotos bool `json:"extra_photos"`
1415
}

runner/lambdaaws/lambdaaws.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ func (l *lambdaAwsRunner) handler(ctx context.Context, input lInput) error {
9090
nil,
9191
exitMonitor,
9292
input.ExtraReviews,
93+
input.ExtraPhotos,
9394
)
9495
if err != nil {
9596
return err

runner/runner.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ type Config struct {
8080
DisablePageReuse bool
8181
ExtraReviews bool
8282
LeadsDBAPIKey string
83+
ExtraPhotos bool
8384
}
8485

8586
func ParseConfig() *Config {
@@ -127,6 +128,7 @@ func ParseConfig() *Config {
127128
flag.BoolVar(&cfg.DisablePageReuse, "disable-page-reuse", false, "disable page reuse in playwright")
128129
flag.BoolVar(&cfg.ExtraReviews, "extra-reviews", false, "enable extra reviews collection")
129130
flag.StringVar(&cfg.LeadsDBAPIKey, "leadsdb-api-key", "", "LeadsDB API key for exporting results to LeadsDB")
131+
flag.BoolVar(&cfg.ExtraPhotos, "extra-photos", false, "enable extra photos collection (includes dates and individual photos)")
130132

131133
flag.Parse()
132134

0 commit comments

Comments
 (0)