Hacker News Job Listing Scraper and Parser


Nội dung File JSON

{
    "id": "0JsHmmyeHw5Ffz5m",
    "meta": {
        "instanceId": "d4d7965840e96e50a3e02959a8487c692901dfa8d5cc294134442c67ce1622d3",
        "templateCredsSetupCompleted": true
    },
    "name": "HN Who is Hiring Scrape",
    "tags": [],
    "nodes": [
        {
            "id": "f7cdb3ee-9bb0-4006-829a-d4ce797191d5",
            "name": "When clicking ‘Test workflow’",
            "type": "n8n-nodes-base.manualTrigger",
            "position": [
                -20,
                -220
            ],
            "parameters": {},
            "typeVersion": 1
        },
        {
            "id": "0475e25d-9bf4-450d-abd3-a04608a438a4",
            "name": "Sticky Note",
            "type": "n8n-nodes-base.stickyNote",
            "position": [
                60,
                -620
            ],
            "parameters": {
                "width": 460,
                "height": 340,
                "content": "## Go to https://hn.algolia.com\n- filter by \"Ask HN: Who is hiring?\" (important with quotes for full match)\n- sort by date\n- Chrome Network Tab > find API call > click \"Copy as cURL\"\n- n8n HTTP node -> import cURL and paste \n- I've set the API key as Header Auth so you will have to do the above yourself to make this work"
            },
            "typeVersion": 1
        },
        {
            "id": "a686852b-ff84-430b-92bb-ce02a6808e19",
            "name": "Split Out",
            "type": "n8n-nodes-base.splitOut",
            "position": [
                400,
                -220
            ],
            "parameters": {
                "options": {},
                "fieldToSplitOut": "hits"
            },
            "typeVersion": 1
        },
        {
            "id": "cdaaa738-d561-4fa0-b2c7-8ea9e6778eb1",
            "name": "Sticky Note1",
            "type": "n8n-nodes-base.stickyNote",
            "position": [
                1260,
                -620
            ],
            "parameters": {
                "width": 500,
                "height": 340,
                "content": "## Go to HN API \nhttps://github.com/HackerNews/API\n\nWe'll need the following endpoints: \n- For example, a story: https://hacker-news.firebaseio.com/v0/item/8863.json?print=pretty\n- comment: https://hacker-news.firebaseio.com/v0/item/2921983.json?print=pretty\n\n"
            },
            "typeVersion": 1
        },
        {
            "id": "4f353598-9e32-4be4-9e7b-c89cc05305fd",
            "name": "OpenAI Chat Model",
            "type": "@n8n/n8n-nodes-langchain.lmChatOpenAi",
            "position": [
                2680,
                -20
            ],
            "parameters": {
                "model": {
                    "__rl": true,
                    "mode": "list",
                    "value": "gpt-4o-mini"
                },
                "options": {}
            },
            "credentials": {
                "openAiApi": {
                    "id": "Fbb2ueT0XP5xMRme",
                    "name": "OpenAi account 2"
                }
            },
            "typeVersion": 1.2
        },
        {
            "id": "5bd0d7cc-497a-497c-aa4c-589d9ceeca14",
            "name": "Structured Output Parser",
            "type": "@n8n/n8n-nodes-langchain.outputParserStructured",
            "position": [
                2840,
                -20
            ],
            "parameters": {
                "schemaType": "manual",
                "inputSchema": "{\n \"type\": \"object\",\n \"properties\": {\n \"company\": {\n \"type\": [\n \"string\",\n \"null\"\n ],\n \"description\": \"Name of the hiring company\"\n },\n \"title\": {\n \"type\": [\n \"string\",\n \"null\"\n ],\n \"description\": \"Job title/role being advertised\"\n },\n \"location\": {\n \"type\": [\n \"string\",\n \"null\"\n ],\n \"description\": \"Work location including remote/hybrid status\"\n },\n \"type\": {\n \"type\": [\n \"string\",\n \"null\"\n ],\n \"enum\": [\n \"FULL_TIME\",\n \"PART_TIME\",\n \"CONTRACT\",\n \"INTERNSHIP\",\n \"FREELANCE\",\n null\n ],\n \"description\": \"Employment type (Full-time, Contract, etc)\"\n },\n \"work_location\": {\n \"type\": [\n \"string\",\n \"null\"\n ],\n \"enum\": [\n \"REMOTE\",\n \"HYBRID\",\n \"ON_SITE\",\n null\n ],\n \"description\": \"Work arrangement type\"\n },\n \"salary\": {\n \"type\": [\n \"string\",\n \"null\"\n ],\n \"description\": \"Compensation details if provided\"\n },\n \"description\": {\n \"type\": [\n \"string\",\n \"null\"\n ],\n \"description\": \"Main job description text including requirements and team info\"\n },\n \"apply_url\": {\n \"type\": [\n \"string\",\n \"null\"\n ],\n \"description\": \"Direct application/job posting URL\"\n },\n \"company_url\": {\n \"type\": [\n \"string\",\n \"null\"\n ],\n \"description\": \"Company website or careers page\"\n }\n }\n}\n"
            },
            "typeVersion": 1.2
        },
        {
            "id": "b84ca004-6f3b-4577-8910-61b8584b161d",
            "name": "Search for Who is hiring posts",
            "type": "n8n-nodes-base.httpRequest",
            "position": [
                200,
                -220
            ],
            "parameters": {
                "url": "https://uj5wyc0l7x-dsn.algolia.net/1/indexes/Item_dev_sort_date/query",
                "method": "POST",
                "options": {},
                "jsonBody": "{\n \"query\": \"\\\"Ask HN: Who is hiring\\\"\",\n \"analyticsTags\": [\n \"web\"\n ],\n \"page\": 0,\n \"hitsPerPage\": 30,\n \"minWordSizefor1Typo\": 4,\n \"minWordSizefor2Typos\": 8,\n \"advancedSyntax\": true,\n \"ignorePlurals\": false,\n \"clickAnalytics\": true,\n \"minProximity\": 7,\n \"numericFilters\": [],\n \"tagFilters\": [\n [\n \"story\"\n ],\n []\n ],\n \"typoTolerance\": \"min\",\n \"queryType\": \"prefixNone\",\n \"restrictSearchableAttributes\": [\n \"title\",\n \"comment_text\",\n \"url\",\n \"story_text\",\n \"author\"\n ],\n \"getRankingInfo\": true\n}",
                "sendBody": true,
                "sendQuery": true,
                "sendHeaders": true,
                "specifyBody": "json",
                "authentication": "genericCredentialType",
                "genericAuthType": "httpHeaderAuth",
                "queryParameters": {
                    "parameters": [
                        {
                            "name": "x-algolia-agent",
                            "value": "Algolia for JavaScript (4.13.1); Browser (lite)"
                        },
                        {
                            "name": "x-algolia-application-id",
                            "value": "UJ5WYC0L7X"
                        }
                    ]
                },
                "headerParameters": {
                    "parameters": [
                        {
                            "name": "Accept",
                            "value": "*/*"
                        },
                        {
                            "name": "Accept-Language",
                            "value": "en-GB,en-US;q=0.9,en;q=0.8"
                        },
                        {
                            "name": "Connection",
                            "value": "keep-alive"
                        },
                        {
                            "name": "DNT",
                            "value": "1"
                        },
                        {
                            "name": "Origin",
                            "value": "https://hn.algolia.com"
                        },
                        {
                            "name": "Referer",
                            "value": "https://hn.algolia.com/"
                        },
                        {
                            "name": "Sec-Fetch-Dest",
                            "value": "empty"
                        },
                        {
                            "name": "Sec-Fetch-Mode",
                            "value": "cors"
                        },
                        {
                            "name": "Sec-Fetch-Site",
                            "value": "cross-site"
                        },
                        {
                            "name": "User-Agent",
                            "value": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36"
                        },
                        {
                            "name": "sec-ch-ua",
                            "value": "\"Chromium\";v=\"133\", \"Not(A:Brand\";v=\"99\""
                        },
                        {
                            "name": "sec-ch-ua-mobile",
                            "value": "?0"
                        },
                        {
                            "name": "sec-ch-ua-platform",
                            "value": "\"macOS\""
                        }
                    ]
                }
            },
            "credentials": {
                "httpHeaderAuth": {
                    "id": "oVEXp2ZbYCXypMVz",
                    "name": "Algolia Auth"
                }
            },
            "typeVersion": 4.2
        },
        {
            "id": "205e66f6-cd6b-4cfd-a6ec-2226c35ddaac",
            "name": "Get relevant data",
            "type": "n8n-nodes-base.set",
            "position": [
                700,
                -220
            ],
            "parameters": {
                "options": {},
                "assignments": {
                    "assignments": [
                        {
                            "id": "73dd2325-faa7-4650-bd78-5fc97cc202de",
                            "name": "title",
                            "type": "string",
                            "value": "={{ $json.title }}"
                        },
                        {
                            "id": "44918eac-4510-440e-9ac0-bf14d2b2f3af",
                            "name": "createdAt",
                            "type": "string",
                            "value": "={{ $json.created_at }}"
                        },
                        {
                            "id": "00eb6f09-2c22-411c-949c-886b2d95b6eb",
                            "name": "updatedAt",
                            "type": "string",
                            "value": "={{ $json.updated_at }}"
                        },
                        {
                            "id": "2b4f9da6-f60e-46e0-ba9d-3242fa955a55",
                            "name": "storyId",
                            "type": "string",
                            "value": "={{ $json.story_id }}"
                        }
                    ]
                }
            },
            "typeVersion": 3.4
        },
        {
            "id": "16bc5628-8a29-4eac-8be9-b4e9da802e1e",
            "name": "Get latest post",
            "type": "n8n-nodes-base.filter",
            "position": [
                900,
                -220
            ],
            "parameters": {
                "options": {},
                "conditions": {
                    "options": {
                        "version": 2,
                        "leftValue": "",
                        "caseSensitive": true,
                        "typeValidation": "strict"
                    },
                    "combinator": "and",
                    "conditions": [
                        {
                            "id": "d7dd7175-2a50-45aa-bd3e-4c248c9193c4",
                            "operator": {
                                "type": "dateTime",
                                "operation": "after"
                            },
                            "leftValue": "={{ $json.createdAt }}",
                            "rightValue": "={{$now.minus({days: 30})}} "
                        }
                    ]
                }
            },
            "typeVersion": 2.2
        },
        {
            "id": "92e1ef74-5ae1-4195-840b-115184db464f",
            "name": "Split out children (jobs)",
            "type": "n8n-nodes-base.splitOut",
            "position": [
                1460,
                -220
            ],
            "parameters": {
                "options": {},
                "fieldToSplitOut": "kids"
            },
            "typeVersion": 1
        },
        {
            "id": "d0836aae-b98a-497f-a6f7-0ad563c262a0",
            "name": "Trun into structured data",
            "type": "@n8n/n8n-nodes-langchain.chainLlm",
            "position": [
                2600,
                -220
            ],
            "parameters": {
                "text": "={{ $json.cleaned_text }}",
                "messages": {
                    "messageValues": [
                        {
                            "message": "Extract the JSON data"
                        }
                    ]
                },
                "promptType": "define",
                "hasOutputParser": true
            },
            "typeVersion": 1.5
        },
        {
            "id": "fd818a93-627c-435d-91ba-5d759d5a9004",
            "name": "Sticky Note2",
            "type": "n8n-nodes-base.stickyNote",
            "position": [
                2600,
                -620
            ],
            "parameters": {
                "width": 840,
                "height": 340,
                "content": "## Data Structure\n\nWe use OpenAI GPT-4o-mini to transform the raw data into a unified data structure. Feel free to change this.\n\n```json\n{\n \"company\": \"Name of the hiring company\",\n \"title\": \"Job title/role being advertised\",\n \"location\": \"Work location including remote/hybrid status\",\n \"type\": \"Employment type (Full-time, Contract, etc)\",\n \"salary\": \"Compensation details if provided\",\n \"description\": \"Main job description text including requirements and team info\",\n \"apply_url\": \"Direct application/job posting URL\",\n \"company_url\": \"Company website or careers page\"\n}\n```"
            },
            "typeVersion": 1
        },
        {
            "id": "b70c5578-5b81-467a-8ac2-65374e4e52f3",
            "name": "Extract text",
            "type": "n8n-nodes-base.set",
            "position": [
                1860,
                -220
            ],
            "parameters": {
                "options": {},
                "assignments": {
                    "assignments": [
                        {
                            "id": "6affa370-56ce-4ad8-8534-8f753fdf07fc",
                            "name": "text",
                            "type": "string",
                            "value": "={{ $json.text }}"
                        }
                    ]
                }
            },
            "typeVersion": 3.4
        },
        {
            "id": "acb68d88-9417-42e9-9bcc-7c2fa95c4afd",
            "name": "Clean text",
            "type": "n8n-nodes-base.code",
            "position": [
                2060,
                -220
            ],
            "parameters": {
                "jsCode": "// In a Function node in n8n\nconst inputData = $input.all();\n\nfunction cleanAllPosts(data) {\n return data.map(item => {\n try {\n // Check if item exists and has the expected structure\n if (!item || typeof item !== 'object') {\n return { cleaned_text: '', error: 'Invalid item structure' };\n }\n\n // Get the text, with multiple fallbacks\n let text = '';\n if (typeof item === 'string') {\n text = item;\n } else if (item.json && item.json.text) {\n text = item.json.text;\n } else if (typeof item.json === 'string') {\n text = item.json;\n } else {\n text = JSON.stringify(item);\n }\n\n // Make sure text is a string\n text = String(text);\n \n // Perform the cleaning operations\n try {\n text = text.replace(/&#x2F;/g, '/');\n text = text.replace(/&#x27;/g, \"'\");\n text = text.replace(/&\\w+;/g, ' ');\n text = text.replace(/<[^>]*>/g, '');\n text = text.replace(/\\-\\s*/g, '- ');\n text = text.replace(/\\s+/g, ' ');\n text = text.replace(/\\s*(https?:\\/\\/[^\\s]+)\\s*/g, '\\n$1\\n');\n text = text.replace(/\\n{3,}/g, '\\n\\n');\n text = text.trim();\n } catch (cleaningError) {\n console.log('Error during text cleaning:', cleaningError);\n // Return original text if cleaning fails\n return { cleaned_text: text, warning: 'Partial cleaning applied' };\n }\n\n return { cleaned_text: text };\n \n } catch (error) {\n console.log('Error processing item:', error);\n return { \n cleaned_text: '', \n error: `Processing error: ${error.message}`,\n original: item\n };\n }\n }).filter(result => result.cleaned_text || result.error); \n}\n\ntry {\n return cleanAllPosts(inputData);\n} catch (error) {\n console.log('Fatal error:', error);\n return [{ \n cleaned_text: '', \n error: `Fatal error: ${error.message}`,\n input: inputData \n }];\n}\n"
            },
            "typeVersion": 2
        },
        {
            "id": "a0727b55-565d-47c0-9ab5-0f001f4b9941",
            "name": "Limit for testing (optional)",
            "type": "n8n-nodes-base.limit",
            "position": [
                2280,
                -220
            ],
            "parameters": {
                "maxItems": 5
            },
            "typeVersion": 1
        },
        {
            "id": "650baf5e-c2ac-443d-8a2b-6df89717186f",
            "name": "Sticky Note3",
            "type": "n8n-nodes-base.stickyNote",
            "position": [
                580,
                -620
            ],
            "parameters": {
                "width": 540,
                "height": 340,
                "content": "## Clean the result \n\n```json\n{\n\"title\": \"Ask HN: Who is hiring? (February 2025)\",\n\"createdAt\": \"2025-02-03T16:00:43Z\",\n\"updatedAt\": \"2025-02-17T08:35:44Z\",\n\"storyId\": \"42919502\"\n},\n{\n\"title\": \"Ask HN: Who is hiring? (January 2025)\",\n\"createdAt\": \"2025-01-02T16:00:09Z\",\n\"updatedAt\": \"2025-02-13T00:03:24Z\",\n\"storyId\": \"42575537\"\n},\n```"
            },
            "typeVersion": 1
        },
        {
            "id": "1ca5c39f-f21d-455a-b63a-702e7e3ba02b",
            "name": "Write results to airtable",
            "type": "n8n-nodes-base.airtable",
            "position": [
                3040,
                -220
            ],
            "parameters": {
                "base": {
                    "__rl": true,
                    "mode": "list",
                    "value": "appM2JWvA5AstsGdn",
                    "cachedResultUrl": "https://airtable.com/appM2JWvA5AstsGdn",
                    "cachedResultName": "HN Who is hiring?"
                },
                "table": {
                    "__rl": true,
                    "mode": "list",
                    "value": "tblGvcOjqbliwM7AS",
                    "cachedResultUrl": "https://airtable.com/appM2JWvA5AstsGdn/tblGvcOjqbliwM7AS",
                    "cachedResultName": "Table 1"
                },
                "columns": {
                    "value": {
                        "type": "={{ $json.output.type }}",
                        "title": "={{ $json.output.title }}",
                        "salary": "={{ $json.output.salary }}",
                        "company": "={{ $json.output.company }}",
                        "location": "={{ $json.output.location }}",
                        "apply_url": "={{ $json.output.apply_url }}",
                        "company_url": "={{ $json.output.company_url }}",
                        "description": "={{ $json.output.description }}"
                    },
                    "schema": [
                        {
                            "id": "title",
                            "type": "string",
                            "display": true,
                            "removed": false,
                            "readOnly": false,
                            "required": false,
                            "displayName": "title",
                            "defaultMatch": false,
                            "canBeUsedToMatch": true
                        },
                        {
                            "id": "company",
                            "type": "string",
                            "display": true,
                            "removed": false,
                            "readOnly": false,
                            "required": false,
                            "displayName": "company",
                            "defaultMatch": false,
                            "canBeUsedToMatch": true
                        },
                        {
                            "id": "location",
                            "type": "string",
                            "display": true,
                            "removed": false,
                            "readOnly": false,
                            "required": false,
                            "displayName": "location",
                            "defaultMatch": false,
                            "canBeUsedToMatch": true
                        },
                        {
                            "id": "type",
                            "type": "string",
                            "display": true,
                            "removed": false,
                            "readOnly": false,
                            "required": false,
                            "displayName": "type",
                            "defaultMatch": false,
                            "canBeUsedToMatch": true
                        },
                        {
                            "id": "salary",
                            "type": "string",
                            "display": true,
                            "removed": false,
                            "readOnly": false,
                            "required": false,
                            "displayName": "salary",
                            "defaultMatch": false,
                            "canBeUsedToMatch": true
                        },
                        {
                            "id": "description",
                            "type": "string",
                            "display": true,
                            "removed": false,
                            "readOnly": false,
                            "required": false,
                            "displayName": "description",
                            "defaultMatch": false,
                            "canBeUsedToMatch": true
                        },
                        {
                            "id": "apply_url",
                            "type": "string",
                            "display": true,
                            "removed": false,
                            "readOnly": false,
                            "required": false,
                            "displayName": "apply_url",
                            "defaultMatch": false,
                            "canBeUsedToMatch": true
                        },
                        {
                            "id": "company_url",
                            "type": "string",
                            "display": true,
                            "removed": false,
                            "readOnly": false,
                            "required": false,
                            "displayName": "company_url",
                            "defaultMatch": false,
                            "canBeUsedToMatch": true
                        },
                        {
                            "id": "posted_date",
                            "type": "string",
                            "display": true,
                            "removed": true,
                            "readOnly": false,
                            "required": false,
                            "displayName": "posted_date",
                            "defaultMatch": false,
                            "canBeUsedToMatch": true
                        }
                    ],
                    "mappingMode": "defineBelow",
                    "matchingColumns": [],
                    "attemptToConvertTypes": false,
                    "convertFieldsToString": false
                },
                "options": {},
                "operation": "create"
            },
            "credentials": {
                "airtableTokenApi": {
                    "id": "IudXLNj7CDuc5M5a",
                    "name": "Airtable Personal Access Token account"
                }
            },
            "typeVersion": 2.1
        },
        {
            "id": "d71fa024-86a0-4f74-b033-1f755574080c",
            "name": "Sticky Note4",
            "type": "n8n-nodes-base.stickyNote",
            "position": [
                -520,
                -300
            ],
            "parameters": {
                "width": 380,
                "height": 500,
                "content": "## Hacker News - Who is Hiring Scrape\n\nIn this template we set up a scraper for the monthly HN Who is Hiring post. This way we can scrape the data and transform it into a common data structure.\n\nFirst we use the [Algolia Search](https://hn.algolia.com/) provided by Hacker News to narrow down the results.\n\nWe can use the official [Hacker News API](https://github.com/HackerNews/API) to get the post data and also all the replies!\n\nThis will obviously work for any kind of post on Hacker News! Get creative 😃\n\nAll you need is an OpenAI account to structure the text data and an Airtable account (or similar) to write the results to a list.\n\nCopy my base https://airtable.com/appM2JWvA5AstsGdn/shrAuo78cJt5C2laR"
            },
            "typeVersion": 1
        },
        {
            "id": "7466fb0c-9f0c-4adf-a6de-b2cf09032719",
            "name": "HI API: Get the individual job post",
            "type": "n8n-nodes-base.httpRequest",
            "position": [
                1660,
                -220
            ],
            "parameters": {
                "url": "=https://hacker-news.firebaseio.com/v0/item/{{ $json.kids }}.json?print=pretty",
                "options": {}
            },
            "typeVersion": 4.2
        },
        {
            "id": "184abccf-5838-49bf-9922-e0300c6b145e",
            "name": "HN API: Get Main Post",
            "type": "n8n-nodes-base.httpRequest",
            "position": [
                1260,
                -220
            ],
            "parameters": {
                "url": "=https://hacker-news.firebaseio.com/v0/item/{{ $json.storyId }}.json?print=pretty",
                "options": {}
            },
            "typeVersion": 4.2
        }
    ],
    "active": false,
    "pinData": {},
    "settings": {
        "executionOrder": "v1"
    },
    "versionId": "387f7084-58fa-4643-9351-73c870d3f028",
    "connections": {
        "Split Out": {
            "main": [
                [
                    {
                        "node": "Get relevant data",
                        "type": "main",
                        "index": 0
                    }
                ]
            ]
        },
        "Clean text": {
            "main": [
                [
                    {
                        "node": "Limit for testing (optional)",
                        "type": "main",
                        "index": 0
                    }
                ]
            ]
        },
        "Extract text": {
            "main": [
                [
                    {
                        "node": "Clean text",
                        "type": "main",
                        "index": 0
                    }
                ]
            ]
        },
        "Get latest post": {
            "main": [
                [
                    {
                        "node": "HN API: Get Main Post",
                        "type": "main",
                        "index": 0
                    }
                ]
            ]
        },
        "Get relevant data": {
            "main": [
                [
                    {
                        "node": "Get latest post",
                        "type": "main",
                        "index": 0
                    }
                ]
            ]
        },
        "OpenAI Chat Model": {
            "ai_languageModel": [
                [
                    {
                        "node": "Trun into structured data",
                        "type": "ai_languageModel",
                        "index": 0
                    }
                ]
            ]
        },
        "HN API: Get Main Post": {
            "main": [
                [
                    {
                        "node": "Split out children (jobs)",
                        "type": "main",
                        "index": 0
                    }
                ]
            ]
        },
        "Structured Output Parser": {
            "ai_outputParser": [
                [
                    {
                        "node": "Trun into structured data",
                        "type": "ai_outputParser",
                        "index": 0
                    }
                ]
            ]
        },
        "Split out children (jobs)": {
            "main": [
                [
                    {
                        "node": "HI API: Get the individual job post",
                        "type": "main",
                        "index": 0
                    }
                ]
            ]
        },
        "Trun into structured data": {
            "main": [
                [
                    {
                        "node": "Write results to airtable",
                        "type": "main",
                        "index": 0
                    }
                ]
            ]
        },
        "Limit for testing (optional)": {
            "main": [
                [
                    {
                        "node": "Trun into structured data",
                        "type": "main",
                        "index": 0
                    }
                ]
            ]
        },
        "Search for Who is hiring posts": {
            "main": [
                [
                    {
                        "node": "Split Out",
                        "type": "main",
                        "index": 0
                    }
                ]
            ]
        },
        "When clicking ‘Test workflow’": {
            "main": [
                [
                    {
                        "node": "Search for Who is hiring posts",
                        "type": "main",
                        "index": 0
                    }
                ]
            ]
        },
        "HI API: Get the individual job post": {
            "main": [
                [
                    {
                        "node": "Extract text",
                        "type": "main",
                        "index": 0
                    }
                ]
            ]
        }
    }
}