Вычислить количество товаров в каждой подпапке
Имеется CSV со строками URL вида:
/cat1-name/sub-cat-1/item-1.html
/cat1-name/sub-cat-1/item-2.html
/cat1-name/sub-cat-1/item-3.html
/cat1-name/sub-cat-1/item-4.html
/cat1-name/sub-cat-1/item-5.html
/cat1-name/sub-cat-1/item-6.html
/cat1-name/sub-cat-1/item-7.html
/cat1-name/sub-cat-1/item-8.html
/cat1-name/sub-cat-1/item-9.html
/cat1-name/sub-cat-1/item-10.html
/cat1-name/sub-cat-1/item-11.html
/cat1-name/sub-cat-1/item-12.html
/cat1-name/sub-cat-1/item-13.html
/cat1-name/sub-cat-1/item-14.html
/cat1-name/sub-cat-1/item-15.html
/cat1-name/sub-cat-1/item-16.html
/cat1-name/sub-cat-1/item-17.html
/cat1-name/sub-cat-1/item-18.html
/cat1-name/sub-cat-1/item-19.html
/cat1-name/sub-cat-1/item-20.html
/cat1-name/sub-cat-1/item-21.html
/cat1-name/sub-cat-1/item-22.html
/cat1-name/sub-cat-1/item-23.html
/cat1-name/sub-cat-1/item-24.html
/cat1-name/sub-cat-1/item-25.html
/cat1-name/sub-cat-2/item-1.html
/cat1-name/sub-cat-2/item-2.html
/cat1-name/sub-cat-2/item-3.html
/cat1-name/sub-cat-2/item-4.html
/cat1-name/sub-cat-2/item-5.html
/cat1-name/sub-cat-2/item-6.html
/cat1-name/sub-cat-2/item-7.html
/cat1-name/sub-cat-2/item-8.html
/cat1-name/sub-cat-2/item-9.html
/cat1-name/sub-cat-2/item-10.html
/cat1-name/sub-cat-2/item-11.html
/cat1-name/sub-cat-2/item-12.html
/cat1-name/sub-cat-3/item-1.html
/cat1-name/sub-cat-3/item-2.html
/cat1-name/sub-cat-3/item-3.html
/cat1-name/sub-cat-3/item-4.html
/cat1-name/sub-cat-3/item-5.html
/cat1-name/sub-cat-3/item-6.html
/cat1-name/sub-cat-3/item-7.html
/cat1-name/sub-cat-3/item-8.html
/cat1-name/sub-cat-3/item-9.html
/cat1-name/sub-cat-3/item-10.html
/cat1-name/sub-cat-3/item-11.html
/cat1-name/sub-cat-3/item-12.html
/cat1-name/sub-cat-3/item-13.html
/cat1-name/sub-cat-3/item-14.html
/cat1-name/sub-cat-3/item-15.html
/cat1-name/sub-cat-3/item-16.html
/cat1-name/sub-cat-3/item-17.html
/cat1-name/sub-cat-3/item-18.html
/cat1-name/sub-cat-3/item-19.html
/cat1-name/sub-cat-3/item-20.html
/cat1-name/sub-cat-3/item-21.html
/cat1-name/sub-cat-3/item-22.html
/cat1-name/sub-cat-4/item-1.html
/cat1-name/sub-cat-4/item-2.html
/cat1-name/sub-cat-4/item-3.html
/cat1-name/sub-cat-4/item-4.html
/cat1-name/sub-cat-4/item-5.html
/cat1-name/sub-cat-4/item-6.html
/cat1-name/sub-cat-4/item-7.html
/cat1-name/sub-cat-4/item-8.html
/cat1-name/sub-cat-4/item-9.html
/cat1-name/sub-cat-4/item-10.html
/cat1-name/sub-cat-4/item-11.html
/cat1-name/sub-cat-4/item-12.html
/cat1-name/sub-cat-4/item-13.html
/cat1-name/sub-cat-4/item-14.html
/cat1-name/sub-cat-4/item-15.html
/cat1-name/sub-cat-4/item-16.html
/cat1-name/image-1.html
/cat1-name/image-2.html
/cat2-name/item-1.html
/cat2-name/item-2.html
/cat2-name/item-3.html
/cat2-name/item-4.html
/cat2-name/item-5.html
/cat2-name/item-6.html
/cat2-name/item-7.html
/cat2-name/item-8.html
/cat2-name/item-9.html
/cat2-name/item-10.html
/cat2-name/item-11.html
/cat2-name/item-12.html
/cat2-name/item-13.html
/cat2-name/item-14.html
/cat2-name/item-15.html
/cat2-name/item-16.html
/cat2-name/image-1.html
/cat2-name/image-2.html
и т.д. Строки отсортированы по именам категории, подкатегории и объекта.
Мне нужно подсчитать:
общее количество item в категории; количество item в каждой подкатегории; количество image в каждой категории.
Задача тривиальная, но справиться пока не выходит.
Попытался начать с простого: для каждой категории создал объект класса, записал полный список URL, относящихся к категории, подсчитал общее количество item для каждой категории.
Далее попытался для каждой подкатегории подсчитать:
# Asker's attempt (reported below as not working): for every category object,
# walk its URLs and try to tally items per subcategory and images per category.
# NOTE(review): the indentation of this fragment was lost in the paste, so the
# intended nesting of the statements below cannot be recovered with certainty.
for cat in categoryList:
persubcat = 0
# NOTE(review): newSerList is not defined anywhere in this fragment — confirm it exists.
newSerList.clear()
# exstr holds the reference URL whose subcategory segment later URLs are compared to.
exstr = ""
images = 0
for i in range(cat.totalItems):
if(exstr == ""):
# First URL of the category: remember it as the reference and count it.
exstr = cat.url[i]
persubcat += 1
else:
# For "/cat/sub/name" element [2] of split("/") is the subcategory; for a
# two-level URL such as "/cat/image-1.html" it is the file name itself.
if(cat.url[i].split("/")[2] == exstr.split("/")[2]):
persubcat += 1
# NOTE(review): element [2] no longer contains "/", so the second split returns it
# unchanged; it equals "image" only for a segment that is literally "image", never
# for names like "image-1.html" from the sample data.
elif(cat.url[i].split("/")[2].split("/")[0] == "image"):
images += 1
# NOTE(review): same issue as above — "item-1.html" never equals "item".
elif(cat.url[i].split("/")[2].split("/")[0] == "item"):
persubcat += 1
# This condition is the exact negation of the first `if`, so it acts as a plain `else`.
# NOTE(review): exstr is never updated after the first URL in the visible code, so a
# change of subcategory is always measured against the first URL of the category.
elif(cat.url[i].split("/")[2] != exstr.split("/")[2]):
persubcat += 1
# NOTE(review): "subcatLsit" is spelled differently from "subcatList" used below —
# presumably a typo that raises NameError; confirm.
subcatLsit.append(persubcat)
# NOTE(review): cat.subcats is bound to the same list object that is cleared on the
# next line, so cat.subcats ends up referring to an empty list.
cat.subcats = subcatList
subcatList.clear()
# NOTE(review): `images` is not stored or used anywhere in the visible code.
persubcat = 0
Выглядит ужасно, работает тоже (не работает). Прошу неравнодушных помочь. Спасибо
Ответы (2 шт):
Нужно что-то в таком духе? Только image я отдельно не обрабатывал.
# Sample input: URL paths of the form /category/subcategory/name or /category/name.
paths = [
    '/category-name/sub-cat-name/item-name',
    '/category-name/sub-cat-name/item-name-1',
    '/category-name/sub-cat-name-1/item-name',
    '/category-name/sub-cat-name-1/item-name-1',
    '/category-name/image',
    '/category-name/image-1',
    '/category-name-1/sub-cat-name/item-name',
    '/category-name-1/sub-cat-name-1/item-name-1',
    '/category-name-2/item',
    '/category-name-2/item-1',
]

# Pseudo-subcategory that collects entries lying directly inside a category.
ROOT_KEY = '--root--'

# Collect the paths into a tree: items[category][subcategory][name] -> occurrences.
items = dict()
for path in paths:
    # Drop the leading '/' so the first component is the category name.
    components = path[1:].split('/')
    if len(components) > 2:
        category, subcategory, name = components[0], components[1], components[2]
    else:
        # Two-level path: the entry belongs to the category itself.
        category, subcategory, name = components[0], ROOT_KEY, components[1]
    # setdefault creates each level only once. The previous membership test looked
    # for the key '' instead of '--root--', so the root dict was recreated for every
    # root-level path and only the last such entry per category was kept.
    names = items.setdefault(category, {}).setdefault(subcategory, {})
    names[name] = names.get(name, 0) + 1

# Count entries per category (all subcategories plus root-level entries).
for category in items:
    count = sum(sum(names.values()) for names in items[category].values())
    print(category, count)
print("---")

# Count entries per subcategory.
# NOTE: the result is keyed by subcategory name only, so subcategories that share
# a name in different categories (and all '--root--' groups) are added together.
counts = dict()
for subcategories in items.values():
    for subcategory, names in subcategories.items():
        counts[subcategory] = counts.get(subcategory, 0) + sum(names.values())
for subcategory in counts:
    print(subcategory, counts[subcategory])
Наверное, тут удобнее использовать Pandas:
import pandas as pd
# Sample input: the URL list from the question, one path per line, split on
# '\n' into a list of strings.
s = '''/cat1-name/sub-cat-1/item-1.html
/cat1-name/sub-cat-1/item-2.html
/cat1-name/sub-cat-1/item-3.html
/cat1-name/sub-cat-1/item-4.html
/cat1-name/sub-cat-1/item-5.html
/cat1-name/sub-cat-1/item-6.html
/cat1-name/sub-cat-1/item-7.html
/cat1-name/sub-cat-1/item-8.html
/cat1-name/sub-cat-1/item-9.html
/cat1-name/sub-cat-1/item-10.html
/cat1-name/sub-cat-1/item-11.html
/cat1-name/sub-cat-1/item-12.html
/cat1-name/sub-cat-1/item-13.html
/cat1-name/sub-cat-1/item-14.html
/cat1-name/sub-cat-1/item-15.html
/cat1-name/sub-cat-1/item-16.html
/cat1-name/sub-cat-1/item-17.html
/cat1-name/sub-cat-1/item-18.html
/cat1-name/sub-cat-1/item-19.html
/cat1-name/sub-cat-1/item-20.html
/cat1-name/sub-cat-1/item-21.html
/cat1-name/sub-cat-1/item-22.html
/cat1-name/sub-cat-1/item-23.html
/cat1-name/sub-cat-1/item-24.html
/cat1-name/sub-cat-1/item-25.html
/cat1-name/sub-cat-2/item-1.html
/cat1-name/sub-cat-2/item-2.html
/cat1-name/sub-cat-2/item-3.html
/cat1-name/sub-cat-2/item-4.html
/cat1-name/sub-cat-2/item-5.html
/cat1-name/sub-cat-2/item-6.html
/cat1-name/sub-cat-2/item-7.html
/cat1-name/sub-cat-2/item-8.html
/cat1-name/sub-cat-2/item-9.html
/cat1-name/sub-cat-2/item-10.html
/cat1-name/sub-cat-2/item-11.html
/cat1-name/sub-cat-2/item-12.html
/cat1-name/sub-cat-3/item-1.html
/cat1-name/sub-cat-3/item-2.html
/cat1-name/sub-cat-3/item-3.html
/cat1-name/sub-cat-3/item-4.html
/cat1-name/sub-cat-3/item-5.html
/cat1-name/sub-cat-3/item-6.html
/cat1-name/sub-cat-3/item-7.html
/cat1-name/sub-cat-3/item-8.html
/cat1-name/sub-cat-3/item-9.html
/cat1-name/sub-cat-3/item-10.html
/cat1-name/sub-cat-3/item-11.html
/cat1-name/sub-cat-3/item-12.html
/cat1-name/sub-cat-3/item-13.html
/cat1-name/sub-cat-3/item-14.html
/cat1-name/sub-cat-3/item-15.html
/cat1-name/sub-cat-3/item-16.html
/cat1-name/sub-cat-3/item-17.html
/cat1-name/sub-cat-3/item-18.html
/cat1-name/sub-cat-3/item-19.html
/cat1-name/sub-cat-3/item-20.html
/cat1-name/sub-cat-3/item-21.html
/cat1-name/sub-cat-3/item-22.html
/cat1-name/sub-cat-4/item-1.html
/cat1-name/sub-cat-4/item-2.html
/cat1-name/sub-cat-4/item-3.html
/cat1-name/sub-cat-4/item-4.html
/cat1-name/sub-cat-4/item-5.html
/cat1-name/sub-cat-4/item-6.html
/cat1-name/sub-cat-4/item-7.html
/cat1-name/sub-cat-4/item-8.html
/cat1-name/sub-cat-4/item-9.html
/cat1-name/sub-cat-4/item-10.html
/cat1-name/sub-cat-4/item-11.html
/cat1-name/sub-cat-4/item-12.html
/cat1-name/sub-cat-4/item-13.html
/cat1-name/sub-cat-4/item-14.html
/cat1-name/sub-cat-4/item-15.html
/cat1-name/sub-cat-4/item-16.html
/cat1-name/image-1.html
/cat1-name/image-2.html
/cat2-name/item-1.html
/cat2-name/item-2.html
/cat2-name/item-3.html
/cat2-name/item-4.html
/cat2-name/item-5.html
/cat2-name/item-6.html
/cat2-name/item-7.html
/cat2-name/item-8.html
/cat2-name/item-9.html
/cat2-name/item-10.html
/cat2-name/item-11.html
/cat2-name/item-12.html
/cat2-name/item-13.html
/cat2-name/item-14.html
/cat2-name/item-15.html
/cat2-name/item-16.html
/cat2-name/image-1.html
/cat2-name/image-2.html'''.split('\n')
def _split_url(url):
    """Return (category, subcategory, file name) for one '/'-separated URL path.

    The subcategory is the directory directly above the file name; a path with
    no directory between the category and the file yields ''.
    """
    # The element before the leading '/' is an empty string and is discarded.
    _, category, *tail = url.split('/')
    subcategory = tail[-2] if len(tail) > 1 else ''
    return category, subcategory, tail[-1]


# One (category, subcategory, item) record per input line.
items = [_split_url(url) for url in s]
df = pd.DataFrame(items, columns=['category', 'subcategory', 'item'])
# Preview of the first rows (shown when run as the last expression of a notebook cell).
df.head()
Получается такой датафрейм:
category subcategory item
0 cat1-name sub-cat-1 item-1.html
1 cat1-name sub-cat-1 item-2.html
2 cat1-name sub-cat-1 item-3.html
3 cat1-name sub-cat-1 item-4.html
4 cat1-name sub-cat-1 item-5.html
...
Дальше всё просто:
# Rows whose file name contains the substring 'item'; shared by the first two reports.
item_rows = df[df['item'].str.contains('item')]

# Total number of items per category.
items_per_category = item_rows.groupby('category').size()
print('общее количество item в категории:\n',
      items_per_category)
print('------')

# Number of items per (category, subcategory) pair.
items_per_subcategory = item_rows.groupby(['category', 'subcategory']).size()
print('количество item в каждой подкатегории:\n',
      items_per_subcategory)
print('------')

# Number of rows per category whose file name contains the substring 'image'.
image_rows = df[df['item'].str.contains('image')]
print('количество image в каждой категории:\n',
      image_rows.groupby('category').size())
Вывод:
общее количество item в категории:
category
cat1-name 75
cat2-name 16
dtype: int64
------
количество item в каждой подкатегории:
category subcategory
cat1-name sub-cat-1 25
sub-cat-2 12
sub-cat-3 22
sub-cat-4 16
cat2-name 16
dtype: int64
------
количество image в каждой категории:
category
cat1-name 2
cat2-name 2
dtype: int64
Вывод, если нужно, можно оформить по-другому; главное, что всё работает, а выбирать и агрегировать данные довольно просто.